From c4cbaca327174135e28353c3438241b08bf96755 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 26 Aug 2015 14:04:03 -0400
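Subject: [PATCH 01/82] nouveau: avoid build failures since 0fc21ecf

Rename the 'flags' parameter of nv30_context_create(), nv50_create() and
nvc0_create() to 'ctxflags'.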

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv30/nv30_context.c | 2 +-
 src/gallium/drivers/nouveau/nv50/nv50_context.c | 2 +-
 src/gallium/drivers/nouveau/nvc0/nvc0_context.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.c b/src/gallium/drivers/nouveau/nv30/nv30_context.c
index 46590eecdf3..a36fd57fae7 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.c
@@ -190,7 +190,7 @@ nv30_context_destroy(struct pipe_context *pipe)
    } while(0)
 
 struct pipe_context *
-nv30_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
+nv30_context_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
 {
    struct nv30_screen *screen = nv30_screen(pscreen);
    struct nv30_context *nv30 = CALLOC_STRUCT(nv30_context);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 11638dd7f14..4949459a803 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -240,7 +240,7 @@ nv50_context_get_sample_position(struct pipe_context *, unsigned, unsigned,
                                  float *);
 
 struct pipe_context *
-nv50_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
+nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
 {
    struct nv50_screen *screen = nv50_screen(pscreen);
    struct nv50_context *nv50;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index 613cad69aa5..f7604f11788 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -262,7 +262,7 @@ nvc0_context_get_sample_position(struct pipe_context *, unsigned, unsigned,
                                  float *);
 
 struct pipe_context *
-nvc0_create(struct pipe_screen *pscreen, void *priv, unsigned flags)
+nvc0_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
 {
    struct nvc0_screen *screen = nvc0_screen(pscreen);
    struct nvc0_context *nvc0;

From 8ae37365f30594498184fe5428f961a9c310fd8c Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Tue, 19 May 2015 10:35:39 -0700
Subject: [PATCH 02/82] mesa/formats: define the 2D ASTC formats

Define the mesa formats and make changes necessary for compilation
without errors. Also add support for _mesa_get_srgb_format_linear().

v2. conform the ASTC MESA_FORMAT enums to the existing naming convention.
v3. remove ASTC cases for _mesa_get_uncompressed_format(). This function is
    only used for generating mipmaps - something ASTC formats do not support
    due to lack of online compression.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/main/format_info.py |  3 +++
 src/mesa/main/formats.c      | 43 ++++++++++++++++++++++++++++++++++++
 src/mesa/main/formats.csv    | 31 ++++++++++++++++++++++++++
 src/mesa/main/formats.h      | 31 ++++++++++++++++++++++++++
 src/mesa/swrast/s_texfetch.c | 32 ++++++++++++++++++++++++++-
 5 files changed, 139 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/format_info.py b/src/mesa/main/format_info.py
index 839d4073c61..22eb5a734a6 100644
--- a/src/mesa/main/format_info.py
+++ b/src/mesa/main/format_info.py
@@ -122,6 +122,9 @@ def get_channel_bits(fmat, chan_name):
       elif fmat.layout == 'bptc':
          bits = 16 if fmat.name.endswith('_FLOAT') else 8
          return bits if fmat.has_channel(chan_name) else 0
+      elif fmat.layout == 'astc':
+         bits = 16 if 'RGBA' in fmat.name else 8
+         return bits if fmat.has_channel(chan_name) else 0
       else:
          assert False
    else:
diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c
index 34a4434c3ba..587221ca5a0 100644
--- a/src/mesa/main/formats.c
+++ b/src/mesa/main/formats.c
@@ -197,6 +197,7 @@ _mesa_get_format_max_bits(mesa_format format)
  *    MESA_FORMAT_LAYOUT_ETC1
  *    MESA_FORMAT_LAYOUT_ETC2
  *    MESA_FORMAT_LAYOUT_BPTC
+ *    MESA_FORMAT_LAYOUT_ASTC
  *    MESA_FORMAT_LAYOUT_OTHER
  */
 extern enum mesa_format_layout
@@ -663,6 +664,48 @@ _mesa_get_srgb_format_linear(mesa_format format)
    case MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM:
       format = MESA_FORMAT_BPTC_RGBA_UNORM;
       break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4:
+      format = MESA_FORMAT_RGBA_ASTC_4x4;
+      break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4:
+      format = MESA_FORMAT_RGBA_ASTC_5x4;
+      break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5:
+      format = MESA_FORMAT_RGBA_ASTC_5x5;
+      break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5:
+      format = MESA_FORMAT_RGBA_ASTC_6x5;
+      break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6:
+      format = MESA_FORMAT_RGBA_ASTC_6x6;
+      break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5:
+      format = MESA_FORMAT_RGBA_ASTC_8x5;
+      break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6:
+      format = MESA_FORMAT_RGBA_ASTC_8x6;
+      break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8:
+      format = MESA_FORMAT_RGBA_ASTC_8x8;
+      break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5:
+      format = MESA_FORMAT_RGBA_ASTC_10x5;
+      break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6:
+      format = MESA_FORMAT_RGBA_ASTC_10x6;
+      break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8:
+      format = MESA_FORMAT_RGBA_ASTC_10x8;
+      break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10:
+      format = MESA_FORMAT_RGBA_ASTC_10x10;
+      break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10:
+      format = MESA_FORMAT_RGBA_ASTC_12x10;
+      break;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12:
+      format = MESA_FORMAT_RGBA_ASTC_12x12;
+      break;
    case MESA_FORMAT_B8G8R8X8_SRGB:
       format = MESA_FORMAT_B8G8R8X8_UNORM;
       break;
diff --git a/src/mesa/main/formats.csv b/src/mesa/main/formats.csv
index e159e7dd6aa..80729d98787 100644
--- a/src/mesa/main/formats.csv
+++ b/src/mesa/main/formats.csv
@@ -301,3 +301,34 @@ MESA_FORMAT_BPTC_RGBA_UNORM               , bptc  , 4, 4, x128,     ,     ,
 MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM         , bptc  , 4, 4, x128,     ,     ,     , xyzw, srgb
 MESA_FORMAT_BPTC_RGB_SIGNED_FLOAT         , bptc  , 4, 4, x128,     ,     ,     , xyz1, rgb
 MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT       , bptc  , 4, 4, x128,     ,     ,     , xyz1, rgb
+
+# ASTC compressed formats
+MESA_FORMAT_RGBA_ASTC_4x4                 , astc  , 4, 4, x128,     ,     ,     , xyzw, rgb
+MESA_FORMAT_RGBA_ASTC_5x4                 , astc  , 5, 4, x128,     ,     ,     , xyzw, rgb
+MESA_FORMAT_RGBA_ASTC_5x5                 , astc  , 5, 5, x128,     ,     ,     , xyzw, rgb
+MESA_FORMAT_RGBA_ASTC_6x5                 , astc  , 6, 5, x128,     ,     ,     , xyzw, rgb
+MESA_FORMAT_RGBA_ASTC_6x6                 , astc  , 6, 6, x128,     ,     ,     , xyzw, rgb
+MESA_FORMAT_RGBA_ASTC_8x5                 , astc  , 8, 5, x128,     ,     ,     , xyzw, rgb
+MESA_FORMAT_RGBA_ASTC_8x6                 , astc  , 8, 6, x128,     ,     ,     , xyzw, rgb
+MESA_FORMAT_RGBA_ASTC_8x8                 , astc  , 8, 8, x128,     ,     ,     , xyzw, rgb
+MESA_FORMAT_RGBA_ASTC_10x5                , astc  ,10, 5, x128,     ,     ,     , xyzw, rgb
+MESA_FORMAT_RGBA_ASTC_10x6                , astc  ,10, 6, x128,     ,     ,     , xyzw, rgb
+MESA_FORMAT_RGBA_ASTC_10x8                , astc  ,10, 8, x128,     ,     ,     , xyzw, rgb
+MESA_FORMAT_RGBA_ASTC_10x10               , astc  ,10,10, x128,     ,     ,     , xyzw, rgb
+MESA_FORMAT_RGBA_ASTC_12x10               , astc  ,12,10, x128,     ,     ,     , xyzw, rgb
+MESA_FORMAT_RGBA_ASTC_12x12               , astc  ,12,12, x128,     ,     ,     , xyzw, rgb
+
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4         , astc  , 4, 4, x128,     ,     ,     , xyzw, srgb
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4         , astc  , 5, 4, x128,     ,     ,     , xyzw, srgb
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5         , astc  , 5, 5, x128,     ,     ,     , xyzw, srgb
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5         , astc  , 6, 5, x128,     ,     ,     , xyzw, srgb
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6         , astc  , 6, 6, x128,     ,     ,     , xyzw, srgb
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5         , astc  , 8, 5, x128,     ,     ,     , xyzw, srgb
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6         , astc  , 8, 6, x128,     ,     ,     , xyzw, srgb
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8         , astc  , 8, 8, x128,     ,     ,     , xyzw, srgb
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5        , astc  ,10, 5, x128,     ,     ,     , xyzw, srgb
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6        , astc  ,10, 6, x128,     ,     ,     , xyzw, srgb
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8        , astc  ,10, 8, x128,     ,     ,     , xyzw, srgb
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10       , astc  ,10,10, x128,     ,     ,     , xyzw, srgb
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10       , astc  ,12,10, x128,     ,     ,     , xyzw, srgb
+MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12       , astc  ,12,12, x128,     ,     ,     , xyzw, srgb
diff --git a/src/mesa/main/formats.h b/src/mesa/main/formats.h
index 4936fa0d482..ccb09b263ff 100644
--- a/src/mesa/main/formats.h
+++ b/src/mesa/main/formats.h
@@ -70,6 +70,7 @@ enum mesa_format_layout {
    MESA_FORMAT_LAYOUT_ETC1,
    MESA_FORMAT_LAYOUT_ETC2,
    MESA_FORMAT_LAYOUT_BPTC,
+   MESA_FORMAT_LAYOUT_ASTC,
    MESA_FORMAT_LAYOUT_OTHER,
 };
 
@@ -586,6 +587,36 @@ typedef enum
    MESA_FORMAT_BPTC_RGB_SIGNED_FLOAT,
    MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT,
 
+   /* ASTC compressed formats */
+   MESA_FORMAT_RGBA_ASTC_4x4,
+   MESA_FORMAT_RGBA_ASTC_5x4,
+   MESA_FORMAT_RGBA_ASTC_5x5,
+   MESA_FORMAT_RGBA_ASTC_6x5,
+   MESA_FORMAT_RGBA_ASTC_6x6,
+   MESA_FORMAT_RGBA_ASTC_8x5,
+   MESA_FORMAT_RGBA_ASTC_8x6,
+   MESA_FORMAT_RGBA_ASTC_8x8,
+   MESA_FORMAT_RGBA_ASTC_10x5,
+   MESA_FORMAT_RGBA_ASTC_10x6,
+   MESA_FORMAT_RGBA_ASTC_10x8,
+   MESA_FORMAT_RGBA_ASTC_10x10,
+   MESA_FORMAT_RGBA_ASTC_12x10,
+   MESA_FORMAT_RGBA_ASTC_12x12,
+
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4,
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4,
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5,
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5,
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6,
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5,
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6,
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8,
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5,
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6,
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8,
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10,
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10,
+   MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12,
    MESA_FORMAT_COUNT
 } mesa_format;
 
diff --git a/src/mesa/swrast/s_texfetch.c b/src/mesa/swrast/s_texfetch.c
index 1fe21c0b469..754d982bcd5 100644
--- a/src/mesa/swrast/s_texfetch.c
+++ b/src/mesa/swrast/s_texfetch.c
@@ -551,7 +551,37 @@ texfetch_funcs[] =
       fetch_compressed,
       fetch_compressed,
       fetch_compressed
-   }
+   },
+
+   /* ASTC compressed formats */
+   FETCH_NULL(RGBA_ASTC_4x4),
+   FETCH_NULL(RGBA_ASTC_5x4),
+   FETCH_NULL(RGBA_ASTC_5x5),
+   FETCH_NULL(RGBA_ASTC_6x5),
+   FETCH_NULL(RGBA_ASTC_6x6),
+   FETCH_NULL(RGBA_ASTC_8x5),
+   FETCH_NULL(RGBA_ASTC_8x6),
+   FETCH_NULL(RGBA_ASTC_8x8),
+   FETCH_NULL(RGBA_ASTC_10x5),
+   FETCH_NULL(RGBA_ASTC_10x6),
+   FETCH_NULL(RGBA_ASTC_10x8),
+   FETCH_NULL(RGBA_ASTC_10x10),
+   FETCH_NULL(RGBA_ASTC_12x10),
+   FETCH_NULL(RGBA_ASTC_12x12),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_4x4),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_5x4),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_5x5),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_6x5),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_6x6),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_8x5),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_8x6),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_8x8),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_10x5),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_10x6),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_10x8),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_10x10),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_12x10),
+   FETCH_NULL(SRGB8_ALPHA8_ASTC_12x12)
 };
 
 

From e9fd8e154fdb0394cbaed5e14ac52e689a020ebe Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Tue, 28 Apr 2015 14:41:49 -0700
Subject: [PATCH 03/82] glapi: add support for KHR_texture_compression_astc_ldr

v2: correct the spelling of the sRGB variants.
    remove spaces around "=" when setting the enum value.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 .../gen/KHR_texture_compression_astc.xml      | 40 +++++++++++++++++++
 src/mapi/glapi/gen/Makefile.am                |  1 +
 src/mapi/glapi/gen/gl_API.xml                 |  2 +-
 3 files changed, 42 insertions(+), 1 deletion(-)
 create mode 100644 src/mapi/glapi/gen/KHR_texture_compression_astc.xml

diff --git a/src/mapi/glapi/gen/KHR_texture_compression_astc.xml b/src/mapi/glapi/gen/KHR_texture_compression_astc.xml
new file mode 100644
index 00000000000..7b5864d4e13
--- /dev/null
+++ b/src/mapi/glapi/gen/KHR_texture_compression_astc.xml
@@ -0,0 +1,40 @@
+<?xml version="1.0"?>
+<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
+
+<OpenGLAPI>
+
+<category name="GL_KHR_texture_compression_astc_ldr" number="118">
+
+    <enum name="COMPRESSED_RGBA_ASTC_4x4_KHR"   value="0x93B0"/>
+    <enum name="COMPRESSED_RGBA_ASTC_5x4_KHR"   value="0x93B1"/>
+    <enum name="COMPRESSED_RGBA_ASTC_5x5_KHR"   value="0x93B2"/>
+    <enum name="COMPRESSED_RGBA_ASTC_6x5_KHR"   value="0x93B3"/>
+    <enum name="COMPRESSED_RGBA_ASTC_6x6_KHR"   value="0x93B4"/>
+    <enum name="COMPRESSED_RGBA_ASTC_8x5_KHR"   value="0x93B5"/>
+    <enum name="COMPRESSED_RGBA_ASTC_8x6_KHR"   value="0x93B6"/>
+    <enum name="COMPRESSED_RGBA_ASTC_8x8_KHR"   value="0x93B7"/>
+    <enum name="COMPRESSED_RGBA_ASTC_10x5_KHR"  value="0x93B8"/>
+    <enum name="COMPRESSED_RGBA_ASTC_10x6_KHR"  value="0x93B9"/>
+    <enum name="COMPRESSED_RGBA_ASTC_10x8_KHR"  value="0x93BA"/>
+    <enum name="COMPRESSED_RGBA_ASTC_10x10_KHR" value="0x93BB"/>
+    <enum name="COMPRESSED_RGBA_ASTC_12x10_KHR" value="0x93BC"/>
+    <enum name="COMPRESSED_RGBA_ASTC_12x12_KHR" value="0x93BD"/>
+
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR"   value="0x93D0"/>
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR"   value="0x93D1"/>
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR"   value="0x93D2"/>
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR"   value="0x93D3"/>
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR"   value="0x93D4"/>
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR"   value="0x93D5"/>
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR"   value="0x93D6"/>
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR"   value="0x93D7"/>
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR"  value="0x93D8"/>
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR"  value="0x93D9"/>
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR"  value="0x93DA"/>
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR" value="0x93DB"/>
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR" value="0x93DC"/>
+    <enum name="COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR" value="0x93DD"/>
+
+</category>
+
+</OpenGLAPI>
diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 86a92437f16..9224de2b9aa 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -190,6 +190,7 @@ API_XML = \
 	INTEL_performance_query.xml \
 	KHR_debug.xml \
 	KHR_context_flush_control.xml \
+	KHR_texture_compression_astc.xml \
 	NV_conditional_render.xml \
 	NV_primitive_restart.xml \
 	NV_texture_barrier.xml \
diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml
index 658efa485f6..f0dcdca2aee 100644
--- a/src/mapi/glapi/gen/gl_API.xml
+++ b/src/mapi/glapi/gen/gl_API.xml
@@ -8168,7 +8168,7 @@
 
 <xi:include href="ARB_texture_storage.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
-<!-- ARB extension #118 -->
+<xi:include href="KHR_texture_compression_astc.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
 <xi:include href="KHR_debug.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 

From 582ce1ea976a16aa8f32ff72cb2fecb00186e253 Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Mon, 18 May 2015 16:30:30 -0700
Subject: [PATCH 04/82] mesa: don't enable online compression for ASTC formats

In agreement with the ASTC spec, this makes TexImage*D calls that pass an
ASTC internal format fail. As the spec implies, Generate[Texture]Mipmap and
[Copy]Tex[Sub]Image*D calls on ASTC textures must fail as well.

v2. actually force attempts to compress online to fail.
v3. indentation (Matt).
v4. update copytexture_error_check to account for CopyTexImage*D (Chad).

Reviewed-by: Chad Versace <chad.versace@intel.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/main/genmipmap.c   |  1 +
 src/mesa/main/glformats.c   | 41 +++++++++++++++++++++++++++++++++++++
 src/mesa/main/glformats.h   |  3 +++
 src/mesa/main/texcompress.c | 22 ++++++++++++++++++++
 src/mesa/main/teximage.c    | 17 +++++++++++----
 5 files changed, 80 insertions(+), 4 deletions(-)

diff --git a/src/mesa/main/genmipmap.c b/src/mesa/main/genmipmap.c
index c18f9d5223f..4ec8385ec2f 100644
--- a/src/mesa/main/genmipmap.c
+++ b/src/mesa/main/genmipmap.c
@@ -111,6 +111,7 @@ _mesa_generate_texture_mipmap(struct gl_context *ctx,
 
    if (_mesa_is_enum_format_integer(srcImage->InternalFormat) ||
        _mesa_is_depthstencil_format(srcImage->InternalFormat) ||
+       _mesa_is_astc_format(srcImage->InternalFormat) ||
        _mesa_is_stencil_format(srcImage->InternalFormat)) {
       _mesa_unlock_texture(ctx, texObj);
       _mesa_error(ctx, GL_INVALID_OPERATION,
diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index 3eb66dab7f8..fd8336cf8e2 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -820,6 +820,47 @@ _mesa_is_enum_format_signed_int(GLenum format)
    }
 }
 
+/**
+ * Test if the given format is an ASTC format.
+ */
+GLboolean
+_mesa_is_astc_format(GLenum internalFormat)
+{
+   switch (internalFormat) {
+   case GL_COMPRESSED_RGBA_ASTC_4x4_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_5x4_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_5x5_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_6x5_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_6x6_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_8x5_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_8x6_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_8x8_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_10x5_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_10x6_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_10x8_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_10x10_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_12x10_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_12x12_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR:
+      return true;
+   default:
+      return false;
+   }
+}
+
 
 /**
  * Test if the given format is an integer (non-normalized) format.
diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h
index 419955a6033..aec905d2342 100644
--- a/src/mesa/main/glformats.h
+++ b/src/mesa/main/glformats.h
@@ -56,6 +56,9 @@ _mesa_bytes_per_pixel( GLenum format, GLenum type );
 extern GLint
 _mesa_bytes_per_vertex_attrib(GLint comps, GLenum type);
 
+extern GLboolean
+_mesa_is_astc_format(GLenum internalFormat);
+
 extern GLboolean
 _mesa_is_type_unsigned(GLenum type);
 
diff --git a/src/mesa/main/texcompress.c b/src/mesa/main/texcompress.c
index edfb03625c2..c028daa46e4 100644
--- a/src/mesa/main/texcompress.c
+++ b/src/mesa/main/texcompress.c
@@ -229,6 +229,28 @@ _mesa_gl_compressed_format_base_format(GLenum format)
  *        what GL_NUM_COMPRESSED_TEXTURE_FORMATS and
  *        GL_COMPRESSED_TEXTURE_FORMATS return."
  *
+ * The KHR_texture_compression_astc_hdr spec says:
+ *
+ *    "Interactions with OpenGL 4.2
+ *
+ *        OpenGL 4.2 supports the feature that compressed textures can be
+ *        compressed online, by passing the compressed texture format enum as
+ *        the internal format when uploading a texture using TexImage1D,
+ *        TexImage2D or TexImage3D (see Section 3.9.3, Texture Image
+ *        Specification, subsection Encoding of Special Internal Formats).
+ *
+ *        Due to the complexity of the ASTC compression algorithm, it is not
+ *        usually suitable for online use, and therefore ASTC support will be
+ *        limited to pre-compressed textures only. Where on-device compression
+ *        is required, a domain-specific limited compressor will typically
+ *        be used, and this is therefore not suitable for implementation in
+ *        the driver.
+ *
+ *        In particular, the ASTC format specifiers will not be added to
+ *        Table 3.14, and thus will not be accepted by the TexImage*D
+ *        functions, and will not be returned by the (already deprecated)
+ *        COMPRESSED_TEXTURE_FORMATS query."
+ *
  * There is no formal spec for GL_ATI_texture_compression_3dc.  Since the
  * formats added by this extension are luminance-alpha formats, it is
  * reasonable to expect them to follow the same rules as
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 274ecad44e9..0a641cf2cad 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -1784,6 +1784,15 @@ compressedteximage_only_format(const struct gl_context *ctx, GLenum format)
    }
 }
 
+/**
+ * Return true if the format doesn't support online compression.
+ */
+static bool
+_mesa_format_no_online_compression(const struct gl_context *ctx, GLenum format)
+{
+   return _mesa_is_astc_format(format) ||
+          compressedteximage_only_format(ctx, format);
+}
 
 /* Writes to an GL error pointer if non-null and returns whether or not the
  * error is GL_NO_ERROR */
@@ -2328,7 +2337,7 @@ texture_error_check( struct gl_context *ctx,
                      "glTexImage%dD(target can't be compressed)", dimensions);
          return GL_TRUE;
       }
-      if (compressedteximage_only_format(ctx, internalFormat)) {
+      if (_mesa_format_no_online_compression(ctx, internalFormat)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glTexImage%dD(no compression for format)", dimensions);
          return GL_TRUE;
@@ -2592,7 +2601,7 @@ texsubimage_error_check(struct gl_context *ctx, GLuint dimensions,
    }
 
    if (_mesa_is_format_compressed(texImage->TexFormat)) {
-      if (compressedteximage_only_format(ctx, texImage->InternalFormat)) {
+      if (_mesa_format_no_online_compression(ctx, texImage->InternalFormat)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                "%s(no compression for format)", callerName);
          return GL_TRUE;
@@ -2850,7 +2859,7 @@ copytexture_error_check( struct gl_context *ctx, GLuint dimensions,
                      "glCopyTexImage%dD(target can't be compressed)", dimensions);
          return GL_TRUE;
       }
-      if (compressedteximage_only_format(ctx, internalFormat)) {
+      if (_mesa_format_no_online_compression(ctx, internalFormat)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                "glCopyTexImage%dD(no compression for format)", dimensions);
          return GL_TRUE;
@@ -2931,7 +2940,7 @@ copytexsubimage_error_check(struct gl_context *ctx, GLuint dimensions,
    }
 
    if (_mesa_is_format_compressed(texImage->TexFormat)) {
-      if (compressedteximage_only_format(ctx, texImage->InternalFormat)) {
+      if (_mesa_format_no_online_compression(ctx, texImage->InternalFormat)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                "%s(no compression for format)", caller);
          return GL_TRUE;

From 4143511b15e8f1d63176257ae8a83b4906e3204c Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Tue, 19 May 2015 15:41:28 -0700
Subject: [PATCH 05/82] mesa: add ASTC extensions to the extensions table

v2: alphabetize the extensions.
    remove OES ASTC extension.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/main/extensions.c | 2 ++
 src/mesa/main/mtypes.h     | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index 4a3c231e36f..03303ac2650 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -342,6 +342,8 @@ static const struct extension extension_table[] = {
    /* KHR extensions */
    { "GL_KHR_debug",                               o(dummy_true),                              GL,             2012 },
    { "GL_KHR_context_flush_control",               o(dummy_true),                              GL       | ES2, 2014 },
+   { "GL_KHR_texture_compression_astc_hdr",        o(KHR_texture_compression_astc_hdr),        GL       | ES2, 2012 },
+   { "GL_KHR_texture_compression_astc_ldr",        o(KHR_texture_compression_astc_ldr),        GL       | ES2, 2012 },
 
    /* Vendor extensions */
    { "GL_3DFX_texture_compression_FXT1",           o(TDFX_texture_compression_FXT1),           GL,             1999 },
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 5031b0840cb..a172952c1fb 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3853,6 +3853,8 @@ struct gl_extensions
    GLboolean ATI_fragment_shader;
    GLboolean ATI_separate_stencil;
    GLboolean INTEL_performance_query;
+   GLboolean KHR_texture_compression_astc_hdr;
+   GLboolean KHR_texture_compression_astc_ldr;
    GLboolean MESA_pack_invert;
    GLboolean MESA_ycbcr_texture;
    GLboolean NV_conditional_render;

From 692578ed134bacff1a315489ad47514450b3387f Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Tue, 19 May 2015 15:41:56 -0700
Subject: [PATCH 06/82] mesa/glformats: recognize ASTC formats as compressed

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/main/glformats.c | 29 +++++++++++++++++++++++++++++
 1 file changed, 29 insertions(+)

diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index fd8336cf8e2..ce66699db8f 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -1303,6 +1303,35 @@ _mesa_is_compressed_format(const struct gl_context *ctx, GLenum format)
    case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
       return _mesa_is_desktop_gl(ctx) &&
          ctx->Extensions.ARB_texture_compression_bptc;
+   case GL_COMPRESSED_RGBA_ASTC_4x4_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_5x4_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_5x5_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_6x5_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_6x6_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_8x5_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_8x6_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_8x8_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_10x5_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_10x6_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_10x8_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_10x10_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_12x10_KHR:
+   case GL_COMPRESSED_RGBA_ASTC_12x12_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR:
+      return ctx->Extensions.KHR_texture_compression_astc_ldr;
    case GL_PALETTE4_RGB8_OES:
    case GL_PALETTE4_RGBA8_OES:
    case GL_PALETTE4_R5_G6_B5_OES:

From 23c9cd5a9613ad07bdbe55708a56562ee23f25d5 Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Tue, 28 Apr 2015 15:08:32 -0700
Subject: [PATCH 07/82] mesa/texcompress: enable translation between MESA and
 GL ASTC formats

v3. conform the ASTC MESA_FORMAT enums to the existing naming convention.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/main/texcompress.c | 114 ++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)

diff --git a/src/mesa/main/texcompress.c b/src/mesa/main/texcompress.c
index c028daa46e4..bb94137a940 100644
--- a/src/mesa/main/texcompress.c
+++ b/src/mesa/main/texcompress.c
@@ -471,6 +471,63 @@ _mesa_glenum_to_compressed_format(GLenum format)
    case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
       return MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT;
 
+   case GL_COMPRESSED_RGBA_ASTC_4x4_KHR:
+      return MESA_FORMAT_RGBA_ASTC_4x4;
+   case GL_COMPRESSED_RGBA_ASTC_5x4_KHR:
+      return MESA_FORMAT_RGBA_ASTC_5x4;
+   case GL_COMPRESSED_RGBA_ASTC_5x5_KHR:
+      return MESA_FORMAT_RGBA_ASTC_5x5;
+   case GL_COMPRESSED_RGBA_ASTC_6x5_KHR:
+      return MESA_FORMAT_RGBA_ASTC_6x5;
+   case GL_COMPRESSED_RGBA_ASTC_6x6_KHR:
+      return MESA_FORMAT_RGBA_ASTC_6x6;
+   case GL_COMPRESSED_RGBA_ASTC_8x5_KHR:
+      return MESA_FORMAT_RGBA_ASTC_8x5;
+   case GL_COMPRESSED_RGBA_ASTC_8x6_KHR:
+      return MESA_FORMAT_RGBA_ASTC_8x6;
+   case GL_COMPRESSED_RGBA_ASTC_8x8_KHR:
+      return MESA_FORMAT_RGBA_ASTC_8x8;
+   case GL_COMPRESSED_RGBA_ASTC_10x5_KHR:
+      return MESA_FORMAT_RGBA_ASTC_10x5;
+   case GL_COMPRESSED_RGBA_ASTC_10x6_KHR:
+      return MESA_FORMAT_RGBA_ASTC_10x6;
+   case GL_COMPRESSED_RGBA_ASTC_10x8_KHR:
+      return MESA_FORMAT_RGBA_ASTC_10x8;
+   case GL_COMPRESSED_RGBA_ASTC_10x10_KHR:
+      return MESA_FORMAT_RGBA_ASTC_10x10;
+   case GL_COMPRESSED_RGBA_ASTC_12x10_KHR:
+      return MESA_FORMAT_RGBA_ASTC_12x10;
+   case GL_COMPRESSED_RGBA_ASTC_12x12_KHR:
+      return MESA_FORMAT_RGBA_ASTC_12x12;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10;
+   case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR:
+      return MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12;
+
    default:
       return MESA_FORMAT_NONE;
    }
@@ -561,6 +618,63 @@ _mesa_compressed_format_to_glenum(struct gl_context *ctx, mesa_format mesaFormat
    case MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT:
       return GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT;
 
+   case MESA_FORMAT_RGBA_ASTC_4x4:
+      return GL_COMPRESSED_RGBA_ASTC_4x4_KHR;
+   case MESA_FORMAT_RGBA_ASTC_5x4:
+      return GL_COMPRESSED_RGBA_ASTC_5x4_KHR;
+   case MESA_FORMAT_RGBA_ASTC_5x5:
+      return GL_COMPRESSED_RGBA_ASTC_5x5_KHR;
+   case MESA_FORMAT_RGBA_ASTC_6x5:
+      return GL_COMPRESSED_RGBA_ASTC_6x5_KHR;
+   case MESA_FORMAT_RGBA_ASTC_6x6:
+      return GL_COMPRESSED_RGBA_ASTC_6x6_KHR;
+   case MESA_FORMAT_RGBA_ASTC_8x5:
+      return GL_COMPRESSED_RGBA_ASTC_8x5_KHR;
+   case MESA_FORMAT_RGBA_ASTC_8x6:
+      return GL_COMPRESSED_RGBA_ASTC_8x6_KHR;
+   case MESA_FORMAT_RGBA_ASTC_8x8:
+      return GL_COMPRESSED_RGBA_ASTC_8x8_KHR;
+   case MESA_FORMAT_RGBA_ASTC_10x5:
+      return GL_COMPRESSED_RGBA_ASTC_10x5_KHR;
+   case MESA_FORMAT_RGBA_ASTC_10x6:
+      return GL_COMPRESSED_RGBA_ASTC_10x6_KHR;
+   case MESA_FORMAT_RGBA_ASTC_10x8:
+      return GL_COMPRESSED_RGBA_ASTC_10x8_KHR;
+   case MESA_FORMAT_RGBA_ASTC_10x10:
+      return GL_COMPRESSED_RGBA_ASTC_10x10_KHR;
+   case MESA_FORMAT_RGBA_ASTC_12x10:
+      return GL_COMPRESSED_RGBA_ASTC_12x10_KHR;
+   case MESA_FORMAT_RGBA_ASTC_12x12:
+      return GL_COMPRESSED_RGBA_ASTC_12x12_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR;
+   case MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12:
+      return GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR;
+
    default:
       _mesa_problem(ctx, "Unexpected mesa texture format in"
                     " _mesa_compressed_format_to_glenum()");

From 12b519b4571d27a45abd3266f35b126d00dcb926 Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Mon, 27 Jul 2015 16:09:09 -0700
Subject: [PATCH 08/82] mesa/teximage: accept ASTC formats for 3D texture
 specification

The ASTC spec was revised as follows:

   Revision 2, April 28, 2015 - added CompressedTex{Sub,}Image3D to
   commands accepting ASTC format tokens in the New Tokens section [...].

Support only exists in the HDR submode:

   Add a second new column "3D Tex." which is empty for all non-ASTC
   formats. If only the LDR profile is supported by the implementation,
   this column is also empty for all ASTC formats. If both the LDR and HDR
   profiles are supported only, this column is checked for all ASTC
   formats.

LDR-only systems should generate an INVALID_OPERATION error when
attempting to call CompressedTexImage3D with the TEXTURE_3D target.

v2. return the proper error for LDR-only systems.
v3. update is_astc_format().
v4. use _mesa_is_astc_format().
v5. place logic in _mesa_target_can_be_compressed.
v6. fix issues handling ASTC formats.

Reviewed-by: Chad Versace <chad.versace@intel.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/main/teximage.c | 63 +++++++++++++++++++++++++++++++++++-----
 1 file changed, 56 insertions(+), 7 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 0a641cf2cad..56ae4150a6f 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -1854,19 +1854,68 @@ _mesa_target_can_be_compressed(const struct gl_context *ctx, GLenum target,
             return write_error(error, GL_INVALID_OPERATION);
 
       target_can_be_compresed = ctx->Extensions.ARB_texture_cube_map_array;
-      break;
-   case GL_TEXTURE_3D:
 
-      /* See ETC2/EAC comment in switch case GL_TEXTURE_CUBE_MAP_ARRAY. */
-      if (layout == MESA_FORMAT_LAYOUT_ETC2 && _mesa_is_gles3(ctx))
+      /* From the KHR_texture_compression_astc_hdr spec:
+       *
+       *     Add a second new column "3D Tex." which is empty for all non-ASTC
+       *     formats. If only the LDR profile is supported by the
+       *     implementation, this column is also empty for all ASTC formats. If
+       *     both the LDR and HDR profiles are supported only, this column is
+       *     checked for all ASTC formats.
+       *
+       *     Add a third new column "Cube Map Array Tex." which is empty for all
+       *     non-ASTC formats, and checked for all ASTC formats.
+       *
+       * and,
+       *
+       *     'An INVALID_OPERATION error is generated by CompressedTexImage3D
+       *      if <internalformat> is TEXTURE_CUBE_MAP_ARRAY and the
+       *      "Cube Map Array" column of table 8.19 is *not* checked, or if
+       *      <internalformat> is TEXTURE_3D and the "3D Tex." column of table
+       *      8.19 is *not* checked'
+       *
+       * The instances of <internalformat> above should say <target>.
+       */
+
+      /* Throw an INVALID_OPERATION error if the target is
+       * TEXTURE_CUBE_MAP_ARRAY and the format is not ASTC.
+       */
+      if (target_can_be_compresed &&
+          ctx->Extensions.KHR_texture_compression_astc_ldr &&
+          layout != MESA_FORMAT_LAYOUT_ASTC)
          return write_error(error, GL_INVALID_OPERATION);
 
-      if (layout == MESA_FORMAT_LAYOUT_BPTC) {
+      break;
+   case GL_TEXTURE_3D:
+      switch (layout) {
+      case MESA_FORMAT_LAYOUT_ETC2:
+         /* See ETC2/EAC comment in case GL_TEXTURE_CUBE_MAP_ARRAY. */
+         if (_mesa_is_gles3(ctx))
+            return write_error(error, GL_INVALID_OPERATION);
+         break;
+      case MESA_FORMAT_LAYOUT_BPTC:
          target_can_be_compresed = ctx->Extensions.ARB_texture_compression_bptc;
          break;
-      }
+      case MESA_FORMAT_LAYOUT_ASTC:
+         target_can_be_compresed =
+                             ctx->Extensions.KHR_texture_compression_astc_hdr;
 
-      break;
+         /* Throw an INVALID_OPERATION error if the target is TEXTURE_3D and
+          * and the hdr extension is not supported.
+          * See comment in switch case GL_TEXTURE_CUBE_MAP_ARRAY for more info.
+          */
+         if (!target_can_be_compresed)
+            return write_error(error, GL_INVALID_OPERATION);
+         break;
+      default:
+         /* Throw an INVALID_OPERATION error if the target is TEXTURE_3D and
+          * the format is not ASTC.
+          * See comment in switch case GL_TEXTURE_CUBE_MAP_ARRAY for more info.
+          */
+         if (ctx->Extensions.KHR_texture_compression_astc_ldr)
+            return write_error(error, GL_INVALID_OPERATION);
+         break;
+      }
    default:
       break;
    }

From cd49b97a8a2c0dd8dc1d7f32b86f519e936571fd Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Tue, 28 Apr 2015 15:10:11 -0700
Subject: [PATCH 09/82] mesa/teximage: return the base internal format of the
 ASTC formats

This is necessary to initialize the gl_texture_image struct.

From the KHR_texture_compression_astc_ldr spec:
  "Added to Section 3.8.6, Compressed Texture Images

   Add the tokens specified above to Table 3.16, Compressed Internal Formats.
   In all cases, the base internal format will be RGBA. The encoding allows
   images to be encoded with fewer channels, but this is always presented as
   RGBA to the sampler."

v2. use _mesa_is_astc_format().

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/main/teximage.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 56ae4150a6f..0535db35f26 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -565,6 +565,10 @@ _mesa_base_tex_format( struct gl_context *ctx, GLint internalFormat )
       }
    }
 
+   if (ctx->Extensions.KHR_texture_compression_astc_ldr &&
+       _mesa_is_astc_format(internalFormat))
+         return GL_RGBA;
+
    if (_mesa_is_gles3(ctx) || ctx->Extensions.ARB_ES3_compatibility) {
       switch (internalFormat) {
       case GL_COMPRESSED_RGB8_ETC2:

From 8b1f008e9acf94645a28c27fa261f6450a3edb84 Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Wed, 15 Apr 2015 14:15:10 -0700
Subject: [PATCH 10/82] i965/surface_formats: add support for 2D ASTC surface
 formats

Define two-thirds of the 2D Intel ASTC surface formats (LDR-only). This allows
a 1-to-1 mapping from the mesa format to the Intel format.

ASTC textures will default to being processed in LDR mode. If there is
hardware support for HDR/Full mode and the texture is not sRGB, add the
format bit necessary to process it in HDR/Full mode.

v2: remove extra newlines.
v3: follow existing coding style in translate_tex_format().
v4: expound on the GEN9_SURFACE_ASTC_HDR_FORMAT_BIT comment.
    update SF table - ASTC is actually supported in Gen8.
v5: conform the ASTC MESA_FORMAT enums to the existing naming convention.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/drivers/dri/i965/brw_defines.h       | 32 +++++++
 .../drivers/dri/i965/brw_surface_formats.c    | 87 +++++++++++++++++++
 2 files changed, 119 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 82a36357de9..cb5c82a002d 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -504,6 +504,38 @@
 #define BRW_SURFACEFORMAT_R8G8B8_UINT                    0x1C8
 #define BRW_SURFACEFORMAT_R8G8B8_SINT                    0x1C9
 #define BRW_SURFACEFORMAT_RAW                            0x1FF
+
+#define GEN9_SURFACE_ASTC_HDR_FORMAT_BIT                 0x100
+
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_4x4_U8sRGB         0x200
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_5x4_U8sRGB         0x208
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_5x5_U8sRGB         0x209
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_6x5_U8sRGB         0x211
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_6x6_U8sRGB         0x212
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_8x5_U8sRGB         0x221
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_8x6_U8sRGB         0x222
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_8x8_U8sRGB         0x224
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_10x5_U8sRGB        0x231
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_10x6_U8sRGB        0x232
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_10x8_U8sRGB        0x234
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_10x10_U8sRGB       0x236
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_12x10_U8sRGB       0x23E
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_12x12_U8sRGB       0x23F
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_4x4_FLT16          0x240
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_5x4_FLT16          0x248
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_5x5_FLT16          0x249
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_6x5_FLT16          0x251
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_6x6_FLT16          0x252
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_8x5_FLT16          0x261
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_8x6_FLT16          0x262
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_8x8_FLT16          0x264
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_10x5_FLT16         0x271
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_10x6_FLT16         0x272
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_10x8_FLT16         0x274
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_10x10_FLT16        0x276
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_12x10_FLT16        0x27E
+#define BRW_SURFACEFORMAT_ASTC_LDR_2D_12x12_FLT16        0x27F
+
 #define BRW_SURFACE_FORMAT_SHIFT	18
 #define BRW_SURFACE_FORMAT_MASK		INTEL_MASK(26, 18)
 
diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c b/src/mesa/drivers/dri/i965/brw_surface_formats.c
index a33fd88a026..97fff60f3e5 100644
--- a/src/mesa/drivers/dri/i965/brw_surface_formats.c
+++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c
@@ -307,6 +307,34 @@ const struct surface_format_info surface_formats[] = {
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC2_EAC_SRGB8_A8)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R8G8B8_UINT)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R8G8B8_SINT)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_4x4_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_5x4_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_5x5_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_6x5_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_6x6_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_8x5_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_8x6_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_8x8_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_10x5_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_10x6_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_10x8_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_10x10_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_12x10_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_12x12_FLT16)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_4x4_U8sRGB)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_5x4_U8sRGB)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_5x5_U8sRGB)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_6x5_U8sRGB)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_6x6_U8sRGB)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_8x5_U8sRGB)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_8x6_U8sRGB)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_8x8_U8sRGB)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_10x5_U8sRGB)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_10x6_U8sRGB)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_10x8_U8sRGB)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_10x10_U8sRGB)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_12x10_U8sRGB)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x, ASTC_LDR_2D_12x12_U8sRGB)
 };
 #undef x
 #undef Y
@@ -503,6 +531,35 @@ brw_format_for_mesa_format(mesa_format mesa_format)
       [MESA_FORMAT_BPTC_RGB_SIGNED_FLOAT] = BRW_SURFACEFORMAT_BC6H_SF16,
       [MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT] = BRW_SURFACEFORMAT_BC6H_UF16,
 
+      [MESA_FORMAT_RGBA_ASTC_4x4]           = BRW_SURFACEFORMAT_ASTC_LDR_2D_4x4_FLT16,
+      [MESA_FORMAT_RGBA_ASTC_5x4]           = BRW_SURFACEFORMAT_ASTC_LDR_2D_5x4_FLT16,
+      [MESA_FORMAT_RGBA_ASTC_5x5]           = BRW_SURFACEFORMAT_ASTC_LDR_2D_5x5_FLT16,
+      [MESA_FORMAT_RGBA_ASTC_6x5]           = BRW_SURFACEFORMAT_ASTC_LDR_2D_6x5_FLT16,
+      [MESA_FORMAT_RGBA_ASTC_6x6]           = BRW_SURFACEFORMAT_ASTC_LDR_2D_6x6_FLT16,
+      [MESA_FORMAT_RGBA_ASTC_8x5]           = BRW_SURFACEFORMAT_ASTC_LDR_2D_8x5_FLT16,
+      [MESA_FORMAT_RGBA_ASTC_8x6]           = BRW_SURFACEFORMAT_ASTC_LDR_2D_8x6_FLT16,
+      [MESA_FORMAT_RGBA_ASTC_8x8]           = BRW_SURFACEFORMAT_ASTC_LDR_2D_8x8_FLT16,
+      [MESA_FORMAT_RGBA_ASTC_10x5]          = BRW_SURFACEFORMAT_ASTC_LDR_2D_10x5_FLT16,
+      [MESA_FORMAT_RGBA_ASTC_10x6]          = BRW_SURFACEFORMAT_ASTC_LDR_2D_10x6_FLT16,
+      [MESA_FORMAT_RGBA_ASTC_10x8]          = BRW_SURFACEFORMAT_ASTC_LDR_2D_10x8_FLT16,
+      [MESA_FORMAT_RGBA_ASTC_10x10]         = BRW_SURFACEFORMAT_ASTC_LDR_2D_10x10_FLT16,
+      [MESA_FORMAT_RGBA_ASTC_12x10]         = BRW_SURFACEFORMAT_ASTC_LDR_2D_12x10_FLT16,
+      [MESA_FORMAT_RGBA_ASTC_12x12]         = BRW_SURFACEFORMAT_ASTC_LDR_2D_12x12_FLT16,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_4x4]   = BRW_SURFACEFORMAT_ASTC_LDR_2D_4x4_U8sRGB,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x4]   = BRW_SURFACEFORMAT_ASTC_LDR_2D_5x4_U8sRGB,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_5x5]   = BRW_SURFACEFORMAT_ASTC_LDR_2D_5x5_U8sRGB,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x5]   = BRW_SURFACEFORMAT_ASTC_LDR_2D_6x5_U8sRGB,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_6x6]   = BRW_SURFACEFORMAT_ASTC_LDR_2D_6x6_U8sRGB,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x5]   = BRW_SURFACEFORMAT_ASTC_LDR_2D_8x5_U8sRGB,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x6]   = BRW_SURFACEFORMAT_ASTC_LDR_2D_8x6_U8sRGB,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_8x8]   = BRW_SURFACEFORMAT_ASTC_LDR_2D_8x8_U8sRGB,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x5]  = BRW_SURFACEFORMAT_ASTC_LDR_2D_10x5_U8sRGB,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x6]  = BRW_SURFACEFORMAT_ASTC_LDR_2D_10x6_U8sRGB,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x8]  = BRW_SURFACEFORMAT_ASTC_LDR_2D_10x8_U8sRGB,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_10x10] = BRW_SURFACEFORMAT_ASTC_LDR_2D_10x10_U8sRGB,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x10] = BRW_SURFACEFORMAT_ASTC_LDR_2D_12x10_U8sRGB,
+      [MESA_FORMAT_SRGB8_ALPHA8_ASTC_12x12] = BRW_SURFACEFORMAT_ASTC_LDR_2D_12x12_U8sRGB,
+
       [MESA_FORMAT_A_SNORM8] = 0,
       [MESA_FORMAT_L_SNORM8] = 0,
       [MESA_FORMAT_L8A8_SNORM] = 0,
@@ -768,6 +825,36 @@ translate_tex_format(struct brw_context *brw,
       }
       return brw_format_for_mesa_format(mesa_format);
 
+   case MESA_FORMAT_RGBA_ASTC_4x4:
+   case MESA_FORMAT_RGBA_ASTC_5x4:
+   case MESA_FORMAT_RGBA_ASTC_5x5:
+   case MESA_FORMAT_RGBA_ASTC_6x5:
+   case MESA_FORMAT_RGBA_ASTC_6x6:
+   case MESA_FORMAT_RGBA_ASTC_8x5:
+   case MESA_FORMAT_RGBA_ASTC_8x6:
+   case MESA_FORMAT_RGBA_ASTC_8x8:
+   case MESA_FORMAT_RGBA_ASTC_10x5:
+   case MESA_FORMAT_RGBA_ASTC_10x6:
+   case MESA_FORMAT_RGBA_ASTC_10x8:
+   case MESA_FORMAT_RGBA_ASTC_10x10:
+   case MESA_FORMAT_RGBA_ASTC_12x10:
+   case MESA_FORMAT_RGBA_ASTC_12x12: {
+      GLuint brw_fmt = brw_format_for_mesa_format(mesa_format);
+
+      /**
+       * On Gen9+, it is possible to process these formats using the LDR
+       * Profile or the Full Profile mode of the hardware. Because, it isn't
+       * possible to determine if an HDR or LDR texture is being rendered, we
+       * can't determine which mode to enable in the hardware. Therefore, to
+       * handle all cases, always default to Full profile unless we are
+       * processing sRGBs, which are incompatible with this mode.
+       */
+      if (brw->gen >= 9)
+         brw_fmt |= GEN9_SURFACE_ASTC_HDR_FORMAT_BIT;
+
+      return brw_fmt;
+   }
+
    default:
       assert(brw_format_for_mesa_format(mesa_format) != 0);
       return brw_format_for_mesa_format(mesa_format);

From 97f4efd573aed7ffc0ea9395f4e69ccdeb5041f6 Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Wed, 27 May 2015 13:25:30 -0700
Subject: [PATCH 11/82] mesa/macros: add power-of-two assertions for alignment
 macros

The comments for ALIGN and ROUND_DOWN_TO both require that the alignment value
passed into the macro be a power of two. Adding software assertions verifies
this to be the case.

v2: use static inline functions instead of gcc-specific statement expressions (Brian).
v3: fix indentation (Brian).
v4: add greater than zero requirement (Anuj).

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp |  2 +-
 src/mesa/main/macros.h                   | 14 ++++++++++++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index a62dbb8b0ad..430efb3021d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -131,7 +131,7 @@ fs_visitor::nir_setup_outputs(nir_shader *shader)
 
       switch (stage) {
       case MESA_SHADER_VERTEX:
-         for (int i = 0; i < ALIGN(type_size_scalar(var->type), 4) / 4; i++) {
+         for (unsigned int i = 0; i < ALIGN(type_size_scalar(var->type), 4) / 4; i++) {
             int output = var->data.location + i;
             this->outputs[output] = offset(reg, bld, 4 * i);
             this->output_components[output] = vector_elements;
diff --git a/src/mesa/main/macros.h b/src/mesa/main/macros.h
index 54df50c9cfe..c3ef42a4282 100644
--- a/src/mesa/main/macros.h
+++ b/src/mesa/main/macros.h
@@ -690,7 +690,12 @@ minify(unsigned value, unsigned levels)
  *
  * \sa ROUND_DOWN_TO()
  */
-#define ALIGN(value, alignment)  (((value) + (alignment) - 1) & ~((alignment) - 1))
+static inline uintptr_t
+ALIGN(uintptr_t value, int32_t alignment)
+{
+   assert((alignment > 0) && _mesa_is_pow_two(alignment));
+   return (((value) + (alignment) - 1) & ~((alignment) - 1));
+}
 
 /**
  * Align a value down to an alignment value
@@ -703,7 +708,12 @@ minify(unsigned value, unsigned levels)
  *
  * \sa ALIGN()
  */
-#define ROUND_DOWN_TO(value, alignment) ((value) & ~(alignment - 1))
+static inline uintptr_t
+ROUND_DOWN_TO(uintptr_t value, int32_t alignment)
+{
+   assert((alignment > 0) && _mesa_is_pow_two(alignment));
+   return ((value) & ~(alignment - 1));
+}
 
 
 /** Cross product of two 3-element vectors */

From 54d2aa4258f0bfcc669b2bc4e82332f7ff4876dd Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Tue, 2 Jun 2015 11:03:22 -0700
Subject: [PATCH 12/82] mesa/macros: move ALIGN_NPOT to macros.h

Aligning with a non-power-of-two number is a general task that can be used in
various places. This commit is required for the next one.

v2: add greater than 0 assertion (Anuj).
    convert the macro to a static inline function.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/drivers/dri/i965/intel_upload.c |  6 ------
 src/mesa/main/macros.h                   | 10 ++++++++++
 2 files changed, 10 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_upload.c b/src/mesa/drivers/dri/i965/intel_upload.c
index 870aabc8863..deaae6c7ed5 100644
--- a/src/mesa/drivers/dri/i965/intel_upload.c
+++ b/src/mesa/drivers/dri/i965/intel_upload.c
@@ -44,12 +44,6 @@
 
 #define INTEL_UPLOAD_SIZE (64*1024)
 
-/**
- * Like ALIGN(), but works with a non-power-of-two alignment.
- */
-#define ALIGN_NPOT(value, alignment) \
-   (((value) + (alignment) - 1) / (alignment) * (alignment))
-
 void
 intel_upload_finish(struct brw_context *brw)
 {
diff --git a/src/mesa/main/macros.h b/src/mesa/main/macros.h
index c3ef42a4282..ed207d44a64 100644
--- a/src/mesa/main/macros.h
+++ b/src/mesa/main/macros.h
@@ -697,6 +697,16 @@ ALIGN(uintptr_t value, int32_t alignment)
    return (((value) + (alignment) - 1) & ~((alignment) - 1));
 }
 
+/**
+ * Like ALIGN(), but works with a non-power-of-two alignment.
+ */
+static inline uintptr_t
+ALIGN_NPOT(uintptr_t value, int32_t alignment)
+{
+   assert(alignment > 0);
+   return (value + alignment - 1) / alignment * alignment;
+}
+
 /**
  * Align a value down to an alignment value
  *

From 10ff64fd3d19bc9da793fa43eb746c29608bfddd Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Thu, 21 May 2015 14:27:55 -0700
Subject: [PATCH 13/82] i965: use ALIGN_NPOT for setting ASTC mipmap layouts

ALIGN is changed to ALIGN_NPOT because alignment values are sometimes not
powers of two when working with ASTC.

v2: handle texture arrays and LDR-only systems.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c    | 26 +++++++++----------
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c |  4 +--
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index e96732a1908..1d8eb090556 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -367,7 +367,7 @@ brw_miptree_layout_2d(struct intel_mipmap_tree *mt)
    mt->total_width = mt->physical_width0;
 
    if (mt->compressed)
-       mt->total_width = ALIGN(mt->total_width, bw);
+       mt->total_width = ALIGN_NPOT(mt->total_width, bw);
 
    /* May need to adjust width to accommodate the placement of
     * the 2nd mipmap.  This occurs when the alignment
@@ -378,10 +378,10 @@ brw_miptree_layout_2d(struct intel_mipmap_tree *mt)
        unsigned mip1_width;
 
        if (mt->compressed) {
-          mip1_width = ALIGN(minify(mt->physical_width0, 1), mt->align_w) +
-             ALIGN(minify(mt->physical_width0, 2), bw);
+          mip1_width = ALIGN_NPOT(minify(mt->physical_width0, 1), mt->align_w) +
+             ALIGN_NPOT(minify(mt->physical_width0, 2), bw);
        } else {
-          mip1_width = ALIGN(minify(mt->physical_width0, 1), mt->align_w) +
+          mip1_width = ALIGN_NPOT(minify(mt->physical_width0, 1), mt->align_w) +
              minify(mt->physical_width0, 2);
        }
 
@@ -397,7 +397,7 @@ brw_miptree_layout_2d(struct intel_mipmap_tree *mt)
 
       intel_miptree_set_level_info(mt, level, x, y, depth);
 
-      img_height = ALIGN(height, mt->align_h);
+      img_height = ALIGN_NPOT(height, mt->align_h);
       if (mt->compressed)
 	 img_height /= bh;
 
@@ -414,7 +414,7 @@ brw_miptree_layout_2d(struct intel_mipmap_tree *mt)
       /* Layout_below: step right after second mipmap.
        */
       if (level == mt->first_level + 1) {
-	 x += ALIGN(width, mt->align_w);
+	 x += ALIGN_NPOT(width, mt->align_w);
       } else {
 	 y += img_height;
       }
@@ -434,7 +434,7 @@ brw_miptree_get_horizontal_slice_pitch(const struct brw_context *brw,
 {
    if ((brw->gen < 9 && mt->target == GL_TEXTURE_3D) ||
        (brw->gen == 4 && mt->target == GL_TEXTURE_CUBE_MAP)) {
-      return ALIGN(minify(mt->physical_width0, level), mt->align_w);
+      return ALIGN_NPOT(minify(mt->physical_width0, level), mt->align_w);
    } else {
       return 0;
    }
@@ -475,11 +475,11 @@ brw_miptree_get_vertical_slice_pitch(const struct brw_context *brw,
    } else if (mt->target == GL_TEXTURE_3D ||
               (brw->gen == 4 && mt->target == GL_TEXTURE_CUBE_MAP) ||
               mt->array_layout == ALL_SLICES_AT_EACH_LOD) {
-      return ALIGN(minify(mt->physical_height0, level), mt->align_h);
+      return ALIGN_NPOT(minify(mt->physical_height0, level), mt->align_h);
 
    } else {
-      const unsigned h0 = ALIGN(mt->physical_height0, mt->align_h);
-      const unsigned h1 = ALIGN(minify(mt->physical_height0, 1), mt->align_h);
+      const unsigned h0 = ALIGN_NPOT(mt->physical_height0, mt->align_h);
+      const unsigned h1 = ALIGN_NPOT(minify(mt->physical_height0, 1), mt->align_h);
 
       return h0 + h1 + (brw->gen >= 7 ? 12 : 11) * mt->align_h;
    }
@@ -551,7 +551,7 @@ brw_miptree_layout_texture_array(struct brw_context *brw,
 
    for (unsigned level = mt->first_level; level <= mt->last_level; level++) {
       unsigned img_height;
-      img_height = ALIGN(height, mt->align_h);
+      img_height = ALIGN_NPOT(height, mt->align_h);
       if (mt->compressed)
          img_height /= mt->align_h;
 
@@ -584,8 +584,8 @@ brw_miptree_layout_texture_3d(struct brw_context *brw,
       unsigned WL = MAX2(mt->physical_width0 >> level, 1);
       unsigned HL = MAX2(mt->physical_height0 >> level, 1);
       unsigned DL = MAX2(mt->physical_depth0 >> level, 1);
-      unsigned wL = ALIGN(WL, mt->align_w);
-      unsigned hL = ALIGN(HL, mt->align_h);
+      unsigned wL = ALIGN_NPOT(WL, mt->align_w);
+      unsigned hL = ALIGN_NPOT(HL, mt->align_h);
 
       if (mt->target == GL_TEXTURE_CUBE_MAP)
          DL = 6;
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index e85c3f00c7b..44eb91327d3 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -1272,8 +1272,8 @@ intel_miptree_copy_slice(struct brw_context *brw,
    if (dst_mt->compressed) {
       unsigned int i, j;
       _mesa_get_format_block_size(dst_mt->format, &i, &j);
-      height = ALIGN(height, j) / j;
-      width = ALIGN(width, i);
+      height = ALIGN_NPOT(height, j) / j;
+      width = ALIGN_NPOT(width, i);
    }
 
    /* If it's a packed depth/stencil buffer with separate stencil, the blit

From 1a9ceed4ba764cf73a643f8f2135b5b84cfe4581 Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Thu, 18 Jun 2015 11:02:17 -0700
Subject: [PATCH 14/82] i965: correct mt->align_h for 2D textures on Skylake

In agreement with commit 4ab8d59a23, vertical alignment values are equal to
four times the block height on Gen9+.

v2: add newlines to separate declarations, statements, and comments.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Reviewed-by: Neil Roberts <neil@linux.intel.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 1d8eb090556..e8a92dde8a9 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -270,9 +270,14 @@ intel_vertical_texture_alignment_unit(struct brw_context *brw,
     * Where "*" means either VALIGN_2 or VALIGN_4 depending on the setting of
     * the SURFACE_STATE "Surface Vertical Alignment" field.
     */
-   if (_mesa_is_format_compressed(mt->format))
-      /* See comment above for the horizontal alignment */
-      return brw->gen >= 9 ? 16 : 4;
+    if (_mesa_is_format_compressed(mt->format)) {
+       unsigned int i, j;
+
+       _mesa_get_format_block_size(mt->format, &i, &j);
+
+       /* See comment above for the horizontal alignment */
+       return brw->gen >= 9 ? j * 4 : 4;
+    }
 
    if (mt->format == MESA_FORMAT_S_UINT8)
       return brw->gen >= 7 ? 8 : 4;

From a6877341358e1534e74dd9e5fc72934a20b78228 Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Thu, 21 May 2015 14:27:55 -0700
Subject: [PATCH 15/82] i965: change the meaning of cpp for compressed textures

An ASTC block takes up 16 bytes for all block width and height configurations.
This size is not integrally divisible by all ASTC block widths. Therefore cpp
is changed to mean bytes per block if the texture is compressed.

Because the original definition was bytes per block divided by block width, all
references to the mipmap width must be divided by the block width. This keeps the
address calculation formulas consistent. For example, the units for miptree_level
x_offset and miptree total_width have changed from pixels to blocks.

v2: reuse preexisting ALIGN_NPOT macro located in an i965 driver file.
v3: move ALIGN_NPOT into a separate commit.
    simplify cpp assignment in copy_image_with_blitter().
    update miptree width and offset variables in: intel_miptree_copy_slice(),
        intel_miptree_map_gtt(), and brw_miptree_layout_texture_3d().

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c    | 15 +++++++++------
 src/mesa/drivers/dri/i965/intel_copy_image.c  | 19 +------------------
 src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 14 ++++----------
 src/mesa/drivers/dri/i965/intel_mipmap_tree.h |  2 +-
 4 files changed, 15 insertions(+), 35 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index e8a92dde8a9..a95ac95f15d 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -395,6 +395,7 @@ brw_miptree_layout_2d(struct intel_mipmap_tree *mt)
        }
    }
 
+   mt->total_width /= bw;
    mt->total_height = 0;
 
    for (unsigned level = mt->first_level; level <= mt->last_level; level++) {
@@ -419,7 +420,7 @@ brw_miptree_layout_2d(struct intel_mipmap_tree *mt)
       /* Layout_below: step right after second mipmap.
        */
       if (level == mt->first_level + 1) {
-	 x += ALIGN_NPOT(width, mt->align_w);
+	 x += ALIGN_NPOT(width, mt->align_w) / bw;
       } else {
 	 y += img_height;
       }
@@ -579,12 +580,14 @@ static void
 brw_miptree_layout_texture_3d(struct brw_context *brw,
                               struct intel_mipmap_tree *mt)
 {
-   unsigned yscale = mt->compressed ? 4 : 1;
-
    mt->total_width = 0;
    mt->total_height = 0;
 
    unsigned ysum = 0;
+   unsigned bh, bw;
+
+   _mesa_get_format_block_size(mt->format, &bw, &bh);
+
    for (unsigned level = mt->first_level; level <= mt->last_level; level++) {
       unsigned WL = MAX2(mt->physical_width0 >> level, 1);
       unsigned HL = MAX2(mt->physical_height0 >> level, 1);
@@ -601,9 +604,9 @@ brw_miptree_layout_texture_3d(struct brw_context *brw,
          unsigned x = (q % (1 << level)) * wL;
          unsigned y = ysum + (q >> level) * hL;
 
-         intel_miptree_set_image_offset(mt, level, q, x, y / yscale);
-         mt->total_width = MAX2(mt->total_width, x + wL);
-         mt->total_height = MAX2(mt->total_height, (y + hL) / yscale);
+         intel_miptree_set_image_offset(mt, level, q, x / bw, y / bh);
+         mt->total_width = MAX2(mt->total_width, (x + wL) / bw);
+         mt->total_height = MAX2(mt->total_height, (y + hL) / bh);
       }
 
       ysum += ALIGN(DL, 1 << level) / (1 << level) * hL;
diff --git a/src/mesa/drivers/dri/i965/intel_copy_image.c b/src/mesa/drivers/dri/i965/intel_copy_image.c
index 3706704bf1a..ac2738f59a0 100644
--- a/src/mesa/drivers/dri/i965/intel_copy_image.c
+++ b/src/mesa/drivers/dri/i965/intel_copy_image.c
@@ -41,7 +41,6 @@ copy_image_with_blitter(struct brw_context *brw,
 {
    GLuint bw, bh;
    uint32_t src_image_x, src_image_y, dst_image_x, dst_image_y;
-   int cpp;
 
    /* The blitter doesn't understand multisampling at all. */
    if (src_mt->num_samples > 0 || dst_mt->num_samples > 0)
@@ -86,16 +85,6 @@ copy_image_with_blitter(struct brw_context *brw,
       src_y /= (int)bh;
       src_width /= (int)bw;
       src_height /= (int)bh;
-
-      /* Inside of the miptree, the x offsets are stored in pixels while
-       * the y offsets are stored in blocks.  We need to scale just the x
-       * offset.
-       */
-      src_image_x /= bw;
-
-      cpp = _mesa_get_format_bytes(src_mt->format);
-   } else {
-      cpp = src_mt->cpp;
    }
    src_x += src_image_x;
    src_y += src_image_y;
@@ -111,18 +100,12 @@ copy_image_with_blitter(struct brw_context *brw,
 
       dst_x /= (int)bw;
       dst_y /= (int)bh;
-
-      /* Inside of the miptree, the x offsets are stored in pixels while
-       * the y offsets are stored in blocks.  We need to scale just the x
-       * offset.
-       */
-      dst_image_x /= bw;
    }
    dst_x += dst_image_x;
    dst_y += dst_image_y;
 
    return intelEmitCopyBlit(brw,
-                            cpp,
+                            src_mt->cpp,
                             src_mt->pitch,
                             src_mt->bo, src_mt->offset,
                             src_mt->tiling,
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 44eb91327d3..0bcbbbcde8f 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -313,15 +313,7 @@ intel_miptree_create_layout(struct brw_context *brw,
    mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_NO_MCS;
    mt->disable_aux_buffers = (layout_flags & MIPTREE_LAYOUT_DISABLE_AUX) != 0;
    exec_list_make_empty(&mt->hiz_map);
-
-   /* The cpp is bytes per (1, blockheight)-sized block for compressed
-    * textures.  This is why you'll see divides by blockheight all over
-    */
-   unsigned bw, bh;
-   _mesa_get_format_block_size(format, &bw, &bh);
-   assert(_mesa_get_format_bytes(mt->format) % bw == 0);
-   mt->cpp = _mesa_get_format_bytes(mt->format) / bw;
-
+   mt->cpp = _mesa_get_format_bytes(format);
    mt->num_samples = num_samples;
    mt->compressed = _mesa_is_format_compressed(format);
    mt->msaa_layout = INTEL_MSAA_LAYOUT_NONE;
@@ -1273,7 +1265,7 @@ intel_miptree_copy_slice(struct brw_context *brw,
       unsigned int i, j;
       _mesa_get_format_block_size(dst_mt->format, &i, &j);
       height = ALIGN_NPOT(height, j) / j;
-      width = ALIGN_NPOT(width, i);
+      width = ALIGN_NPOT(width, i) / i;
    }
 
    /* If it's a packed depth/stencil buffer with separate stencil, the blit
@@ -2105,7 +2097,9 @@ intel_miptree_map_gtt(struct brw_context *brw,
     */
    _mesa_get_format_block_size(mt->format, &bw, &bh);
    assert(y % bh == 0);
+   assert(x % bw == 0);
    y /= bh;
+   x /= bw;
 
    base = intel_miptree_map_raw(brw, mt) + mt->offset;
 
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index 790d3129207..c28162a1983 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -390,7 +390,7 @@ struct intel_mipmap_tree
     */
    GLuint physical_width0, physical_height0, physical_depth0;
 
-   GLuint cpp; /**< bytes per pixel */
+   GLuint cpp; /**< bytes per pixel (or bytes per block if compressed) */
    GLuint num_samples;
    bool compressed;
 

From 43d5b4db969930f9d85f605c75ef9ffe67e58ad3 Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Thu, 28 May 2015 16:02:34 -0700
Subject: [PATCH 16/82] i965: refactor miptree alignment calculation code

Remove redundant checks and comments by grouping our calculations for
align_w and align_h wherever possible.

v2: reintroduce brw.
    don't include functional changes.
    don't adjust function parameters or create a new function.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/drivers/dri/i965/brw_tex_layout.c | 85 ++++++++--------------
 1 file changed, 30 insertions(+), 55 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index a95ac95f15d..268b995f92e 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -123,12 +123,6 @@ intel_horizontal_texture_alignment_unit(struct brw_context *brw,
       return 16;
 
    /**
-    * From the "Alignment Unit Size" section of various specs, namely:
-    * - Gen3 Spec: "Memory Data Formats" Volume,         Section 1.20.1.4
-    * - i965 and G45 PRMs:             Volume 1,         Section 6.17.3.4.
-    * - Ironlake and Sandybridge PRMs: Volume 1, Part 1, Section 7.18.3.4
-    * - BSpec (for Ivybridge and slight variations in separate stencil)
-    *
     * +----------------------------------------------------------------------+
     * |                                        | alignment unit width  ("i") |
     * | Surface Property                       |-----------------------------|
@@ -146,32 +140,6 @@ intel_horizontal_texture_alignment_unit(struct brw_context *brw,
     * On IVB+, non-special cases can be overridden by setting the SURFACE_STATE
     * "Surface Horizontal Alignment" field to HALIGN_4 or HALIGN_8.
     */
-    if (_mesa_is_format_compressed(mt->format)) {
-       /* The hardware alignment requirements for compressed textures
-        * happen to match the block boundaries.
-        */
-      unsigned int i, j;
-      _mesa_get_format_block_size(mt->format, &i, &j);
-
-      /* On Gen9+ we can pick our own alignment for compressed textures but it
-       * has to be a multiple of the block size. The minimum alignment we can
-       * pick is 4 so we effectively have to align to 4 times the block
-       * size
-       */
-      if (brw->gen >= 9)
-         return i * 4;
-      else
-         return i;
-    }
-
-   if (mt->format == MESA_FORMAT_S_UINT8)
-      return 8;
-
-   if (brw->gen >= 9 && mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) {
-      uint32_t align = tr_mode_horizontal_texture_alignment(brw, mt);
-      /* XY_FAST_COPY_BLT doesn't support horizontal alignment < 32. */
-      return align < 32 ? 32 : align;
-   }
 
    if (brw->gen >= 7 && mt->format == MESA_FORMAT_Z_UNORM16)
       return 8;
@@ -248,12 +216,6 @@ intel_vertical_texture_alignment_unit(struct brw_context *brw,
                                       const struct intel_mipmap_tree *mt)
 {
    /**
-    * From the "Alignment Unit Size" section of various specs, namely:
-    * - Gen3 Spec: "Memory Data Formats" Volume,         Section 1.20.1.4
-    * - i965 and G45 PRMs:             Volume 1,         Section 6.17.3.4.
-    * - Ironlake and Sandybridge PRMs: Volume 1, Part 1, Section 7.18.3.4
-    * - BSpec (for Ivybridge and slight variations in separate stencil)
-    *
     * +----------------------------------------------------------------------+
     * |                                        | alignment unit height ("j") |
     * | Surface Property                       |-----------------------------|
@@ -270,23 +232,6 @@ intel_vertical_texture_alignment_unit(struct brw_context *brw,
     * Where "*" means either VALIGN_2 or VALIGN_4 depending on the setting of
     * the SURFACE_STATE "Surface Vertical Alignment" field.
     */
-    if (_mesa_is_format_compressed(mt->format)) {
-       unsigned int i, j;
-
-       _mesa_get_format_block_size(mt->format, &i, &j);
-
-       /* See comment above for the horizontal alignment */
-       return brw->gen >= 9 ? j * 4 : 4;
-    }
-
-   if (mt->format == MESA_FORMAT_S_UINT8)
-      return brw->gen >= 7 ? 8 : 4;
-
-   if (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) {
-      uint32_t align = tr_mode_vertical_texture_alignment(brw, mt);
-      /* XY_FAST_COPY_BLT doesn't support vertical alignment < 64 */
-      return align < 64 ? 64 : align;
-   }
 
    /* Broadwell only supports VALIGN of 4, 8, and 16.  The BSpec says 4
     * should always be used, except for stencil buffers, which should be 8.
@@ -775,6 +720,13 @@ intel_miptree_set_alignment(struct brw_context *brw,
                             struct intel_mipmap_tree *mt,
                             uint32_t layout_flags)
 {
+   /**
+    * From the "Alignment Unit Size" section of various specs, namely:
+    * - Gen3 Spec: "Memory Data Formats" Volume,         Section 1.20.1.4
+    * - i965 and G45 PRMs:             Volume 1,         Section 6.17.3.4.
+    * - Ironlake and Sandybridge PRMs: Volume 1, Part 1, Section 7.18.3.4
+    * - BSpec (for Ivybridge and slight variations in separate stencil)
+    */
    bool gen6_hiz_or_stencil = false;
 
    if (brw->gen == 6 && mt->array_layout == ALL_SLICES_AT_EACH_LOD) {
@@ -806,6 +758,29 @@ intel_miptree_set_alignment(struct brw_context *brw,
          mt->align_w = 128 / mt->cpp;
          mt->align_h = 32;
       }
+   } else if (mt->compressed) {
+       /* The hardware alignment requirements for compressed textures
+        * happen to match the block boundaries.
+        */
+      _mesa_get_format_block_size(mt->format, &mt->align_w, &mt->align_h);
+
+      /* On Gen9+ we can pick our own alignment for compressed textures but it
+       * has to be a multiple of the block size. The minimum alignment we can
+       * pick is 4 so we effectively have to align to 4 times the block
+       * size
+       */
+      if (brw->gen >= 9) {
+         mt->align_w *= 4;
+         mt->align_h *= 4;
+      }
+   } else if (mt->format == MESA_FORMAT_S_UINT8) {
+      mt->align_w = 8;
+      mt->align_h = brw->gen >= 7 ? 8 : 4;
+   } else if (brw->gen >= 9 && mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) {
+      /* XY_FAST_COPY_BLT doesn't support horizontal alignment < 32 or
+       * vertical alignment < 64. */
+      mt->align_w = MAX2(tr_mode_horizontal_texture_alignment(brw, mt), 32);
+      mt->align_h = MAX2(tr_mode_vertical_texture_alignment(brw, mt), 64);
    } else {
       mt->align_w =
          intel_horizontal_texture_alignment_unit(brw, mt, layout_flags);

From 42ee16176dca797c395592b0245f370ea58ca3a8 Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Wed, 17 Jun 2015 17:14:40 -0700
Subject: [PATCH 17/82] mesa: return bool instead of GLboolean in
 compressedteximage_only_format()

In agreement with the coding style, functions that aren't directly visible
to the GL API should prefer the use of bool over GLboolean.

Suggested-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/main/teximage.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 0535db35f26..8d94903db67 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -1767,7 +1767,7 @@ _mesa_test_proxy_teximage(struct gl_context *ctx, GLenum target, GLint level,
 /**
  * Return true if the format is only valid for glCompressedTexImage.
  */
-static GLboolean
+static bool
 compressedteximage_only_format(const struct gl_context *ctx, GLenum format)
 {
    switch (format) {
@@ -1782,9 +1782,9 @@ compressedteximage_only_format(const struct gl_context *ctx, GLenum format)
    case GL_PALETTE8_R5_G6_B5_OES:
    case GL_PALETTE8_RGBA4_OES:
    case GL_PALETTE8_RGB5_A1_OES:
-      return GL_TRUE;
+      return true;
    default:
-      return GL_FALSE;
+      return false;
    }
 }
 

From 9a759a6ee002203a045e9df1f91823204609fd4a Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Sun, 31 May 2015 13:29:41 -0700
Subject: [PATCH 18/82] swrast: add a new macro, FETCH_COMPRESSED

This patch creates a new macro, FETCH_COMPRESSED - similar in nature
to the other FETCH_* macros. This reduces repetition in the code that
deals with compressed textures.

Reviewed-by: Anuj Phogat <anuj.phogat@gmail.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/swrast/s_texfetch.c | 239 ++++++-----------------------------
 1 file changed, 41 insertions(+), 198 deletions(-)

diff --git a/src/mesa/swrast/s_texfetch.c b/src/mesa/swrast/s_texfetch.c
index 754d982bcd5..acb06e6ae92 100644
--- a/src/mesa/swrast/s_texfetch.c
+++ b/src/mesa/swrast/s_texfetch.c
@@ -116,6 +116,14 @@ static void fetch_null_texelf( const struct swrast_texture_image *texImage,
       NULL                      \
    }
 
+#define FETCH_COMPRESSED(NAME)  \
+   {                            \
+      MESA_FORMAT_ ## NAME,     \
+      fetch_compressed,         \
+      fetch_compressed,         \
+      fetch_compressed          \
+   }
+
 /**
  * Table to map MESA_FORMAT_ to texel fetch/store funcs.
  */
@@ -344,214 +352,49 @@ texfetch_funcs[] =
    FETCH_NULL(RGBX_SINT32),
 
    /* DXT compressed formats */
-   {
-      MESA_FORMAT_RGB_DXT1,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_RGBA_DXT1,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_RGBA_DXT3,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_RGBA_DXT5,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
+   FETCH_COMPRESSED(RGB_DXT1),
+   FETCH_COMPRESSED(RGBA_DXT1),
+   FETCH_COMPRESSED(RGBA_DXT3),
+   FETCH_COMPRESSED(RGBA_DXT5),
 
    /* DXT sRGB compressed formats */
-   {
-      MESA_FORMAT_SRGB_DXT1,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_SRGBA_DXT1,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_SRGBA_DXT3,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_SRGBA_DXT5,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
+   FETCH_COMPRESSED(SRGB_DXT1),
+   FETCH_COMPRESSED(SRGBA_DXT1),
+   FETCH_COMPRESSED(SRGBA_DXT3),
+   FETCH_COMPRESSED(SRGBA_DXT5),
 
    /* FXT1 compressed formats */
-   {
-      MESA_FORMAT_RGB_FXT1,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_RGBA_FXT1,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
+   FETCH_COMPRESSED(RGB_FXT1),
+   FETCH_COMPRESSED(RGBA_FXT1),
 
    /* RGTC compressed formats */
-   {
-      MESA_FORMAT_R_RGTC1_UNORM,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_R_RGTC1_SNORM,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_RG_RGTC2_UNORM,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_RG_RGTC2_SNORM,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
+   FETCH_COMPRESSED(R_RGTC1_UNORM),
+   FETCH_COMPRESSED(R_RGTC1_SNORM),
+   FETCH_COMPRESSED(RG_RGTC2_UNORM),
+   FETCH_COMPRESSED(RG_RGTC2_SNORM),
 
    /* LATC1/2 compressed formats */
-   {
-      MESA_FORMAT_L_LATC1_UNORM,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_L_LATC1_SNORM,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_LA_LATC2_UNORM,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_LA_LATC2_SNORM,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
+   FETCH_COMPRESSED(L_LATC1_UNORM),
+   FETCH_COMPRESSED(L_LATC1_SNORM),
+   FETCH_COMPRESSED(LA_LATC2_UNORM),
+   FETCH_COMPRESSED(LA_LATC2_SNORM),
 
    /* ETC1/2 compressed formats */
-   {
-      MESA_FORMAT_ETC1_RGB8,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_ETC2_RGB8,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_ETC2_SRGB8,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_ETC2_RGBA8_EAC,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_ETC2_R11_EAC,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_ETC2_RG11_EAC,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_ETC2_SIGNED_R11_EAC,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_ETC2_SIGNED_RG11_EAC,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_ETC2_RGB8_PUNCHTHROUGH_ALPHA1,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_BPTC_RGBA_UNORM,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_BPTC_RGB_SIGNED_FLOAT,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
-   {
-      MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT,
-      fetch_compressed,
-      fetch_compressed,
-      fetch_compressed
-   },
+   FETCH_COMPRESSED(ETC1_RGB8),
+   FETCH_COMPRESSED(ETC2_RGB8),
+   FETCH_COMPRESSED(ETC2_SRGB8),
+   FETCH_COMPRESSED(ETC2_RGBA8_EAC),
+   FETCH_COMPRESSED(ETC2_SRGB8_ALPHA8_EAC),
+   FETCH_COMPRESSED(ETC2_R11_EAC),
+   FETCH_COMPRESSED(ETC2_RG11_EAC),
+   FETCH_COMPRESSED(ETC2_SIGNED_R11_EAC),
+   FETCH_COMPRESSED(ETC2_SIGNED_RG11_EAC),
+   FETCH_COMPRESSED(ETC2_RGB8_PUNCHTHROUGH_ALPHA1),
+   FETCH_COMPRESSED(ETC2_SRGB8_PUNCHTHROUGH_ALPHA1),
+   FETCH_COMPRESSED(BPTC_RGBA_UNORM),
+   FETCH_COMPRESSED(BPTC_SRGB_ALPHA_UNORM),
+   FETCH_COMPRESSED(BPTC_RGB_SIGNED_FLOAT),
+   FETCH_COMPRESSED(BPTC_RGB_UNSIGNED_FLOAT),
 
    /* ASTC compressed formats */
    FETCH_NULL(RGBA_ASTC_4x4),

From f4280b740d835d5c62b330e2fd563eeb119005b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Fri, 21 Aug 2015 09:43:27 +0300
Subject: [PATCH 19/82] glapi: add GL_OES_texture_storage_multisample_2d_array
 extension
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mapi/glapi/gen/es_EXT.xml | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index 642e3b319bb..cfca5a980bb 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -798,4 +798,23 @@
     </function>
 </category>
 
+<!-- 174. GL_OES_texture_storage_multisample_2d_array -->
+<category name="GL_OES_texture_storage_multisample_2d_array" number="174">
+    <enum name="TEXTURE_2D_MULTISAMPLE_ARRAY_OES"              value="0x9102"/>
+    <enum name="TEXTURE_BINDING_2D_MULTISAMPLE_ARRAY_OES"      value="0x9105"/>
+    <enum name="SAMPLER_2D_MULTISAMPLE_ARRAY_OES"              value="0x910B"/>
+    <enum name="INT_SAMPLER_2D_MULTISAMPLE_ARRAY_OES"          value="0x910C"/>
+    <enum name="UNSIGNED_INT_SAMPLER_2D_MULTISAMPLE_ARRAY_OES" value="0x910D"/>
+
+    <function name="TexStorage3DMultisampleOES" alias="TexStorage3DMultisample" es2="3.1">
+        <param name="target" type="GLenum"/>
+        <param name="samples" type="GLsizei"/>
+        <param name="internalformat" type="GLenum"/>
+        <param name="width" type="GLsizei"/>
+        <param name="height" type="GLsizei"/>
+        <param name="depth" type="GLsizei"/>
+        <param name="fixedsamplelocations" type="GLboolean"/>
+    </function>
+</category>
+
 </OpenGLAPI>

From b9101b14439836c337abeffafc4b058a8d80d3ef Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Fri, 21 Aug 2015 09:40:11 +0300
Subject: [PATCH 20/82] mesa: Add extension enable for
 OES_texture_storage_multisample_2d_array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: use ARB_texture_multisample bit to enable extension

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/main/extensions.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index 03303ac2650..a57d5baeafd 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -336,6 +336,7 @@ static const struct extension extension_table[] = {
    { "GL_OES_texture_half_float",                  o(OES_texture_half_float),                             ES2, 2005 },
    { "GL_OES_texture_half_float_linear",           o(OES_texture_half_float_linear),                      ES2, 2005 },
    { "GL_OES_texture_mirrored_repeat",             o(dummy_true),                                   ES1,       2005 },
+   { "GL_OES_texture_storage_multisample_2d_array",o(ARB_texture_multisample),                           ES31, 2014 },
    { "GL_OES_texture_npot",                        o(ARB_texture_non_power_of_two),                 ES1 | ES2, 2005 },
    { "GL_OES_vertex_array_object",                 o(dummy_true),                                   ES1 | ES2, 2010 },
 

From c2c64fd26999cedf4b63c754145f7258517f5bce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Fri, 21 Aug 2015 09:42:10 +0300
Subject: [PATCH 21/82] glsl: add support for
 OES_texture_storage_multisample_2d_array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: use ARB_texture_multisample enable bit

Patch adds extension enable bit and enables required keywords
and builtin functions for the extension.

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/glsl/builtin_functions.cpp  | 5 +++--
 src/glsl/builtin_types.cpp      | 3 ++-
 src/glsl/glcpp/glcpp-parse.y    | 2 ++
 src/glsl/glsl_lexer.ll          | 6 +++---
 src/glsl/glsl_parser_extras.cpp | 1 +
 src/glsl/glsl_parser_extras.h   | 2 ++
 6 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
index 1bc3de4aec5..4092d682c69 100644
--- a/src/glsl/builtin_functions.cpp
+++ b/src/glsl/builtin_functions.cpp
@@ -284,8 +284,9 @@ texture_multisample(const _mesa_glsl_parse_state *state)
 static bool
 texture_multisample_array(const _mesa_glsl_parse_state *state)
 {
-   return state->is_version(150, 0) ||
-          state->ARB_texture_multisample_enable;
+   return state->is_version(150, 320) ||
+          state->ARB_texture_multisample_enable ||
+          state->OES_texture_storage_multisample_2d_array_enable;
 }
 
 static bool
diff --git a/src/glsl/builtin_types.cpp b/src/glsl/builtin_types.cpp
index 9cf198fd127..0d0d71d56df 100644
--- a/src/glsl/builtin_types.cpp
+++ b/src/glsl/builtin_types.cpp
@@ -307,7 +307,8 @@ _mesa_glsl_initialize_types(struct _mesa_glsl_parse_state *state)
       add_type(symbols, glsl_type::usamplerCubeArray_type);
    }
 
-   if (state->ARB_texture_multisample_enable) {
+   if (state->ARB_texture_multisample_enable ||
+       state->OES_texture_storage_multisample_2d_array_enable) {
       add_type(symbols, glsl_type::sampler2DMS_type);
       add_type(symbols, glsl_type::isampler2DMS_type);
       add_type(symbols, glsl_type::usampler2DMS_type);
diff --git a/src/glsl/glcpp/glcpp-parse.y b/src/glsl/glcpp/glcpp-parse.y
index 18e50afe476..2d631f08c29 100644
--- a/src/glsl/glcpp/glcpp-parse.y
+++ b/src/glsl/glcpp/glcpp-parse.y
@@ -2382,6 +2382,8 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 	         add_builtin_define(parser, "GL_OES_EGL_image_external", 1);
               if (extensions->OES_standard_derivatives)
                  add_builtin_define(parser, "GL_OES_standard_derivatives", 1);
+              if (extensions->ARB_texture_multisample)
+                 add_builtin_define(parser, "GL_OES_texture_storage_multisample_2d_array", 1);
 	   }
 	} else {
 	   add_builtin_define(parser, "GL_ARB_draw_buffers", 1);
diff --git a/src/glsl/glsl_lexer.ll b/src/glsl/glsl_lexer.ll
index 24998c19467..90e84ed1bfe 100644
--- a/src/glsl/glsl_lexer.ll
+++ b/src/glsl/glsl_lexer.ll
@@ -347,9 +347,9 @@ usampler2DArray		KEYWORD(130, 300, 130, 300, USAMPLER2DARRAY);
 sampler2DMS        KEYWORD_WITH_ALT(150, 300, 150, 310, yyextra->ARB_texture_multisample_enable, SAMPLER2DMS);
 isampler2DMS       KEYWORD_WITH_ALT(150, 300, 150, 310, yyextra->ARB_texture_multisample_enable, ISAMPLER2DMS);
 usampler2DMS       KEYWORD_WITH_ALT(150, 300, 150, 310, yyextra->ARB_texture_multisample_enable, USAMPLER2DMS);
-sampler2DMSArray   KEYWORD_WITH_ALT(150, 300, 150, 0, yyextra->ARB_texture_multisample_enable, SAMPLER2DMSARRAY);
-isampler2DMSArray  KEYWORD_WITH_ALT(150, 300, 150, 0, yyextra->ARB_texture_multisample_enable, ISAMPLER2DMSARRAY);
-usampler2DMSArray  KEYWORD_WITH_ALT(150, 300, 150, 0, yyextra->ARB_texture_multisample_enable, USAMPLER2DMSARRAY);
+sampler2DMSArray   KEYWORD_WITH_ALT(150, 300, 150, 320, yyextra->ARB_texture_multisample_enable || yyextra->OES_texture_storage_multisample_2d_array_enable, SAMPLER2DMSARRAY);
+isampler2DMSArray  KEYWORD_WITH_ALT(150, 300, 150, 320, yyextra->ARB_texture_multisample_enable || yyextra->OES_texture_storage_multisample_2d_array_enable, ISAMPLER2DMSARRAY);
+usampler2DMSArray  KEYWORD_WITH_ALT(150, 300, 150, 320, yyextra->ARB_texture_multisample_enable || yyextra->OES_texture_storage_multisample_2d_array_enable, USAMPLER2DMSARRAY);
 
    /* keywords available with ARB_texture_cube_map_array_enable extension on desktop GLSL */
 samplerCubeArray   KEYWORD_WITH_ALT(400, 0, 400, 0, yyextra->ARB_texture_cube_map_array_enable, SAMPLERCUBEARRAY);
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index 6440a9691fb..939a03cb0d7 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -626,6 +626,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(OES_EGL_image_external,         false, true,      OES_EGL_image_external),
    EXT(OES_standard_derivatives,       false, true,      OES_standard_derivatives),
    EXT(OES_texture_3D,                 false, true,      EXT_texture3D),
+   EXT(OES_texture_storage_multisample_2d_array, false, true, ARB_texture_multisample),
 
    /* All other extensions go here, sorted alphabetically.
     */
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index e2145bea5fa..295cd10ba14 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -548,6 +548,8 @@ struct _mesa_glsl_parse_state {
    bool OES_standard_derivatives_warn;
    bool OES_texture_3D_enable;
    bool OES_texture_3D_warn;
+   bool OES_texture_storage_multisample_2d_array_enable;
+   bool OES_texture_storage_multisample_2d_array_warn;
 
    /* All other extensions go here, sorted alphabetically.
     */

From 16ad1d2a8d9d1d2d816f8b8d70f1134a7d4ba8dc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Tapani=20P=C3=A4lli?= <tapani.palli@intel.com>
Date: Mon, 24 Aug 2015 10:09:52 +0300
Subject: [PATCH 22/82] mesa: enable enums for
 OES_texture_storage_multisample_2d_array
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: use _mesa_is_gles31(ctx) for verifying we are on ES 3.1,
    remove _es31 usage from get_hash_params.py

Signed-off-by: Tapani Pälli <tapani.palli@intel.com>
Reviewed-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/main/get_hash_params.py | 6 +++---
 src/mesa/main/texobj.c           | 3 ++-
 src/mesa/main/texparam.c         | 2 +-
 3 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 73213f407f3..b3c337e9d45 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -434,6 +434,9 @@ descriptor=[
   [ "SAMPLE_MASK", "CONTEXT_BOOL(Multisample.SampleMask), extra_ARB_texture_multisample" ],
   [ "MAX_SAMPLE_MASK_WORDS", "CONST(1), extra_ARB_texture_multisample" ],
 
+# GL_ARB_texture_multisample / ES 3.1 with GL_OES_texture_storage_multisample_2d_array
+  [ "TEXTURE_BINDING_2D_MULTISAMPLE_ARRAY", "LOC_CUSTOM, TYPE_INT, TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX, extra_ARB_texture_multisample" ],
+
 # GL_ARB_texture_gather / GLES 3.1
   [ "MIN_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MinProgramTextureGatherOffset), extra_ARB_texture_gather"],
   [ "MAX_PROGRAM_TEXTURE_GATHER_OFFSET", "CONTEXT_INT(Const.MaxProgramTextureGatherOffset), extra_ARB_texture_gather"],
@@ -740,9 +743,6 @@ descriptor=[
   [ "TEXTURE_BUFFER_FORMAT_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
   [ "TEXTURE_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
 
-# GL_ARB_texture_multisample / GL 3.2
-  [ "TEXTURE_BINDING_2D_MULTISAMPLE_ARRAY", "LOC_CUSTOM, TYPE_INT, TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX, extra_ARB_texture_multisample" ],
-
 # GL 3.0
   [ "CONTEXT_FLAGS", "CONTEXT_INT(Const.ContextFlags), extra_version_30" ],
 
diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index c5d83e145a6..a1be1e33042 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -1612,7 +1612,8 @@ _mesa_tex_target_to_index(const struct gl_context *ctx, GLenum target)
       return ((_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample) ||
               _mesa_is_gles31(ctx)) ? TEXTURE_2D_MULTISAMPLE_INDEX: -1;
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
-      return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample
+      return ((_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample) ||
+              _mesa_is_gles31(ctx))
          ? TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX: -1;
    default:
       return -1;
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index 72d36117498..3f6f8ba2e20 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -1223,6 +1223,7 @@ legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target,
    case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z_ARB:
       return ctx->Extensions.ARB_texture_cube_map;
    case GL_TEXTURE_2D_MULTISAMPLE:
+   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
       return ctx->Extensions.ARB_texture_multisample;
    }
 
@@ -1267,7 +1268,6 @@ legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target,
        * "target may also be TEXTURE_BUFFER, indicating the texture buffer."
        */
       return ctx->API == API_OPENGL_CORE && ctx->Version >= 31;
-   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE:
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE_ARRAY:
       return ctx->Extensions.ARB_texture_multisample;

From 885a9b058cd8f2d49ae675513003eb1164888ad3 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 25 Aug 2015 16:17:14 -0700
Subject: [PATCH 23/82] i965: Rename INTEL_DEBUG=vec4vs to INTEL_DEBUG=vec4.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

driParseDebugString() doesn't have actual code to parse comma separated
lists (or any other supported options?); instead it dumbly uses strstr().

This means that INTEL_DEBUG="vec4vs" will trigger both DEBUG_VEC4VS and
DEBUG_VS, as "vs" is also a substring.

We should probably improve the driconf parsing, but for now, just rename
the option so it's usable in the meantime.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Acked-by: Jason Ekstrand <jason.ekstrand@intel.com>
Acked-by: Kristian Høgsberg <krh@bitplanet.net>
---
 src/mesa/drivers/dri/i965/intel_debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/intel_debug.c b/src/mesa/drivers/dri/i965/intel_debug.c
index a0777310e2a..b3b3c21f491 100644
--- a/src/mesa/drivers/dri/i965/intel_debug.c
+++ b/src/mesa/drivers/dri/i965/intel_debug.c
@@ -68,7 +68,7 @@ static const struct dri_debug_control debug_control[] = {
    { "optimizer",   DEBUG_OPTIMIZER },
    { "ann",         DEBUG_ANNOTATION },
    { "no8",         DEBUG_NO8 },
-   { "vec4vs",      DEBUG_VEC4VS },
+   { "vec4",        DEBUG_VEC4VS },
    { "spill",       DEBUG_SPILL },
    { "cs",          DEBUG_CS },
    { NULL,    0 }

From f2e667172a6382f81d1f3e709f02c7ee6cfda4c7 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 18 Aug 2015 17:40:02 -0700
Subject: [PATCH 24/82] i965/fs: Refactor assign_constant_locations

Now that all constant locations are assigned in a single function, we can
refactor it a bit to unify things.  In particular, we now handle
pull_constant_loc and push_constant_loc more similarly and we only modify
stage_prog_data->param[] in one place at the end of the function.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 90 +++++++++++++---------------
 1 file changed, 42 insertions(+), 48 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 8f2056ee049..a575181a74a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -1784,54 +1784,46 @@ fs_visitor::assign_constant_locations()
    if (dispatch_width != 8)
       return;
 
+   unsigned int num_pull_constants = 0;
+
    pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
    memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
 
-   /* Walk through and find array access of uniforms.  Put a copy of that
-    * uniform in the pull constant buffer.
+   bool is_live[uniforms];
+   memset(is_live, 0, sizeof(is_live));
+
+   /* First, we walk through the instructions and do two things:
+    *
+    *  1) Figure out which uniforms are live.
+    *
+    *  2) Find all indirect access of uniform arrays and flag them as needing
+    *     to go into the pull constant buffer.
     *
     * Note that we don't move constant-indexed accesses to arrays.  No
     * testing has been done of the performance impact of this choice.
     */
    foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
       for (int i = 0 ; i < inst->sources; i++) {
-         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
-            continue;
-
-         int uniform = inst->src[i].reg;
-
-         /* If this array isn't already present in the pull constant buffer,
-          * add it.
-          */
-         if (pull_constant_loc[uniform] == -1) {
-            const gl_constant_value **values = &stage_prog_data->param[uniform];
-
-            assert(param_size[uniform]);
-
-            for (int j = 0; j < param_size[uniform]; j++) {
-               pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
-
-               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
-                  values[j];
-            }
-         }
-      }
-   }
-
-   /* Find which UNIFORM registers are still in use. */
-   bool is_live[uniforms];
-   for (unsigned int i = 0; i < uniforms; i++) {
-      is_live[i] = false;
-   }
-
-   foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      for (int i = 0; i < inst->sources; i++) {
          if (inst->src[i].file != UNIFORM)
             continue;
 
-         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
-         if (constant_nr >= 0 && constant_nr < (int) uniforms)
-            is_live[constant_nr] = true;
+         if (inst->src[i].reladdr) {
+            int uniform = inst->src[i].reg;
+
+            /* If this array isn't already present in the pull constant buffer,
+             * add it.
+             */
+            if (pull_constant_loc[uniform] == -1) {
+               assert(param_size[uniform]);
+               for (int j = 0; j < param_size[uniform]; j++)
+                  pull_constant_loc[uniform + j] = num_pull_constants++;
+            }
+         } else {
+            /* Mark the one accessed uniform as live */
+            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
+            if (constant_nr >= 0 && constant_nr < (int) uniforms)
+               is_live[constant_nr] = true;
+         }
       }
    }
 
@@ -1865,27 +1857,29 @@ fs_visitor::assign_constant_locations()
       } else {
          /* Demote to a pull constant. */
          push_constant_loc[i] = -1;
-
-         int pull_index = stage_prog_data->nr_pull_params++;
-         stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
-         pull_constant_loc[i] = pull_index;
+         pull_constant_loc[i] = num_pull_constants++;
       }
    }
 
    stage_prog_data->nr_params = num_push_constants;
+   stage_prog_data->nr_pull_params = num_pull_constants;
 
    /* Up until now, the param[] array has been indexed by reg + reg_offset
-    * of UNIFORM registers.  Condense it to only contain the uniforms we
-    * chose to upload as push constants.
+    * of UNIFORM registers.  Move pull constants into pull_param[] and
+    * condense param[] to only contain the uniforms we chose to push.
+    *
+    * NOTE: Because we are condensing the param[] array, we know that
+    * push_constant_loc[i] <= i and we can do it in one smooth loop without
+    * having to make a copy.
     */
    for (unsigned int i = 0; i < uniforms; i++) {
-      int remapped = push_constant_loc[i];
+      const gl_constant_value *value = stage_prog_data->param[i];
 
-      if (remapped == -1)
-         continue;
-
-      assert(remapped <= (int)i);
-      stage_prog_data->param[remapped] = stage_prog_data->param[i];
+      if (pull_constant_loc[i] != -1) {
+         stage_prog_data->pull_param[pull_constant_loc[i]] = value;
+      } else if (push_constant_loc[i] != -1) {
+         stage_prog_data->param[push_constant_loc[i]] = value;
+      }
    }
 }
 

From fee0c5af11dd0995de96e7053377d425a66d03a0 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Wed, 19 Aug 2015 14:29:53 -0700
Subject: [PATCH 25/82] i965/fs: Split VGRFs after lowering pull constants

The split_virtual_grfs code doesn't properly rewrite reladdr so we need to
make sure that any uniform indirects are lowered away first.

This fixes the glsl-fs-uniform-indexed-by-swizzled-vec4.shader_test in piglit

Cc: "10.6" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index a575181a74a..81009a09128 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -4780,11 +4780,11 @@ fs_visitor::optimize()
     */
    bld = fs_builder(this, 64);
 
-   split_virtual_grfs();
-
    assign_constant_locations();
    demote_pull_constants();
 
+   split_virtual_grfs();
+
 #define OPT(pass, args...) ({                                           \
       pass_num++;                                                       \
       bool this_progress = pass(args);                                  \

From 3c256f572b0377e5cff23e022aef49c356055514 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 25 Aug 2015 13:25:31 -0600
Subject: [PATCH 26/82] gallium/util: fix code formatting in u_blitter.h

Trivial.
---
 src/gallium/auxiliary/util/u_blitter.h | 55 ++++++++++++--------------
 1 file changed, 25 insertions(+), 30 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h
index becdb029f13..eab48c5f00d 100644
--- a/src/gallium/auxiliary/util/u_blitter.h
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -372,30 +372,28 @@ void util_blitter_custom_resolve_color(struct blitter_context *blitter,
  *
  * States not listed here are not affected by util_blitter. */
 
-static inline
-void util_blitter_save_blend(struct blitter_context *blitter,
-                             void *state)
+static inline void
+util_blitter_save_blend(struct blitter_context *blitter, void *state)
 {
    blitter->saved_blend_state = state;
 }
 
-static inline
-void util_blitter_save_depth_stencil_alpha(struct blitter_context *blitter,
-                                           void *state)
+static inline void
+util_blitter_save_depth_stencil_alpha(struct blitter_context *blitter,
+                                      void *state)
 {
    blitter->saved_dsa_state = state;
 }
 
-static inline
-void util_blitter_save_vertex_elements(struct blitter_context *blitter,
-                                       void *state)
+static inline void
+util_blitter_save_vertex_elements(struct blitter_context *blitter, void *state)
 {
    blitter->saved_velem_state = state;
 }
 
-static inline
-void util_blitter_save_stencil_ref(struct blitter_context *blitter,
-                                   const struct pipe_stencil_ref *state)
+static inline void
+util_blitter_save_stencil_ref(struct blitter_context *blitter,
+                              const struct pipe_stencil_ref *state)
 {
    blitter->saved_stencil_ref = *state;
 }
@@ -407,23 +405,20 @@ void util_blitter_save_rasterizer(struct blitter_context *blitter,
    blitter->saved_rs_state = state;
 }
 
-static inline
-void util_blitter_save_fragment_shader(struct blitter_context *blitter,
-                                       void *fs)
+static inline void
+util_blitter_save_fragment_shader(struct blitter_context *blitter, void *fs)
 {
    blitter->saved_fs = fs;
 }
 
-static inline
-void util_blitter_save_vertex_shader(struct blitter_context *blitter,
-                                     void *vs)
+static inline void
+util_blitter_save_vertex_shader(struct blitter_context *blitter, void *vs)
 {
    blitter->saved_vs = vs;
 }
 
-static inline
-void util_blitter_save_geometry_shader(struct blitter_context *blitter,
-                                       void *gs)
+static inline void
+util_blitter_save_geometry_shader(struct blitter_context *blitter, void *gs)
 {
    blitter->saved_gs = gs;
 }
@@ -442,24 +437,24 @@ util_blitter_save_tesseval_shader(struct blitter_context *blitter,
    blitter->saved_tes = sh;
 }
 
-static inline
-void util_blitter_save_framebuffer(struct blitter_context *blitter,
-                                   const struct pipe_framebuffer_state *state)
+static inline void
+util_blitter_save_framebuffer(struct blitter_context *blitter,
+                              const struct pipe_framebuffer_state *state)
 {
    blitter->saved_fb_state.nr_cbufs = 0; /* It's ~0 now, meaning it's unsaved. */
    util_copy_framebuffer_state(&blitter->saved_fb_state, state);
 }
 
-static inline
-void util_blitter_save_viewport(struct blitter_context *blitter,
-                                struct pipe_viewport_state *state)
+static inline void
+util_blitter_save_viewport(struct blitter_context *blitter,
+                           struct pipe_viewport_state *state)
 {
    blitter->saved_viewport = *state;
 }
 
-static inline
-void util_blitter_save_scissor(struct blitter_context *blitter,
-                               struct pipe_scissor_state *state)
+static inline void
+util_blitter_save_scissor(struct blitter_context *blitter,
+                          struct pipe_scissor_state *state)
 {
    blitter->saved_scissor = *state;
 }

From 42c7be5877665dfcbeb317c7eb163a03d56fe661 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 26 Aug 2015 13:23:47 -0600
Subject: [PATCH 27/82] glsl: fix comment typo: s/filed/field/

---
 src/glsl/glsl_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/glsl/glsl_types.h b/src/glsl/glsl_types.h
index e7c73dac3c3..02a398f6112 100644
--- a/src/glsl/glsl_types.h
+++ b/src/glsl/glsl_types.h
@@ -613,7 +613,7 @@ struct glsl_type {
    const glsl_type *field_type(const char *name) const;
 
    /**
-    * Get the location of a filed within a record type
+    * Get the location of a field within a record type
     */
    int field_index(const char *name) const;
 

From bcae4640c89bc2775d1a85b2b27d2787cac8f843 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 26 Aug 2015 13:58:23 -0600
Subject: [PATCH 28/82] st/mesa: use PROGRAM_ARRAY for storing structs
 containing arrays
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously, we used PROGRAM_ARRAY only for variables which were
arrays or matrices.  But if the variable is a structure containing
an array or matrix, we need to use PROGRAM_ARRAY for that too.

Before, we failed an assertion:
  state_tracker/st_glsl_to_tgsi.cpp:4900:
  Assertion `src_reg->file != PROGRAM_TEMPORARY' failed.
when running the piglit test
glsl-1.20/execution/fs-const-array-of-struct-of-array.shader_test

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 32 ++++++++++++++++++++--
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index cba98819718..695644117ac 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -1123,6 +1123,34 @@ type_size(const struct glsl_type *type)
    return 0;
 }
 
+
+/**
+ * If the given GLSL type is an array or matrix or a structure containing
+ * an array/matrix member, return true.  Else return false.
+ *
+ * This is used to determine which kind of temp storage (PROGRAM_TEMPORARY
+ * or PROGRAM_ARRAY) should be used for variables of this type.  Anytime
+ * we have an array that might be indexed with a variable, we need to use
+ * the latter storage type.
+ */
+static bool
+type_has_array_or_matrix(const glsl_type *type)
+{
+   if (type->is_array() || type->is_matrix())
+      return true;
+
+   if (type->is_record()) {
+      for (unsigned i = 0; i < type->length; i++) {
+         if (type_has_array_or_matrix(type->fields.structure[i].type)) {
+            return true;
+         }
+      }
+   }
+
+   return false;
+}
+
+
 /**
  * In the initial pass of codegen, we assign temporary numbers to
  * intermediate results.  (not SSA -- variable assignments will reuse
@@ -1137,9 +1165,7 @@ glsl_to_tgsi_visitor::get_temp(const glsl_type *type)
    src.reladdr = NULL;
    src.negate = 0;
 
-   if (!options->EmitNoIndirectTemp &&
-       (type->is_array() || type->is_matrix())) {
-
+   if (!options->EmitNoIndirectTemp && type_has_array_or_matrix(type)) {
       if (next_array >= max_num_arrays) {
          max_num_arrays += 32;
          array_sizes = (unsigned*)

From c44d50775209266b5c8bad0ab7a7c4ccd7db14a4 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Mon, 24 Aug 2015 17:30:08 -0700
Subject: [PATCH 29/82] nir: Strengthen "no jumps" assertions in instruction
 insertion API.

Jumps must be the last instruction in a block, so inserting another
instruction after a jump is illegal.

Previously, we only checked this when the new instruction being inserted
was a jump.  This is a red herring - inserting *any* kind of instruction
after a jump is illegal.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Acked-by: Connor Abbott <cwabbott0@gmail.com>
---
 src/glsl/nir/nir.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c
index 77cc4f078a3..ff758f447e8 100644
--- a/src/glsl/nir/nir.c
+++ b/src/glsl/nir/nir.c
@@ -675,9 +675,10 @@ nir_instr_insert_before(nir_instr *instr, nir_instr *before)
 void
 nir_instr_insert_after(nir_instr *instr, nir_instr *after)
 {
+   assert(instr->type != nir_instr_type_jump);
+
    if (after->type == nir_instr_type_jump) {
       assert(instr == nir_block_last_instr(instr->block));
-      assert(instr->type != nir_instr_type_jump);
    }
 
    after->block = instr->block;
@@ -705,10 +706,9 @@ nir_instr_insert_before_block(nir_block *block, nir_instr *before)
 void
 nir_instr_insert_after_block(nir_block *block, nir_instr *after)
 {
-   if (after->type == nir_instr_type_jump) {
-      assert(exec_list_is_empty(&block->instr_list) ||
-             nir_block_last_instr(block)->type != nir_instr_type_jump);
-   }
+   nir_instr *last = nir_block_last_instr(block);
+   assert(last == NULL || last->type != nir_instr_type_jump);
+   (void) last;
 
    after->block = block;
    add_defs_uses(after);

From f90c6b1ce0d96c7dbdd32ab913d5c88913700ba2 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 25 Aug 2015 10:01:31 -0700
Subject: [PATCH 30/82] nir: Move nir_cursor to nir.h.

We want to use this for normal instruction insertion too, not just
control flow.  Generally these functions are going to be extremely
useful when working with NIR, so I want them to be widely available
without having to include a separate file.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Acked-by: Connor Abbott <cwabbott0@gmail.com>
---
 src/glsl/nir/nir.h              | 97 +++++++++++++++++++++++++++++++++
 src/glsl/nir/nir_control_flow.h | 89 ------------------------------
 2 files changed, 97 insertions(+), 89 deletions(-)

diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 40871f73e96..49430cdce68 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1546,6 +1546,101 @@ nir_deref *nir_copy_deref(void *mem_ctx, nir_deref *deref);
 nir_load_const_instr *
 nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref);
 
+/**
+ * NIR Cursors and Instruction Insertion API
+ * @{
+ *
+ * A tiny struct representing a point to insert/extract instructions or
+ * control flow nodes.  Helps reduce the combinatorial explosion of possible
+ * points to insert/extract.
+ *
+ * \sa nir_control_flow.h
+ */
+typedef enum {
+   nir_cursor_before_block,
+   nir_cursor_after_block,
+   nir_cursor_before_instr,
+   nir_cursor_after_instr,
+} nir_cursor_option;
+
+typedef struct {
+   nir_cursor_option option;
+   union {
+      nir_block *block;
+      nir_instr *instr;
+   };
+} nir_cursor;
+
+static inline nir_cursor
+nir_before_block(nir_block *block)
+{
+   nir_cursor cursor;
+   cursor.option = nir_cursor_before_block;
+   cursor.block = block;
+   return cursor;
+}
+
+static inline nir_cursor
+nir_after_block(nir_block *block)
+{
+   nir_cursor cursor;
+   cursor.option = nir_cursor_after_block;
+   cursor.block = block;
+   return cursor;
+}
+
+static inline nir_cursor
+nir_before_instr(nir_instr *instr)
+{
+   nir_cursor cursor;
+   cursor.option = nir_cursor_before_instr;
+   cursor.instr = instr;
+   return cursor;
+}
+
+static inline nir_cursor
+nir_after_instr(nir_instr *instr)
+{
+   nir_cursor cursor;
+   cursor.option = nir_cursor_after_instr;
+   cursor.instr = instr;
+   return cursor;
+}
+
+static inline nir_cursor
+nir_before_cf_node(nir_cf_node *node)
+{
+   if (node->type == nir_cf_node_block)
+      return nir_before_block(nir_cf_node_as_block(node));
+
+   return nir_after_block(nir_cf_node_as_block(nir_cf_node_prev(node)));
+}
+
+static inline nir_cursor
+nir_after_cf_node(nir_cf_node *node)
+{
+   if (node->type == nir_cf_node_block)
+      return nir_after_block(nir_cf_node_as_block(node));
+
+   return nir_before_block(nir_cf_node_as_block(nir_cf_node_next(node)));
+}
+
+static inline nir_cursor
+nir_before_cf_list(struct exec_list *cf_list)
+{
+   nir_cf_node *first_node = exec_node_data(nir_cf_node,
+                                            exec_list_get_head(cf_list), node);
+   return nir_before_cf_node(first_node);
+}
+
+static inline nir_cursor
+nir_after_cf_list(struct exec_list *cf_list)
+{
+   nir_cf_node *last_node = exec_node_data(nir_cf_node,
+                                           exec_list_get_tail(cf_list), node);
+   return nir_after_cf_node(last_node);
+}
+
 void nir_instr_insert_before(nir_instr *instr, nir_instr *before);
 void nir_instr_insert_after(nir_instr *instr, nir_instr *after);
 
@@ -1560,6 +1655,8 @@ void nir_instr_insert_after_cf_list(struct exec_list *list, nir_instr *after);
 
 void nir_instr_remove(nir_instr *instr);
 
+/** @} */
+
 typedef bool (*nir_foreach_ssa_def_cb)(nir_ssa_def *def, void *state);
 typedef bool (*nir_foreach_dest_cb)(nir_dest *dest, void *state);
 typedef bool (*nir_foreach_src_cb)(nir_src *src, void *state);
diff --git a/src/glsl/nir/nir_control_flow.h b/src/glsl/nir/nir_control_flow.h
index 5efd41caadf..b71382fc597 100644
--- a/src/glsl/nir/nir_control_flow.h
+++ b/src/glsl/nir/nir_control_flow.h
@@ -45,95 +45,6 @@ extern "C" {
  *    deleting them.
  */
 
-/* Helper struct for representing a point to extract/insert. Helps reduce the
- * combinatorial explosion of possible points to extract.
- */
-
-typedef enum {
-   nir_cursor_before_block,
-   nir_cursor_after_block,
-   nir_cursor_before_instr,
-   nir_cursor_after_instr,
-} nir_cursor_option;
-
-typedef struct {
-   nir_cursor_option option;
-   union {
-      nir_block *block;
-      nir_instr *instr;
-   };
-} nir_cursor;
-
-static inline nir_cursor
-nir_before_block(nir_block *block)
-{
-   nir_cursor cursor;
-   cursor.option = nir_cursor_before_block;
-   cursor.block = block;
-   return cursor;
-}
-
-static inline nir_cursor
-nir_after_block(nir_block *block)
-{
-   nir_cursor cursor;
-   cursor.option = nir_cursor_after_block;
-   cursor.block = block;
-   return cursor;
-}
-
-static inline nir_cursor
-nir_before_instr(nir_instr *instr)
-{
-   nir_cursor cursor;
-   cursor.option = nir_cursor_before_instr;
-   cursor.instr = instr;
-   return cursor;
-}
-
-static inline nir_cursor
-nir_after_instr(nir_instr *instr)
-{
-   nir_cursor cursor;
-   cursor.option = nir_cursor_after_instr;
-   cursor.instr = instr;
-   return cursor;
-}
-
-static inline nir_cursor
-nir_before_cf_node(nir_cf_node *node)
-{
-   if (node->type == nir_cf_node_block)
-      return nir_before_block(nir_cf_node_as_block(node));
-
-   return nir_after_block(nir_cf_node_as_block(nir_cf_node_prev(node)));
-}
-
-static inline nir_cursor
-nir_after_cf_node(nir_cf_node *node)
-{
-   if (node->type == nir_cf_node_block)
-      return nir_after_block(nir_cf_node_as_block(node));
-
-   return nir_before_block(nir_cf_node_as_block(nir_cf_node_next(node)));
-}
-
-static inline nir_cursor
-nir_before_cf_list(struct exec_list *cf_list)
-{
-   nir_cf_node *first_node = exec_node_data(nir_cf_node,
-                                            exec_list_get_head(cf_list), node);
-   return nir_before_cf_node(first_node);
-}
-
-static inline nir_cursor
-nir_after_cf_list(struct exec_list *cf_list)
-{
-   nir_cf_node *last_node = exec_node_data(nir_cf_node,
-                                           exec_list_get_tail(cf_list), node);
-   return nir_after_cf_node(last_node);
-}
-
 /** Control flow insertion. */
 
 /** puts a control flow node where the cursor is */

From 3e3cb77901c9c9efbf4cf550da80509fe6dbbd9f Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Sun, 9 Aug 2015 18:30:33 -0700
Subject: [PATCH 31/82] nir: Convert the NIR instruction insertion API to use
 cursors.

This patch implements a general nir_instr_insert() function that takes a
nir_cursor for the insertion point.  It then reworks the existing API to
simply be a wrapper around that for compatibility.

This largely involves moving the existing code into a new function.

Suggested by Connor Abbott.

v2: Make the legacy functions static inline in nir.h (requested by
    Connor Abbott).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Acked-by: Connor Abbott <cwabbott0@gmail.com>
---
 src/glsl/nir/nir.c | 131 ++++++++++++++-------------------------------
 src/glsl/nir/nir.h |  59 +++++++++++++++++---
 2 files changed, 91 insertions(+), 99 deletions(-)

diff --git a/src/glsl/nir/nir.c b/src/glsl/nir/nir.c
index ff758f447e8..bf001312121 100644
--- a/src/glsl/nir/nir.c
+++ b/src/glsl/nir/nir.c
@@ -664,102 +664,51 @@ add_defs_uses(nir_instr *instr)
 }
 
 void
-nir_instr_insert_before(nir_instr *instr, nir_instr *before)
+nir_instr_insert(nir_cursor cursor, nir_instr *instr)
 {
-   assert(before->type != nir_instr_type_jump);
-   before->block = instr->block;
-   add_defs_uses(before);
-   exec_node_insert_node_before(&instr->node, &before->node);
-}
+   switch (cursor.option) {
+   case nir_cursor_before_block:
+      /* Only allow inserting jumps into empty blocks. */
+      if (instr->type == nir_instr_type_jump)
+         assert(exec_list_is_empty(&cursor.block->instr_list));
 
-void
-nir_instr_insert_after(nir_instr *instr, nir_instr *after)
-{
-   assert(instr->type != nir_instr_type_jump);
+      instr->block = cursor.block;
+      add_defs_uses(instr);
+      exec_list_push_head(&cursor.block->instr_list, &instr->node);
+      break;
+   case nir_cursor_after_block: {
+      /* Inserting instructions after a jump is illegal. */
+      nir_instr *last = nir_block_last_instr(cursor.block);
+      assert(last == NULL || last->type != nir_instr_type_jump);
+      (void) last;
 
-   if (after->type == nir_instr_type_jump) {
-      assert(instr == nir_block_last_instr(instr->block));
+      instr->block = cursor.block;
+      add_defs_uses(instr);
+      exec_list_push_tail(&cursor.block->instr_list, &instr->node);
+      break;
+   }
+   case nir_cursor_before_instr:
+      assert(instr->type != nir_instr_type_jump);
+      instr->block = cursor.instr->block;
+      add_defs_uses(instr);
+      exec_node_insert_node_before(&cursor.instr->node, &instr->node);
+      break;
+   case nir_cursor_after_instr:
+      /* Inserting instructions after a jump is illegal. */
+      assert(cursor.instr->type != nir_instr_type_jump);
+
+      /* Only allow inserting jumps at the end of the block. */
+      if (instr->type == nir_instr_type_jump)
+         assert(cursor.instr == nir_block_last_instr(cursor.instr->block));
+
+      instr->block = cursor.instr->block;
+      add_defs_uses(instr);
+      exec_node_insert_after(&cursor.instr->node, &instr->node);
+      break;
    }
 
-   after->block = instr->block;
-   add_defs_uses(after);
-   exec_node_insert_after(&instr->node, &after->node);
-
-   if (after->type == nir_instr_type_jump)
-      nir_handle_add_jump(after->block);
-}
-
-void
-nir_instr_insert_before_block(nir_block *block, nir_instr *before)
-{
-   if (before->type == nir_instr_type_jump)
-      assert(exec_list_is_empty(&block->instr_list));
-
-   before->block = block;
-   add_defs_uses(before);
-   exec_list_push_head(&block->instr_list, &before->node);
-
-   if (before->type == nir_instr_type_jump)
-      nir_handle_add_jump(block);
-}
-
-void
-nir_instr_insert_after_block(nir_block *block, nir_instr *after)
-{
-   nir_instr *last = nir_block_last_instr(block);
-   assert(last == NULL || last->type != nir_instr_type_jump);
-   (void) last;
-
-   after->block = block;
-   add_defs_uses(after);
-   exec_list_push_tail(&block->instr_list, &after->node);
-
-   if (after->type == nir_instr_type_jump)
-      nir_handle_add_jump(block);
-}
-
-void
-nir_instr_insert_before_cf(nir_cf_node *node, nir_instr *before)
-{
-   if (node->type == nir_cf_node_block) {
-      nir_instr_insert_before_block(nir_cf_node_as_block(node), before);
-   } else {
-      nir_cf_node *prev = nir_cf_node_prev(node);
-      assert(prev->type == nir_cf_node_block);
-      nir_block *prev_block = nir_cf_node_as_block(prev);
-
-      nir_instr_insert_before_block(prev_block, before);
-   }
-}
-
-void
-nir_instr_insert_after_cf(nir_cf_node *node, nir_instr *after)
-{
-   if (node->type == nir_cf_node_block) {
-      nir_instr_insert_after_block(nir_cf_node_as_block(node), after);
-   } else {
-      nir_cf_node *next = nir_cf_node_next(node);
-      assert(next->type == nir_cf_node_block);
-      nir_block *next_block = nir_cf_node_as_block(next);
-
-      nir_instr_insert_before_block(next_block, after);
-   }
-}
-
-void
-nir_instr_insert_before_cf_list(struct exec_list *list, nir_instr *before)
-{
-   nir_cf_node *first_node = exec_node_data(nir_cf_node,
-                                            exec_list_get_head(list), node);
-   nir_instr_insert_before_cf(first_node, before);
-}
-
-void
-nir_instr_insert_after_cf_list(struct exec_list *list, nir_instr *after)
-{
-   nir_cf_node *last_node = exec_node_data(nir_cf_node,
-                                           exec_list_get_tail(list), node);
-   nir_instr_insert_after_cf(last_node, after);
+   if (instr->type == nir_instr_type_jump)
+      nir_handle_add_jump(instr->block);
 }
 
 static bool
diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
index 49430cdce68..9703372fcc0 100644
--- a/src/glsl/nir/nir.h
+++ b/src/glsl/nir/nir.h
@@ -1641,17 +1641,60 @@ nir_after_cf_list(struct exec_list *cf_list)
    return nir_after_cf_node(last_node);
 }
 
-void nir_instr_insert_before(nir_instr *instr, nir_instr *before);
-void nir_instr_insert_after(nir_instr *instr, nir_instr *after);
+/**
+ * Insert a NIR instruction at the given cursor.
+ *
+ * Note: This does not update the cursor.
+ */
+void nir_instr_insert(nir_cursor cursor, nir_instr *instr);
 
-void nir_instr_insert_before_block(nir_block *block, nir_instr *before);
-void nir_instr_insert_after_block(nir_block *block, nir_instr *after);
+static inline void
+nir_instr_insert_before(nir_instr *instr, nir_instr *before)
+{
+   nir_instr_insert(nir_before_instr(instr), before);
+}
 
-void nir_instr_insert_before_cf(nir_cf_node *node, nir_instr *before);
-void nir_instr_insert_after_cf(nir_cf_node *node, nir_instr *after);
+static inline void
+nir_instr_insert_after(nir_instr *instr, nir_instr *after)
+{
+   nir_instr_insert(nir_after_instr(instr), after);
+}
 
-void nir_instr_insert_before_cf_list(struct exec_list *list, nir_instr *before);
-void nir_instr_insert_after_cf_list(struct exec_list *list, nir_instr *after);
+static inline void
+nir_instr_insert_before_block(nir_block *block, nir_instr *before)
+{
+   nir_instr_insert(nir_before_block(block), before);
+}
+
+static inline void
+nir_instr_insert_after_block(nir_block *block, nir_instr *after)
+{
+   nir_instr_insert(nir_after_block(block), after);
+}
+
+static inline void
+nir_instr_insert_before_cf(nir_cf_node *node, nir_instr *before)
+{
+   nir_instr_insert(nir_before_cf_node(node), before);
+}
+
+static inline void
+nir_instr_insert_after_cf(nir_cf_node *node, nir_instr *after)
+{
+   nir_instr_insert(nir_after_cf_node(node), after);
+}
+
+static inline void
+nir_instr_insert_before_cf_list(struct exec_list *list, nir_instr *before)
+{
+   nir_instr_insert(nir_before_cf_list(list), before);
+}
+
+static inline void
+nir_instr_insert_after_cf_list(struct exec_list *list, nir_instr *after)
+{
+   nir_instr_insert(nir_after_cf_list(list), after);
+}
 
 void nir_instr_remove(nir_instr *instr);
 

From 0a913a9d85f2eb772be6a133965c5b8a4aa3c800 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 6 Aug 2015 07:16:07 -0700
Subject: [PATCH 32/82] nir: Convert the builder to use the new NIR cursor API.

The NIR cursor API is exactly what we want for the builder's insertion
point.  This simplifies the API, the implementation, and is actually
more flexible as well.

This required a bit of reworking of TGSI->NIR's if/loop stack handling;
we now store cursors instead of cf_node_lists, for better or worse.

v2: Actually move the cursor in the after_instr case.
v3: Take advantage of nir_instr_insert (suggested by Connor).
v4: vc4 build fixes (thanks to Eric).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eric Anholt <eric@anholt.net> [v1]
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com> [v4]
Acked-by: Connor Abbott <cwabbott0@gmail.com> [v4]
---
 src/gallium/auxiliary/nir/tgsi_to_nir.c       | 34 ++++++-------
 .../freedreno/ir3/ir3_nir_lower_if_else.c     |  2 +-
 src/gallium/drivers/vc4/vc4_nir_lower_blend.c |  2 +-
 src/gallium/drivers/vc4/vc4_nir_lower_io.c    |  6 +--
 src/glsl/nir/nir_builder.h                    | 51 +++++--------------
 src/glsl/nir/nir_lower_idiv.c                 |  2 +-
 src/glsl/nir/nir_lower_io.c                   |  2 +-
 src/glsl/nir/nir_lower_load_const_to_scalar.c |  2 +-
 src/glsl/nir/nir_lower_tex_projector.c        |  2 +-
 src/glsl/nir/nir_normalize_cubemap_coords.c   |  2 +-
 src/mesa/program/prog_to_nir.c                |  2 +-
 11 files changed, 42 insertions(+), 65 deletions(-)

diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 278d5e9bf5b..db50734efd5 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -65,24 +65,24 @@ struct ttn_compile {
    nir_register *addr_reg;
 
    /**
-    * Stack of cf_node_lists where instructions should be pushed as we pop
+    * Stack of nir_cursors where instructions should be pushed as we pop
     * back out of the control flow stack.
     *
     * For each IF/ELSE/ENDIF block, if_stack[if_stack_pos] has where the else
     * instructions should be placed, and if_stack[if_stack_pos - 1] has where
     * the next instructions outside of the if/then/else block go.
     */
-   struct exec_list **if_stack;
+   nir_cursor *if_stack;
    unsigned if_stack_pos;
 
    /**
-    * Stack of cf_node_lists where instructions should be pushed as we pop
+    * Stack of nir_cursors where instructions should be pushed as we pop
     * back out of the control flow stack.
     *
     * loop_stack[loop_stack_pos - 1] contains the cf_node_list for the outside
     * of the loop.
     */
-   struct exec_list **loop_stack;
+   nir_cursor *loop_stack;
    unsigned loop_stack_pos;
 
    /* How many TGSI_FILE_IMMEDIATE vec4s have been parsed so far. */
@@ -922,7 +922,7 @@ ttn_if(struct ttn_compile *c, nir_ssa_def *src, bool is_uint)
    nir_builder *b = &c->build;
 
    /* Save the outside-of-the-if-statement node list. */
-   c->if_stack[c->if_stack_pos] = b->cf_node_list;
+   c->if_stack[c->if_stack_pos] = b->cursor;
    c->if_stack_pos++;
 
    src = ttn_channel(b, src, X);
@@ -933,11 +933,11 @@ ttn_if(struct ttn_compile *c, nir_ssa_def *src, bool is_uint)
    } else {
       if_stmt->condition = nir_src_for_ssa(nir_fne(b, src, nir_imm_int(b, 0)));
    }
-   nir_cf_node_insert_end(b->cf_node_list, &if_stmt->cf_node);
+   nir_builder_cf_insert(b, &if_stmt->cf_node);
 
-   nir_builder_insert_after_cf_list(b, &if_stmt->then_list);
+   b->cursor = nir_after_cf_list(&if_stmt->then_list);
 
-   c->if_stack[c->if_stack_pos] = &if_stmt->else_list;
+   c->if_stack[c->if_stack_pos] = nir_after_cf_list(&if_stmt->else_list);
    c->if_stack_pos++;
 }
 
@@ -946,7 +946,7 @@ ttn_else(struct ttn_compile *c)
 {
    nir_builder *b = &c->build;
 
-   nir_builder_insert_after_cf_list(b, c->if_stack[c->if_stack_pos - 1]);
+   b->cursor = c->if_stack[c->if_stack_pos - 1];
 }
 
 static void
@@ -955,7 +955,7 @@ ttn_endif(struct ttn_compile *c)
    nir_builder *b = &c->build;
 
    c->if_stack_pos -= 2;
-   nir_builder_insert_after_cf_list(b, c->if_stack[c->if_stack_pos]);
+   b->cursor = c->if_stack[c->if_stack_pos];
 }
 
 static void
@@ -964,13 +964,13 @@ ttn_bgnloop(struct ttn_compile *c)
    nir_builder *b = &c->build;
 
    /* Save the outside-of-the-loop node list. */
-   c->loop_stack[c->loop_stack_pos] = b->cf_node_list;
+   c->loop_stack[c->loop_stack_pos] = b->cursor;
    c->loop_stack_pos++;
 
    nir_loop *loop = nir_loop_create(b->shader);
-   nir_cf_node_insert_end(b->cf_node_list, &loop->cf_node);
+   nir_builder_cf_insert(b, &loop->cf_node);
 
-   nir_builder_insert_after_cf_list(b, &loop->body);
+   b->cursor = nir_after_cf_list(&loop->body);
 }
 
 static void
@@ -993,7 +993,7 @@ ttn_endloop(struct ttn_compile *c)
    nir_builder *b = &c->build;
 
    c->loop_stack_pos--;
-   nir_builder_insert_after_cf_list(b, c->loop_stack[c->loop_stack_pos]);
+   b->cursor = c->loop_stack[c->loop_stack_pos];
 }
 
 static void
@@ -1803,7 +1803,7 @@ tgsi_to_nir(const void *tgsi_tokens,
    nir_function_impl *impl = nir_function_impl_create(overload);
 
    nir_builder_init(&c->build, impl);
-   nir_builder_insert_after_cf_list(&c->build, &impl->body);
+   c->build.cursor = nir_after_cf_list(&impl->body);
 
    s->num_inputs = scan.file_max[TGSI_FILE_INPUT] + 1;
    s->num_uniforms = scan.const_file_max[0] + 1;
@@ -1819,10 +1819,10 @@ tgsi_to_nir(const void *tgsi_tokens,
    c->num_samp_types = scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
    c->samp_types = rzalloc_array(c, nir_alu_type, c->num_samp_types);
 
-   c->if_stack = rzalloc_array(c, struct exec_list *,
+   c->if_stack = rzalloc_array(c, nir_cursor,
                                (scan.opcode_count[TGSI_OPCODE_IF] +
                                 scan.opcode_count[TGSI_OPCODE_UIF]) * 2);
-   c->loop_stack = rzalloc_array(c, struct exec_list *,
+   c->loop_stack = rzalloc_array(c, nir_cursor,
                                  scan.opcode_count[TGSI_OPCODE_BGNLOOP]);
 
    ret = tgsi_parse_init(&parser, tgsi_tokens);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
index bed7b7b826a..d57eb2ba713 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
@@ -172,7 +172,7 @@ flatten_block(nir_builder *bld, nir_block *if_block, nir_block *prev_block,
 					(intr->intrinsic == nir_intrinsic_discard_if)) {
 				nir_ssa_def *discard_cond;
 
-				nir_builder_insert_after_instr(bld,
+				bld->cursor = nir_after_instr(
 						nir_block_last_instr(prev_block));
 
 				if (invert) {
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
index a372a6c0cdc..808cbea8fde 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_blend.c
@@ -409,7 +409,7 @@ vc4_nir_lower_blend_block(nir_block *block, void *state)
                         nir_cf_node_get_function(&block->cf_node);
                 nir_builder b;
                 nir_builder_init(&b, impl);
-                nir_builder_insert_before_instr(&b, &intr->instr);
+                b.cursor = nir_before_instr(&intr->instr);
                 vc4_nir_lower_blend_instr(c, &b, intr);
         }
         return true;
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_io.c b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
index 229d41147d8..b632370cbb2 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_io.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_io.c
@@ -56,7 +56,7 @@ static void
 vc4_nir_lower_input(struct vc4_compile *c, nir_builder *b,
                     nir_intrinsic_instr *intr)
 {
-        nir_builder_insert_before_instr(b, &intr->instr);
+        b->cursor = nir_before_instr(&intr->instr);
 
         if (c->stage == QSTAGE_FRAG && intr->const_index[0] ==
             VC4_NIR_TLB_COLOR_READ_INPUT) {
@@ -160,7 +160,7 @@ vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
         /* All TGSI-to-NIR outputs are VEC4. */
         assert(intr->num_components == 4);
 
-        nir_builder_insert_before_instr(b, &intr->instr);
+        b->cursor = nir_before_instr(&intr->instr);
 
         for (unsigned i = 0; i < intr->num_components; i++) {
                 nir_intrinsic_instr *intr_comp =
@@ -189,7 +189,7 @@ vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b,
                 return;
         assert(intr->num_components == 4);
 
-        nir_builder_insert_before_instr(b, &intr->instr);
+        b->cursor = nir_before_instr(&intr->instr);
 
         /* Generate scalar loads equivalent to the original VEC4. */
         nir_ssa_def *dests[4];
diff --git a/src/glsl/nir/nir_builder.h b/src/glsl/nir/nir_builder.h
index 9223e838095..08b40f8ea7c 100644
--- a/src/glsl/nir/nir_builder.h
+++ b/src/glsl/nir/nir_builder.h
@@ -24,12 +24,12 @@
 #ifndef NIR_BUILDER_H
 #define NIR_BUILDER_H
 
+#include "nir_control_flow.h"
+
 struct exec_list;
 
 typedef struct nir_builder {
-   struct exec_list *cf_node_list;
-   nir_instr *before_instr;
-   nir_instr *after_instr;
+   nir_cursor cursor;
 
    nir_shader *shader;
    nir_function_impl *impl;
@@ -43,43 +43,20 @@ nir_builder_init(nir_builder *build, nir_function_impl *impl)
    build->shader = impl->overload->function->shader;
 }
 
-static inline void
-nir_builder_insert_after_cf_list(nir_builder *build,
-                                 struct exec_list *cf_node_list)
-{
-   build->cf_node_list = cf_node_list;
-   build->before_instr = NULL;
-   build->after_instr = NULL;
-}
-
-static inline void
-nir_builder_insert_before_instr(nir_builder *build, nir_instr *before_instr)
-{
-   build->cf_node_list = NULL;
-   build->before_instr = before_instr;
-   build->after_instr = NULL;
-}
-
-static inline void
-nir_builder_insert_after_instr(nir_builder *build, nir_instr *after_instr)
-{
-   build->cf_node_list = NULL;
-   build->before_instr = NULL;
-   build->after_instr = after_instr;
-}
-
 static inline void
 nir_builder_instr_insert(nir_builder *build, nir_instr *instr)
 {
-   if (build->cf_node_list) {
-      nir_instr_insert_after_cf_list(build->cf_node_list, instr);
-   } else if (build->before_instr) {
-      nir_instr_insert_before(build->before_instr, instr);
-   } else {
-      assert(build->after_instr);
-      nir_instr_insert_after(build->after_instr, instr);
-      build->after_instr = instr;
-   }
+   nir_instr_insert(build->cursor, instr);
+
+   /* Move the cursor forward. */
+   if (build->cursor.option == nir_cursor_after_instr)
+      build->cursor.instr = instr;
+}
+
+static inline void
+nir_builder_cf_insert(nir_builder *build, nir_cf_node *cf)
+{
+   nir_cf_node_insert(build->cursor, cf);
 }
 
 static inline nir_ssa_def *
diff --git a/src/glsl/nir/nir_lower_idiv.c b/src/glsl/nir/nir_lower_idiv.c
index 7b680320783..0e1653dd274 100644
--- a/src/glsl/nir/nir_lower_idiv.c
+++ b/src/glsl/nir/nir_lower_idiv.c
@@ -50,7 +50,7 @@ convert_instr(nir_builder *bld, nir_alu_instr *alu)
 
    is_signed = (op == nir_op_idiv);
 
-   nir_builder_insert_before_instr(bld, &alu->instr);
+   bld->cursor = nir_before_instr(&alu->instr);
 
    numer = nir_ssa_for_src(bld, alu->src[0].src,
                            nir_ssa_alu_instr_src_components(alu, 0));
diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c
index c9697e7845e..afb463040cc 100644
--- a/src/glsl/nir/nir_lower_io.c
+++ b/src/glsl/nir/nir_lower_io.c
@@ -84,7 +84,7 @@ get_io_offset(nir_deref_var *deref, nir_instr *instr, nir_src *indirect,
    unsigned base_offset = 0;
 
    nir_builder *b = &state->builder;
-   nir_builder_insert_before_instr(b, instr);
+   b->cursor = nir_before_instr(instr);
 
    nir_deref *tail = &deref->deref;
    while (tail->child != NULL) {
diff --git a/src/glsl/nir/nir_lower_load_const_to_scalar.c b/src/glsl/nir/nir_lower_load_const_to_scalar.c
index a90e5245898..b83ef052ea9 100644
--- a/src/glsl/nir/nir_lower_load_const_to_scalar.c
+++ b/src/glsl/nir/nir_lower_load_const_to_scalar.c
@@ -43,7 +43,7 @@ lower_load_const_instr_scalar(nir_load_const_instr *lower)
 
    nir_builder b;
    nir_builder_init(&b, nir_cf_node_get_function(&lower->instr.block->cf_node));
-   nir_builder_insert_before_instr(&b, &lower->instr);
+   b.cursor = nir_before_instr(&lower->instr);
 
    /* Emit the individual loads. */
    nir_ssa_def *loads[4];
diff --git a/src/glsl/nir/nir_lower_tex_projector.c b/src/glsl/nir/nir_lower_tex_projector.c
index 357131cd728..8a482b182a9 100644
--- a/src/glsl/nir/nir_lower_tex_projector.c
+++ b/src/glsl/nir/nir_lower_tex_projector.c
@@ -46,7 +46,7 @@ nir_lower_tex_projector_block(nir_block *block, void *void_state)
          continue;
 
       nir_tex_instr *tex = nir_instr_as_tex(instr);
-      nir_builder_insert_before_instr(b, &tex->instr);
+      b->cursor = nir_before_instr(&tex->instr);
 
       /* Find the projector in the srcs list, if present. */
       int proj_index;
diff --git a/src/glsl/nir/nir_normalize_cubemap_coords.c b/src/glsl/nir/nir_normalize_cubemap_coords.c
index 0da8447aca1..75b647f96cb 100644
--- a/src/glsl/nir/nir_normalize_cubemap_coords.c
+++ b/src/glsl/nir/nir_normalize_cubemap_coords.c
@@ -52,7 +52,7 @@ normalize_cubemap_coords_block(nir_block *block, void *void_state)
       if (tex->sampler_dim != GLSL_SAMPLER_DIM_CUBE)
          continue;
 
-      nir_builder_insert_before_instr(b, &tex->instr);
+      b->cursor = nir_before_instr(&tex->instr);
 
       for (unsigned i = 0; i < tex->num_srcs; i++) {
          if (tex->src[i].src_type != nir_tex_src_coord)
diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index d96b7bc8782..fccd16fc8c0 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -1108,7 +1108,7 @@ prog_to_nir(const struct gl_program *prog,
 
    c->build.shader = s;
    c->build.impl = impl;
-   nir_builder_insert_after_cf_list(&c->build, &impl->body);
+   c->build.cursor = nir_after_cf_list(&impl->body);
 
    setup_registers_and_variables(c);
    if (unlikely(c->error))

From 2259b111003f2e8c55cae42677ec45345fb1b6e3 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 27 Aug 2015 15:28:24 -0400
Subject: [PATCH 33/82] mesa: only copy the requested teximage faces

Cube maps are special in that they have separate teximages for each
face. We handled that by copying the data to them separately, but in
case zoffset != 0 or depth != 6 we would read off the end of the client
array or modify the wrong images.

zoffset/depth have already been verified by the time the code gets to
this stage, so no need to double-check.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Brian Paul <brianp@vmware.com>
Cc: "10.6 11.0" <mesa-stable@lists.freedesktop.org>
---
 src/mesa/main/teximage.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 8d94903db67..ee4b6105064 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -3805,12 +3805,12 @@ texturesubimage(struct gl_context *ctx, GLuint dims,
       rowStride = _mesa_image_image_stride(&ctx->Unpack, width, height,
                                            format, type);
       /* Copy in each face. */
-      for (i = 0; i < 6; ++i) {
+      for (i = zoffset; i < zoffset + depth; ++i) {
          texImage = texObj->Image[i][level];
          assert(texImage);
 
          _mesa_texture_sub_image(ctx, 3, texObj, texImage, texObj->Target,
-                                 level, xoffset, yoffset, zoffset,
+                                 level, xoffset, yoffset, 0,
                                  width, height, 1, format,
                                  type, pixels, true);
          pixels = (GLubyte *) pixels + rowStride;

From 52f748792393c681f35025be7d843e6426fa327d Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 27 Aug 2015 14:33:40 -0600
Subject: [PATCH 34/82] mesa: rename rowStride to imageStride in
 texturesubimage()

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/main/teximage.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index ee4b6105064..bfb0858b9bb 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -3764,7 +3764,7 @@ texturesubimage(struct gl_context *ctx, GLuint dims,
 
    /* Must handle special case GL_TEXTURE_CUBE_MAP. */
    if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
-      GLint rowStride;
+      GLint imageStride;
 
       /*
        * What do we do if the user created a texture with the following code
@@ -3802,8 +3802,8 @@ texturesubimage(struct gl_context *ctx, GLuint dims,
          return;
       }
 
-      rowStride = _mesa_image_image_stride(&ctx->Unpack, width, height,
-                                           format, type);
+      imageStride = _mesa_image_image_stride(&ctx->Unpack, width, height,
+                                             format, type);
       /* Copy in each face. */
       for (i = zoffset; i < zoffset + depth; ++i) {
          texImage = texObj->Image[i][level];
@@ -3813,7 +3813,7 @@ texturesubimage(struct gl_context *ctx, GLuint dims,
                                  level, xoffset, yoffset, 0,
                                  width, height, 1, format,
                                  type, pixels, true);
-         pixels = (GLubyte *) pixels + rowStride;
+         pixels = (GLubyte *) pixels + imageStride;
       }
    }
    else {

From 36f1999a87258603b6720d55e6020d5d24c215c9 Mon Sep 17 00:00:00 2001
From: Glenn Kennard <glenn.kennard@gmail.com>
Date: Thu, 27 Aug 2015 19:04:15 +0200
Subject: [PATCH 35/82] r600g/sb: Handle undef in read port tracker

e8e443 missed adding the check for undef values to the
unreserve function as well, leading to an assert triggering.

Signed-off-by: Glenn Kennard <glenn.kennard@gmail.com>
Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/sb/sb_sched.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp
index 62680788c5e..c98b8fff764 100644
--- a/src/gallium/drivers/r600/sb/sb_sched.cpp
+++ b/src/gallium/drivers/r600/sb/sb_sched.cpp
@@ -236,7 +236,7 @@ void rp_gpr_tracker::unreserve(alu_node* n) {
 
 	for (i = 0; i < nsrc; ++i) {
 		value *v = n->src[i];
-		if (v->is_readonly())
+		if (v->is_readonly() || v->is_undef())
 			continue;
 		if (i == 1 && opt)
 			continue;

From a830225adbb77073272961df409885cca6b861ee Mon Sep 17 00:00:00 2001
From: Glenn Kennard <glenn.kennard@gmail.com>
Date: Thu, 27 Aug 2015 19:04:16 +0200
Subject: [PATCH 36/82] r600g/sb: Don't read junk after EOP

Shaders that contain instruction data after an instruction with EOP could end
up parsing that as an instruction, leading to various crashes and asserts in
SB as it gets very confused if it sees for instance a loop start instruction
jumping off to some random point.

Add a couple of asserts, and print EOP bit if set in old asm printer.

Signed-off-by: Glenn Kennard <glenn.kennard@gmail.com>
Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/r600_asm.c           | 2 ++
 src/gallium/drivers/r600/sb/sb_bc_decoder.cpp | 1 +
 src/gallium/drivers/r600/sb/sb_bc_parser.cpp  | 4 +++-
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 762cc7fac44..b514c58f9d8 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -2029,6 +2029,8 @@ void r600_bytecode_disasm(struct r600_bytecode *bc)
 					fprintf(stderr, "CND:%X ", cf->cond);
 				if (cf->pop_count)
 					fprintf(stderr, "POP:%X ", cf->pop_count);
+				if (cf->end_of_program)
+					fprintf(stderr, "EOP ");
 				fprintf(stderr, "\n");
 			}
 		}
diff --git a/src/gallium/drivers/r600/sb/sb_bc_decoder.cpp b/src/gallium/drivers/r600/sb/sb_bc_decoder.cpp
index 5e233f982ea..5fe8f50aa4c 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_decoder.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_decoder.cpp
@@ -32,6 +32,7 @@ int bc_decoder::decode_cf(unsigned &i, bc_cf& bc) {
 	int r = 0;
 	uint32_t dw0 = dw[i];
 	uint32_t dw1 = dw[i+1];
+	assert(i+1 <= ndw);
 
 	if ((dw1 >> 29) & 1) { // CF_ALU
 		return decode_cf_alu(i, bc);
diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
index 4879c036f9f..748aae29eeb 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
@@ -95,7 +95,7 @@ int bc_parser::decode_shader() {
 		if ((r = decode_cf(i, eop)))
 			return r;
 
-	} while (!eop || (i >> 1) <= max_cf);
+	} while (!eop || (i >> 1) < max_cf);
 
 	return 0;
 }
@@ -769,6 +769,7 @@ int bc_parser::prepare_ir() {
 }
 
 int bc_parser::prepare_loop(cf_node* c) {
+	assert(c->bc.addr-1 < cf_map.size());
 
 	cf_node *end = cf_map[c->bc.addr - 1];
 	assert(end->bc.op == CF_OP_LOOP_END);
@@ -788,6 +789,7 @@ int bc_parser::prepare_loop(cf_node* c) {
 }
 
 int bc_parser::prepare_if(cf_node* c) {
+	assert(c->bc.addr-1 < cf_map.size());
 	cf_node *c_else = NULL, *end = cf_map[c->bc.addr];
 
 	BCP_DUMP(

From 608c7b4a63d5818f7ae0b3d48496b02cf8458d9b Mon Sep 17 00:00:00 2001
From: Glenn Kennard <glenn.kennard@gmail.com>
Date: Thu, 27 Aug 2015 19:04:17 +0200
Subject: [PATCH 37/82] r600g/sb: Don't crash on empty if jump target

Signed-off-by: Glenn Kennard <glenn.kennard@gmail.com>
Cc: <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/sb/sb_bc_parser.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
index 748aae29eeb..c4799270d9f 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp
@@ -792,6 +792,9 @@ int bc_parser::prepare_if(cf_node* c) {
 	assert(c->bc.addr-1 < cf_map.size());
 	cf_node *c_else = NULL, *end = cf_map[c->bc.addr];
 
+	if (!end)
+		return 0; // not quite sure how this happens, malformed input?
+
 	BCP_DUMP(
 		sblog << "parsing JUMP @" << c->bc.id;
 		sblog << "\n";
@@ -817,7 +820,7 @@ int bc_parser::prepare_if(cf_node* c) {
 	if (c_else->parent != c->parent)
 		c_else = NULL;
 
-	if (end->parent != c->parent)
+	if (end && end->parent != c->parent)
 		end = NULL;
 
 	region_node *reg = sh->create_region();

From 4a6a47ed056f37544083048287fec96c88e0e386 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 12 Aug 2015 11:55:53 -0400
Subject: [PATCH 38/82] glsl: clean up textureSize prototype

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Timothy Arceri <t_arceri@yahoo.com.au>
---
 src/glsl/builtin_functions.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
index 4092d682c69..5e051996758 100644
--- a/src/glsl/builtin_functions.cpp
+++ b/src/glsl/builtin_functions.cpp
@@ -666,10 +666,7 @@ private:
    B1(any);
    B1(all);
    B1(not);
-   B2(textureSize);
-   ir_function_signature *_textureSize(builtin_available_predicate avail,
-                                       const glsl_type *return_type,
-                                       const glsl_type *sampler_type);
+   BA2(textureSize);
 
 /** Flags to _texture() */
 #define TEX_PROJECT 1

From 559b8842fa2d315f009a039e7bbc503c524de894 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 25 Sep 2014 11:49:48 -0700
Subject: [PATCH 39/82] glapi: Remove _x86_64_get_get_dispatch symbol from
 x86-64 assembly.

Never used.

Reviewed-by: Mark Janes <mark.a.janes@intel.com>
---
 src/mapi/glapi/gen/gl_x86-64_asm.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/mapi/glapi/gen/gl_x86-64_asm.py b/src/mapi/glapi/gen/gl_x86-64_asm.py
index cf42371f8c3..dcd113e268a 100644
--- a/src/mapi/glapi/gen/gl_x86-64_asm.py
+++ b/src/mapi/glapi/gen/gl_x86-64_asm.py
@@ -144,12 +144,6 @@ class PrintGenericStubs(gl_XML.gl_print_base):
         print ''
         print '#ifdef GLX_USE_TLS'
         print ''
-        print '\t.globl _x86_64_get_get_dispatch; HIDDEN(_x86_64_get_get_dispatch)'
-        print '_x86_64_get_get_dispatch:'
-        print '\tlea\t_x86_64_get_dispatch(%rip), %rax'
-        print '\tret'
-        print ''
-        print '\t.p2align\t4,,15'
         print '_x86_64_get_dispatch:'
         print '\tmovq\t_glapi_tls_Dispatch@GOTTPOFF(%rip), %rax'
         print '\tmovq\t%fs:(%rax), %rax'

From 86c57ebe0ed1acc98545746058862db7429412da Mon Sep 17 00:00:00 2001
From: Boyan Ding <boyan.j.ding@gmail.com>
Date: Fri, 21 Aug 2015 21:42:45 +0800
Subject: [PATCH 40/82] i965/nir: Make use of nir_opt_undef

Shader-db result on Ivy Bridge:
total instructions in shared programs: 145484 -> 145445 (-0.03%)
instructions in affected programs:     225 -> 186 (-17.33%)
helped:                                5
HURT:                                  0

Reviewed-by: Matt Turner <mattst88@gmail.com>
Reviewed-by: Thomas Helland <thomashelland90@gmail.com>
Signed-off-by: Boyan Ding <boyan.j.ding@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_nir.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 8c6d28a7cd8..247b223f2e2 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -63,6 +63,8 @@ nir_optimize(nir_shader *nir, bool is_scalar)
       nir_validate_shader(nir);
       progress |= nir_opt_remove_phis(nir);
       nir_validate_shader(nir);
+      progress |= nir_opt_undef(nir);
+      nir_validate_shader(nir);
    } while (progress);
 }
 

From 2ef5a4f8304ed368e56806e1e2be6e7d2bd290f7 Mon Sep 17 00:00:00 2001
From: Vinson Lee <vlee@freedesktop.org>
Date: Tue, 21 Jul 2015 14:02:01 -0700
Subject: [PATCH 41/82] ABI-check: Use more portable bash invocation.

Fixes 'make check' on FreeBSD.

Signed-off-by: Vinson Lee <vlee@freedesktop.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mapi/es1api/ABI-check | 2 +-
 src/mapi/es2api/ABI-check | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mapi/es1api/ABI-check b/src/mapi/es1api/ABI-check
index 44654cde863..819568f6d1a 100755
--- a/src/mapi/es1api/ABI-check
+++ b/src/mapi/es1api/ABI-check
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 # Print defined gl.* functions not in GL ES 1.1 or in
 # (FIXME, none of these should be part of the ABI)
diff --git a/src/mapi/es2api/ABI-check b/src/mapi/es2api/ABI-check
index abbb55c2232..e0bf3c83143 100755
--- a/src/mapi/es2api/ABI-check
+++ b/src/mapi/es2api/ABI-check
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 # Print defined gl.* functions not in GL ES 3.0 or in
 # (FIXME, none of these should be part of the ABI)

From b319fd7c14707ff345b7ce1461e5fee81b75a4cf Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 28 Aug 2015 02:50:25 -0400
Subject: [PATCH 42/82] mesa: fix dispatch sanity with
 GL_OES_texture_storage_multisample_2d_array

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91785
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Acked-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/main/tests/dispatch_sanity.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index 59107eb67b1..b941f3e522e 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -2480,5 +2480,8 @@ const struct function gles31_functions_possible[] = {
    { "glVertexAttribBinding", 31, -1 },
    { "glVertexBindingDivisor", 31, -1 },
 
+   /* GL_OES_texture_storage_multisample_2d_array */
+   { "glTexStorage3DMultisampleOES", 31, -1 },
+
    { NULL, 0, -1 },
  };

From 2dbc6a0ad9f5432e5a9a1e66b1c27c574f0def80 Mon Sep 17 00:00:00 2001
From: Neil Roberts <neil@linux.intel.com>
Date: Fri, 28 Aug 2015 14:29:22 +0100
Subject: [PATCH 43/82] docs: Fix a typo in GL3.txt concerning
 GL_KHR_context_flush_control

---
 docs/GL3.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 331b2daaeb6..561f20421db 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -196,7 +196,7 @@ GL 4.5, GLSL 4.50:
   GL_ARB_get_texture_sub_image                         DONE (all drivers)
   GL_ARB_shader_texture_image_samples                  not started
   GL_ARB_texture_barrier                               DONE (nv50, nvc0, r600, radeonsi)
-  GL_KHR_context_flush_control                         DONE (all - but needs GLX/EXT extension to be useful)
+  GL_KHR_context_flush_control                         DONE (all - but needs GLX/EGL extension to be useful)
   GL_KHR_robust_buffer_access_behavior                 not started
   GL_KHR_robustness                                    90% done (the ARB variant)
   GL_EXT_shader_integer_mix                            DONE (all drivers that support GLSL)

From 5aaaaebf22c920745d577c49e463d23b90ba5ea8 Mon Sep 17 00:00:00 2001
From: Daniel Scharrer <daniel@constexpr.org>
Date: Fri, 28 Aug 2015 11:45:35 +0200
Subject: [PATCH 44/82] mesa: add missing queries for ARB_direct_state_access
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This adds index queries (glGet*i_v) for GL_TEXTURE_BINDING_* and
GL_SAMPLER_BINDING, as well as texture queries
(glGetTex{,ture}Parameter*) for GL_TEXTURE_TARGET.

CC: "10.6 11.0" <mesa-stable@lists.freedesktop.org>

Reviewed-by: Fredrik Höglund <fredrik@kde.org>
Signed-off-by: Fredrik Höglund <fredrik@kde.org>
---
 src/mesa/main/get.c      | 86 ++++++++++++++++++++++++++++++++++++++++
 src/mesa/main/texparam.c | 12 ++++++
 2 files changed, 98 insertions(+)

diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 680576cab8f..976bff6653e 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -35,6 +35,7 @@
 #include "mtypes.h"
 #include "state.h"
 #include "texcompress.h"
+#include "texstate.h"
 #include "framebuffer.h"
 #include "samplerobj.h"
 #include "stencil.h"
@@ -1750,6 +1751,52 @@ _mesa_GetDoublev(GLenum pname, GLdouble *params)
    }
 }
 
+/**
+ * Convert a GL texture binding enum such as GL_TEXTURE_BINDING_2D
+ * into the corresponding Mesa texture target index.
+ * \return TEXTURE_x_INDEX or -1 if binding is invalid
+ */
+static int
+tex_binding_to_index(const struct gl_context *ctx, GLenum binding)
+{
+   switch (binding) {
+   case GL_TEXTURE_BINDING_1D:
+      return _mesa_is_desktop_gl(ctx) ? TEXTURE_1D_INDEX : -1;
+   case GL_TEXTURE_BINDING_2D:
+      return TEXTURE_2D_INDEX;
+   case GL_TEXTURE_BINDING_3D:
+      return ctx->API != API_OPENGLES ? TEXTURE_3D_INDEX : -1;
+   case GL_TEXTURE_BINDING_CUBE_MAP:
+      return ctx->Extensions.ARB_texture_cube_map
+         ? TEXTURE_CUBE_INDEX : -1;
+   case GL_TEXTURE_BINDING_RECTANGLE:
+      return _mesa_is_desktop_gl(ctx) && ctx->Extensions.NV_texture_rectangle
+         ? TEXTURE_RECT_INDEX : -1;
+   case GL_TEXTURE_BINDING_1D_ARRAY:
+      return _mesa_is_desktop_gl(ctx) && ctx->Extensions.EXT_texture_array
+         ? TEXTURE_1D_ARRAY_INDEX : -1;
+   case GL_TEXTURE_BINDING_2D_ARRAY:
+      return (_mesa_is_desktop_gl(ctx) && ctx->Extensions.EXT_texture_array)
+         || _mesa_is_gles3(ctx)
+         ? TEXTURE_2D_ARRAY_INDEX : -1;
+   case GL_TEXTURE_BINDING_BUFFER:
+      return ctx->API == API_OPENGL_CORE &&
+             ctx->Extensions.ARB_texture_buffer_object ?
+             TEXTURE_BUFFER_INDEX : -1;
+   case GL_TEXTURE_BINDING_CUBE_MAP_ARRAY:
+      return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_cube_map_array
+         ? TEXTURE_CUBE_ARRAY_INDEX : -1;
+   case GL_TEXTURE_BINDING_2D_MULTISAMPLE:
+      return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample
+         ? TEXTURE_2D_MULTISAMPLE_INDEX : -1;
+   case GL_TEXTURE_BINDING_2D_MULTISAMPLE_ARRAY:
+      return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_multisample
+         ? TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX : -1;
+   default:
+      return -1;
+   }
+}
+
 static enum value_type
 find_value_indexed(const char *func, GLenum pname, GLuint index, union value *v)
 {
@@ -2013,6 +2060,45 @@ find_value_indexed(const char *func, GLenum pname, GLuint index, union value *v)
       v->value_int = ctx->ImageUnits[index].Format;
       return TYPE_INT;
 
+   /* ARB_direct_state_access */
+   case GL_TEXTURE_BINDING_1D:
+   case GL_TEXTURE_BINDING_1D_ARRAY:
+   case GL_TEXTURE_BINDING_2D:
+   case GL_TEXTURE_BINDING_2D_ARRAY:
+   case GL_TEXTURE_BINDING_2D_MULTISAMPLE:
+   case GL_TEXTURE_BINDING_2D_MULTISAMPLE_ARRAY:
+   case GL_TEXTURE_BINDING_3D:
+   case GL_TEXTURE_BINDING_BUFFER:
+   case GL_TEXTURE_BINDING_CUBE_MAP:
+   case GL_TEXTURE_BINDING_CUBE_MAP_ARRAY:
+   case GL_TEXTURE_BINDING_RECTANGLE: {
+      int target;
+
+      if (ctx->API != API_OPENGL_CORE)
+         goto invalid_enum;
+      target = tex_binding_to_index(ctx, pname);
+      if (target < 0)
+         goto invalid_enum;
+      if (index >= _mesa_max_tex_unit(ctx))
+         goto invalid_value;
+
+      v->value_int = ctx->Texture.Unit[index].CurrentTex[target]->Name;
+      return TYPE_INT;
+   }
+
+   case GL_SAMPLER_BINDING: {
+      struct gl_sampler_object *samp;
+
+      if (ctx->API != API_OPENGL_CORE)
+         goto invalid_enum;
+      if (index >= _mesa_max_tex_unit(ctx))
+         goto invalid_value;
+
+      samp = ctx->Texture.Unit[index].Sampler;
+      v->value_int = samp ? samp->Name : 0;
+      return TYPE_INT;
+   }
+
    case GL_MAX_COMPUTE_WORK_GROUP_COUNT:
       if (!_mesa_has_compute_shaders(ctx))
          goto invalid_enum;
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index 3f6f8ba2e20..89f286cc05e 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -1926,6 +1926,12 @@ get_tex_parameterfv(struct gl_context *ctx,
          *params = (GLfloat) obj->ImageFormatCompatibilityType;
          break;
 
+      case GL_TEXTURE_TARGET:
+         if (ctx->API != API_OPENGL_CORE)
+            goto invalid_pname;
+         *params = ENUM_TO_FLOAT(obj->Target);
+         break;
+
       default:
          goto invalid_pname;
    }
@@ -2151,6 +2157,12 @@ get_tex_parameteriv(struct gl_context *ctx,
          *params = obj->ImageFormatCompatibilityType;
          break;
 
+      case GL_TEXTURE_TARGET:
+         if (ctx->API != API_OPENGL_CORE)
+            goto invalid_pname;
+         *params = (GLint) obj->Target;
+         break;
+
       default:
          goto invalid_pname;
    }

From 05161596137994ab4b31e054d5afbff877c0a074 Mon Sep 17 00:00:00 2001
From: Daniel Scharrer <daniel@constexpr.org>
Date: Fri, 28 Aug 2015 11:45:36 +0200
Subject: [PATCH 45/82] mesa: return old name for deleted samplers for
 SAMPLER_BINDING queries
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

If the sampler object has been deleted in the same context the binding
will have been cleared. If it has been deleted in another context, the
spec does not say what should be returned. None of the other binding point
queries check for deletion in another context.

Also, as names of deleted objects are free for reuse, the current code
didn't even work reliably.

Reviewed-by: Fredrik Höglund <fredrik@kde.org>
Signed-off-by: Fredrik Höglund <fredrik@kde.org>
---
 src/mesa/main/get.c | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 976bff6653e..4855187aa6f 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -994,16 +994,7 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
       {
          struct gl_sampler_object *samp =
             ctx->Texture.Unit[ctx->Texture.CurrentUnit].Sampler;
-
-         /*
-          * The sampler object may have been deleted on another context,
-          * so we try to lookup the sampler object before returning its Name.
-          */
-         if (samp && _mesa_lookup_samplerobj(ctx, samp->Name)) {
-            v->value_int = samp->Name;
-         } else {
-            v->value_int = 0;
-         }
+         v->value_int = samp ? samp->Name : 0;
       }
       break;
    /* GL_ARB_uniform_buffer_object */

From 2581fe931a48478123d8054ce7a291cffa851de9 Mon Sep 17 00:00:00 2001
From: Marta Lofstedt <marta.lofstedt@intel.com>
Date: Fri, 28 Aug 2015 10:22:41 +0200
Subject: [PATCH 46/82] i965/fs: Do not set the size for zero-size uniforms

Zero sized uniforms can exist in the list, but they don't get any space
allocated in prog_data->params or in the param_size array, so the size
should not be set for them.  This was previously fixed in:

commit: 781dc7c0e1f41502f18e07c0940af949a78d2792.

However,

commit: 259f7291de2387aa3ac5f856b39b7b934a1d8e7d

removed the fix.

Signed-off-by: Marta Lofstedt <marta.lofstedt@intel.com>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 430efb3021d..9d14d1f2139 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -190,8 +190,8 @@ fs_visitor::nir_setup_uniforms(nir_shader *shader)
             nir_setup_builtin_uniform(var);
          else
             nir_setup_uniform(var);
-
-         param_size[var->data.driver_location] = type_size_scalar(var->type);
+         if(type_size_scalar(var->type) > 0)
+            param_size[var->data.driver_location] = type_size_scalar(var->type);
       }
    } else {
       /* prog_to_nir only creates a single giant uniform variable so we can
@@ -202,7 +202,8 @@ fs_visitor::nir_setup_uniforms(nir_shader *shader)
                &prog->Parameters->ParameterValues[p][i];
          }
       }
-      param_size[0] = prog->Parameters->NumParameters * 4;
+      if(prog->Parameters->NumParameters > 0)
+         param_size[0] = prog->Parameters->NumParameters * 4;
    }
 }
 

From 8765f1d7ddfb00dc5b202e4e679ebe640a547d50 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Tue, 18 Aug 2015 14:28:03 -0700
Subject: [PATCH 47/82] i965: Only consider fixed_hw_reg in equals() if file is
 HW_REG/IMM.

Noticed when debugging things that lead to the next patch.

On G45 (and presumably ILK) this helps register coalescing:

total instructions in shared programs: 4077373 -> 4077340 (-0.00%)
instructions in affected programs:     43751 -> 43718 (-0.08%)
helped:                                52
HURT:                                  2

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.cpp   | 4 +++-
 src/mesa/drivers/dri/i965/brw_vec4.cpp | 5 +++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 81009a09128..269914d64a8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -427,7 +427,9 @@ fs_reg::equals(const fs_reg &r) const
            negate == r.negate &&
            abs == r.abs &&
            !reladdr && !r.reladdr &&
-           memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
+           ((file != HW_REG && file != IMM) ||
+            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
+                   sizeof(fixed_hw_reg)) == 0) &&
            stride == r.stride);
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index f18915a8e38..b97b6c13a13 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -216,8 +216,9 @@ dst_reg::equals(const dst_reg &r) const
            writemask == r.writemask &&
            (reladdr == r.reladdr ||
             (reladdr && r.reladdr && reladdr->equals(*r.reladdr))) &&
-           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
-                  sizeof(fixed_hw_reg)) == 0);
+           ((file != HW_REG && file != IMM) ||
+            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
+                   sizeof(fixed_hw_reg)) == 0));
 }
 
 bool

From f3d0a894af61d9ccc3f00086fbac3809bfed1160 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Tue, 18 Aug 2015 17:10:44 -0700
Subject: [PATCH 48/82] i965/fs: Use overwrites_reg() instead of dst.equals().

equals() returns false for registers with different types, so using it
isn't appropriate to determine whether an instruction is overwriting a
register.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 72e873857ce..218cc614e6d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -128,8 +128,8 @@ can_coalesce_vars(brw::fs_live_variables *live_intervals,
       if (scan_ip > live_intervals->end[var_to])
          return true;
 
-      if (scan_inst->dst.equals(inst->dst) ||
-          scan_inst->dst.equals(inst->src[0]))
+      if (scan_inst->overwrites_reg(inst->dst) ||
+          scan_inst->overwrites_reg(inst->src[0]))
          return false;
    }
 

From f2f8c43af92ad446a5c56bd0f88af36e1a3e0506 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 17 Aug 2015 16:03:27 -0700
Subject: [PATCH 49/82] i965/fs: Improve register coalescing interference
 check.

I always thought that the is_control_flow() -> return false check was a
bad hack, and some previous attempts to remove it have failed and have
been reverted.

The previous two patches fix some problems that caused register
coalescing to not notice some interference between registers, which the
is_control_flow() check apparently works around.

With that fixed, we can calculate interference more accurately.

total instructions in shared programs: 6261319 -> 6257917 (-0.05%)
instructions in affected programs:     346282 -> 342880 (-0.98%)
helped:                                1552

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 .../dri/i965/brw_fs_register_coalesce.cpp     | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 218cc614e6d..452aee5a769 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -110,27 +110,30 @@ can_coalesce_vars(brw::fs_live_variables *live_intervals,
        (end_from > end_to && start_to < start_from))
       return false;
 
-   int start_ip = MIN2(start_to, start_from);
+   /* Check for a write to either register in the intersection of their live
+    * ranges.
+    */
+   int start_ip = MAX2(start_to, start_from);
+   int end_ip = MIN2(end_to, end_from);
    int scan_ip = -1;
 
    foreach_block_and_inst(block, fs_inst, scan_inst, cfg) {
       scan_ip++;
 
+      /* Ignore anything before the intersection of the live ranges */
       if (scan_ip < start_ip)
          continue;
 
-      if (scan_inst->is_control_flow())
-         return false;
-
-      if (scan_ip <= live_intervals->start[var_to])
+      /* Ignore the copying instruction itself */
+      if (scan_inst == inst)
          continue;
 
-      if (scan_ip > live_intervals->end[var_to])
-         return true;
+      if (scan_ip > end_ip)
+         return true; /* registers do not interfere */
 
       if (scan_inst->overwrites_reg(inst->dst) ||
           scan_inst->overwrites_reg(inst->src[0]))
-         return false;
+         return false; /* registers interfere */
    }
 
    return true;

From a2ff1e95a43d0ebcaeee4c239bd8d32963f5224c Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Tue, 18 Aug 2015 17:47:00 -0700
Subject: [PATCH 50/82] i965/fs: Skip blocks in register coalescing
 interference check.

No need to walk through instructions in blocks we know don't contain our
registers' live ranges.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 .../dri/i965/brw_fs_register_coalesce.cpp     | 34 +++++++++++--------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 452aee5a769..0329bc3ebc6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -115,25 +115,31 @@ can_coalesce_vars(brw::fs_live_variables *live_intervals,
     */
    int start_ip = MAX2(start_to, start_from);
    int end_ip = MIN2(end_to, end_from);
-   int scan_ip = -1;
 
-   foreach_block_and_inst(block, fs_inst, scan_inst, cfg) {
-      scan_ip++;
-
-      /* Ignore anything before the intersection of the live ranges */
-      if (scan_ip < start_ip)
+   foreach_block(block, cfg) {
+      if (block->end_ip < start_ip)
          continue;
 
-      /* Ignore the copying instruction itself */
-      if (scan_inst == inst)
-         continue;
+      int scan_ip = block->start_ip - 1;
 
-      if (scan_ip > end_ip)
-         return true; /* registers do not interfere */
+      foreach_inst_in_block(fs_inst, scan_inst, block) {
+         scan_ip++;
 
-      if (scan_inst->overwrites_reg(inst->dst) ||
-          scan_inst->overwrites_reg(inst->src[0]))
-         return false; /* registers interfere */
+         /* Ignore anything before the intersection of the live ranges */
+         if (scan_ip < start_ip)
+            continue;
+
+         /* Ignore the copying instruction itself */
+         if (scan_inst == inst)
+            continue;
+
+         if (scan_ip > end_ip)
+            return true; /* registers do not interfere */
+
+         if (scan_inst->overwrites_reg(inst->dst) ||
+             scan_inst->overwrites_reg(inst->src[0]))
+            return false; /* registers interfere */
+      }
    }
 
    return true;

From 64e312d7fab1b8a4bc0edb9cd9458a511e66d037 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 27 Aug 2015 18:30:34 -0700
Subject: [PATCH 51/82] i965/fs: Replace awful variable names.

   start_to      -> dst_start
   end_to        -> dst_end
   start_from    -> src_start
   end_from      -> src_end
   var_to        -> dst_var
   var_from      -> src_var
   reg_to        -> dst_reg
   reg_to_offset -> dst_reg_offset
   reg_from      -> src_reg

Not sure how these made sense to me before.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 .../dri/i965/brw_fs_register_coalesce.cpp     | 80 +++++++++----------
 1 file changed, 40 insertions(+), 40 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 0329bc3ebc6..34f8715eeb9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -95,26 +95,26 @@ is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst)
 static bool
 can_coalesce_vars(brw::fs_live_variables *live_intervals,
                   const cfg_t *cfg, const fs_inst *inst,
-                  int var_to, int var_from)
+                  int dst_var, int src_var)
 {
-   if (!live_intervals->vars_interfere(var_from, var_to))
+   if (!live_intervals->vars_interfere(src_var, dst_var))
       return true;
 
-   int start_to = live_intervals->start[var_to];
-   int end_to = live_intervals->end[var_to];
-   int start_from = live_intervals->start[var_from];
-   int end_from = live_intervals->end[var_from];
+   int dst_start = live_intervals->start[dst_var];
+   int dst_end = live_intervals->end[dst_var];
+   int src_start = live_intervals->start[src_var];
+   int src_end = live_intervals->end[src_var];
 
    /* Variables interfere and one line range isn't a subset of the other. */
-   if ((end_to > end_from && start_from < start_to) ||
-       (end_from > end_to && start_to < start_from))
+   if ((dst_end > src_end && src_start < dst_start) ||
+       (src_end > dst_end && dst_start < src_start))
       return false;
 
    /* Check for a write to either register in the intersection of their live
     * ranges.
     */
-   int start_ip = MAX2(start_to, start_from);
-   int end_ip = MIN2(end_to, end_from);
+   int start_ip = MAX2(dst_start, src_start);
+   int end_ip = MIN2(dst_end, src_end);
 
    foreach_block(block, cfg) {
       if (block->end_ip < start_ip)
@@ -154,11 +154,11 @@ fs_visitor::register_coalesce()
 
    int src_size = 0;
    int channels_remaining = 0;
-   int reg_from = -1, reg_to = -1;
-   int reg_to_offset[MAX_VGRF_SIZE];
+   int src_reg = -1, dst_reg = -1;
+   int dst_reg_offset[MAX_VGRF_SIZE];
    fs_inst *mov[MAX_VGRF_SIZE];
-   int var_to[MAX_VGRF_SIZE];
-   int var_from[MAX_VGRF_SIZE];
+   int dst_var[MAX_VGRF_SIZE];
+   int src_var[MAX_VGRF_SIZE];
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       if (!is_coalesce_candidate(this, inst))
@@ -170,8 +170,8 @@ fs_visitor::register_coalesce()
          continue;
       }
 
-      if (reg_from != inst->src[0].reg) {
-         reg_from = inst->src[0].reg;
+      if (src_reg != inst->src[0].reg) {
+         src_reg = inst->src[0].reg;
 
          src_size = alloc.sizes[inst->src[0].reg];
          assert(src_size <= MAX_VGRF_SIZE);
@@ -179,15 +179,15 @@ fs_visitor::register_coalesce()
          channels_remaining = src_size;
          memset(mov, 0, sizeof(mov));
 
-         reg_to = inst->dst.reg;
+         dst_reg = inst->dst.reg;
       }
 
-      if (reg_to != inst->dst.reg)
+      if (dst_reg != inst->dst.reg)
          continue;
 
       if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
          for (int i = 0; i < src_size; i++) {
-            reg_to_offset[i] = i;
+            dst_reg_offset[i] = i;
          }
          mov[0] = inst;
          channels_remaining -= inst->regs_written;
@@ -203,9 +203,9 @@ fs_visitor::register_coalesce()
             channels_remaining = -1;
             continue;
          }
-         reg_to_offset[offset] = inst->dst.reg_offset;
+         dst_reg_offset[offset] = inst->dst.reg_offset;
          if (inst->regs_written > 1)
-            reg_to_offset[offset + 1] = inst->dst.reg_offset + 1;
+            dst_reg_offset[offset + 1] = inst->dst.reg_offset + 1;
          mov[offset] = inst;
          channels_remaining -= inst->regs_written;
       }
@@ -215,20 +215,20 @@ fs_visitor::register_coalesce()
 
       bool can_coalesce = true;
       for (int i = 0; i < src_size; i++) {
-         if (reg_to_offset[i] != reg_to_offset[0] + i) {
+         if (dst_reg_offset[i] != dst_reg_offset[0] + i) {
             /* Registers are out-of-order. */
             can_coalesce = false;
-            reg_from = -1;
+            src_reg = -1;
             break;
          }
 
-         var_to[i] = live_intervals->var_from_vgrf[reg_to] + reg_to_offset[i];
-         var_from[i] = live_intervals->var_from_vgrf[reg_from] + i;
+         dst_var[i] = live_intervals->var_from_vgrf[dst_reg] + dst_reg_offset[i];
+         src_var[i] = live_intervals->var_from_vgrf[src_reg] + i;
 
          if (!can_coalesce_vars(live_intervals, cfg, inst,
-                                var_to[i], var_from[i])) {
+                                dst_var[i], src_var[i])) {
             can_coalesce = false;
-            reg_from = -1;
+            src_reg = -1;
             break;
          }
       }
@@ -251,31 +251,31 @@ fs_visitor::register_coalesce()
 
       foreach_block_and_inst(block, fs_inst, scan_inst, cfg) {
          if (scan_inst->dst.file == GRF &&
-             scan_inst->dst.reg == reg_from) {
-            scan_inst->dst.reg = reg_to;
+             scan_inst->dst.reg == src_reg) {
+            scan_inst->dst.reg = dst_reg;
             scan_inst->dst.reg_offset =
-               reg_to_offset[scan_inst->dst.reg_offset];
+               dst_reg_offset[scan_inst->dst.reg_offset];
          }
 
          for (int j = 0; j < scan_inst->sources; j++) {
             if (scan_inst->src[j].file == GRF &&
-                scan_inst->src[j].reg == reg_from) {
-               scan_inst->src[j].reg = reg_to;
+                scan_inst->src[j].reg == src_reg) {
+               scan_inst->src[j].reg = dst_reg;
                scan_inst->src[j].reg_offset =
-                  reg_to_offset[scan_inst->src[j].reg_offset];
+                  dst_reg_offset[scan_inst->src[j].reg_offset];
             }
          }
       }
 
       for (int i = 0; i < src_size; i++) {
-         live_intervals->start[var_to[i]] =
-            MIN2(live_intervals->start[var_to[i]],
-                 live_intervals->start[var_from[i]]);
-         live_intervals->end[var_to[i]] =
-            MAX2(live_intervals->end[var_to[i]],
-                 live_intervals->end[var_from[i]]);
+         live_intervals->start[dst_var[i]] =
+            MIN2(live_intervals->start[dst_var[i]],
+                 live_intervals->start[src_var[i]]);
+         live_intervals->end[dst_var[i]] =
+            MAX2(live_intervals->end[dst_var[i]],
+                 live_intervals->end[src_var[i]]);
       }
-      reg_from = -1;
+      src_reg = -1;
    }
 
    if (progress) {

From c676c432f30158190c260e7f3731ee6667ad4103 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 17 Aug 2015 14:38:31 -0700
Subject: [PATCH 52/82] i965/fs: Remove fs_visitor::try_replace_with_sel().

No shader-db changes on g4x, snb, hsw, or bdw.

Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/drivers/dri/i965/brw_fs.h           |  1 -
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp     |  2 -
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 89 --------------------
 3 files changed, 92 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 31f39fe0adc..0a89d2e7640 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -225,7 +225,6 @@ public:
    void emit_gen6_gather_wa(uint8_t wa, fs_reg dst);
    fs_reg resolve_source_modifiers(const fs_reg &src);
    void emit_discard_jump();
-   bool try_replace_with_sel();
    bool opt_peephole_sel();
    bool opt_peephole_predicated_break();
    bool opt_saturate_propagation();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 9d14d1f2139..9929dd6a42f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -416,8 +416,6 @@ fs_visitor::nir_emit_if(nir_if *if_stmt)
    nir_emit_cf_list(&if_stmt->else_list);
 
    bld.emit(BRW_OPCODE_ENDIF);
-
-   try_replace_with_sel();
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 111db8c4323..504673f8bd9 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -441,95 +441,6 @@ fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
    }
 }
 
-/**
- * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
- *
- * Many GLSL shaders contain the following pattern:
- *
- *    x = condition ? foo : bar
- *
- * The compiler emits an ir_if tree for this, since each subexpression might be
- * a complex tree that could have side-effects or short-circuit logic.
- *
- * However, the common case is to simply select one of two constants or
- * variable values---which is exactly what SEL is for.  In this case, the
- * assembly looks like:
- *
- *    (+f0) IF
- *    MOV dst src0
- *    ELSE
- *    MOV dst src1
- *    ENDIF
- *
- * which can be easily translated into:
- *
- *    (+f0) SEL dst src0 src1
- *
- * If src0 is an immediate value, we promote it to a temporary GRF.
- */
-bool
-fs_visitor::try_replace_with_sel()
-{
-   fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
-   assert(endif_inst->opcode == BRW_OPCODE_ENDIF);
-
-   /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
-   int opcodes[] = {
-      BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
-   };
-
-   fs_inst *match = (fs_inst *) endif_inst->prev;
-   for (int i = 0; i < 4; i++) {
-      if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
-         return false;
-      match = (fs_inst *) match->prev;
-   }
-
-   /* The opcodes match; it looks like the right sequence of instructions. */
-   fs_inst *else_mov = (fs_inst *) endif_inst->prev;
-   fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
-   fs_inst *if_inst = (fs_inst *) then_mov->prev;
-
-   /* Check that the MOVs are the right form. */
-   if (then_mov->dst.equals(else_mov->dst) &&
-       !then_mov->is_partial_write() &&
-       !else_mov->is_partial_write()) {
-
-      /* Remove the matched instructions; we'll emit a SEL to replace them. */
-      while (!if_inst->next->is_tail_sentinel())
-         if_inst->next->exec_node::remove();
-      if_inst->exec_node::remove();
-
-      /* Only the last source register can be a constant, so if the MOV in
-       * the "then" clause uses a constant, we need to put it in a temporary.
-       */
-      fs_reg src0(then_mov->src[0]);
-      if (src0.file == IMM) {
-         src0 = vgrf(glsl_type::float_type);
-         src0.type = then_mov->src[0].type;
-         bld.MOV(src0, then_mov->src[0]);
-      }
-
-      if (if_inst->conditional_mod) {
-         /* Sandybridge-specific IF with embedded comparison */
-         bld.CMP(bld.null_reg_d(), if_inst->src[0], if_inst->src[1],
-                 if_inst->conditional_mod);
-         set_predicate(BRW_PREDICATE_NORMAL,
-                       bld.emit(BRW_OPCODE_SEL, then_mov->dst,
-                                src0, else_mov->src[0]));
-      } else {
-         /* Separate CMP and IF instructions */
-         set_predicate_inv(if_inst->predicate, if_inst->predicate_inverse,
-                           bld.emit(BRW_OPCODE_SEL, then_mov->dst,
-                                    src0, else_mov->src[0]));
-      }
-
-      return true;
-   }
-
-   return false;
-}
-
 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
 void
 fs_visitor::emit_dummy_fs()

From 889a946a455c54a5a9bca144b2ea2fe66be39274 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 20 Aug 2015 20:52:32 -0400
Subject: [PATCH 53/82] glsl: use bitfield_insert instead of and + shift + or
 for packing

It is fairly tricky to detect the proper conditions for using bitfield
insert, but easy to just use it up front. This removes a lot of
instructions on nvc0 when invoking the packing builtins.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/glsl/ir_optimization.h                 |  4 +++-
 src/glsl/lower_packing_builtins.cpp        | 27 +++++++++++++++++++---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp |  3 +++
 3 files changed, 30 insertions(+), 4 deletions(-)

diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index eef107e5249..b955874df84 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -66,7 +66,9 @@ enum lower_packing_builtins_op {
    LOWER_UNPACK_SNORM_4x8               = 0x0200,
 
    LOWER_PACK_UNORM_4x8                 = 0x0400,
-   LOWER_UNPACK_UNORM_4x8               = 0x0800
+   LOWER_UNPACK_UNORM_4x8               = 0x0800,
+
+   LOWER_PACK_USE_BFI                   = 0x1000,
 };
 
 bool do_common_optimization(exec_list *ir, bool linked,
diff --git a/src/glsl/lower_packing_builtins.cpp b/src/glsl/lower_packing_builtins.cpp
index a6fb8a8837e..1d76ebf935f 100644
--- a/src/glsl/lower_packing_builtins.cpp
+++ b/src/glsl/lower_packing_builtins.cpp
@@ -118,6 +118,7 @@ public:
          *rvalue = split_unpack_half_2x16(op0);
          break;
       case LOWER_PACK_UNPACK_NONE:
+      case LOWER_PACK_USE_BFI:
          assert(!"not reached");
          break;
       }
@@ -222,9 +223,16 @@ private:
 
       /* uvec2 u = UVEC2_RVAL; */
       ir_variable *u = factory.make_temp(glsl_type::uvec2_type,
-                                          "tmp_pack_uvec2_to_uint");
+                                         "tmp_pack_uvec2_to_uint");
       factory.emit(assign(u, uvec2_rval));
 
+      if (op_mask & LOWER_PACK_USE_BFI) {
+         return bitfield_insert(bit_and(swizzle_x(u), constant(0xffffu)),
+                                swizzle_y(u),
+                                constant(16),
+                                constant(16));
+      }
+
       /* return (u.y << 16) | (u.x & 0xffff); */
       return bit_or(lshift(swizzle_y(u), constant(16u)),
                     bit_and(swizzle_x(u), constant(0xffffu)));
@@ -242,9 +250,22 @@ private:
    {
       assert(uvec4_rval->type == glsl_type::uvec4_type);
 
-      /* uvec4 u = UVEC4_RVAL; */
       ir_variable *u = factory.make_temp(glsl_type::uvec4_type,
-                                          "tmp_pack_uvec4_to_uint");
+                                         "tmp_pack_uvec4_to_uint");
+
+      if (op_mask & LOWER_PACK_USE_BFI) {
+         /* uvec4 u = UVEC4_RVAL; */
+         factory.emit(assign(u, uvec4_rval));
+
+         return bitfield_insert(bitfield_insert(
+                                   bitfield_insert(
+                                      bit_and(swizzle_x(u), constant(0xffu)),
+                                      swizzle_y(u), constant(8), constant(8)),
+                                   swizzle_z(u), constant(16), constant(8)),
+                                swizzle_w(u), constant(24), constant(8));
+      }
+
+      /* uvec4 u = UVEC4_RVAL & 0xff */
       factory.emit(assign(u, bit_and(uvec4_rval, constant(0xffu))));
 
       /* return (u.w << 24) | (u.z << 16) | (u.y << 8) | u.x; */
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 695644117ac..7a8c4e1b8fa 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -6019,6 +6019,9 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
                                LOWER_PACK_HALF_2x16 |
                                LOWER_UNPACK_HALF_2x16;
 
+         if (ctx->Extensions.ARB_gpu_shader5)
+            lower_inst |= LOWER_PACK_USE_BFI;
+
          lower_packing_builtins(ir, lower_inst);
       }
 

From 275c5810ca7e38560b2a77281e7a0498c50126f8 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 20 Aug 2015 21:55:52 -0400
Subject: [PATCH 54/82] glsl: provide the option of using BFE for unpack
 builtin lowering

This greatly improves generated code, especially for the snorm variants,
since it is able to get rid of the lshift/rshift for sext, as well as
replacing each shift + mask with a single op.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/glsl/ir_builder.cpp                    |   6 ++
 src/glsl/ir_builder.h                      |   1 +
 src/glsl/ir_optimization.h                 |   1 +
 src/glsl/lower_packing_builtins.cpp        | 101 ++++++++++++++++++---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp |   3 +-
 5 files changed, 99 insertions(+), 13 deletions(-)

diff --git a/src/glsl/ir_builder.cpp b/src/glsl/ir_builder.cpp
index cd03859cac0..c9cf1240dfe 100644
--- a/src/glsl/ir_builder.cpp
+++ b/src/glsl/ir_builder.cpp
@@ -566,6 +566,12 @@ csel(operand a, operand b, operand c)
    return expr(ir_triop_csel, a, b, c);
 }
 
+ir_expression *
+bitfield_extract(operand a, operand b, operand c)
+{
+   return expr(ir_triop_bitfield_extract, a, b, c);
+}
+
 ir_expression *
 bitfield_insert(operand a, operand b, operand c, operand d)
 {
diff --git a/src/glsl/ir_builder.h b/src/glsl/ir_builder.h
index f76453ffcf0..b483ebf6269 100644
--- a/src/glsl/ir_builder.h
+++ b/src/glsl/ir_builder.h
@@ -200,6 +200,7 @@ ir_expression *interpolate_at_sample(operand a, operand b);
 ir_expression *fma(operand a, operand b, operand c);
 ir_expression *lrp(operand x, operand y, operand a);
 ir_expression *csel(operand a, operand b, operand c);
+ir_expression *bitfield_extract(operand a, operand b, operand c);
 ir_expression *bitfield_insert(operand a, operand b, operand c, operand d);
 
 ir_swizzle *swizzle(operand a, int swizzle, int components);
diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
index b955874df84..265b2234cb6 100644
--- a/src/glsl/ir_optimization.h
+++ b/src/glsl/ir_optimization.h
@@ -69,6 +69,7 @@ enum lower_packing_builtins_op {
    LOWER_UNPACK_UNORM_4x8               = 0x0800,
 
    LOWER_PACK_USE_BFI                   = 0x1000,
+   LOWER_PACK_USE_BFE                   = 0x2000,
 };
 
 bool do_common_optimization(exec_list *ir, bool linked,
diff --git a/src/glsl/lower_packing_builtins.cpp b/src/glsl/lower_packing_builtins.cpp
index 1d76ebf935f..c8bf68be829 100644
--- a/src/glsl/lower_packing_builtins.cpp
+++ b/src/glsl/lower_packing_builtins.cpp
@@ -119,6 +119,7 @@ public:
          break;
       case LOWER_PACK_UNPACK_NONE:
       case LOWER_PACK_USE_BFI:
+      case LOWER_PACK_USE_BFE:
          assert(!"not reached");
          break;
       }
@@ -305,6 +306,39 @@ private:
       return deref(u2).val;
    }
 
+   /**
+    * \brief Unpack a uint32 into two int16's.
+    *
+    * Specifically each 16-bit value is sign-extended to the full width of an
+    * int32 on return.
+    */
+   ir_rvalue *
+   unpack_uint_to_ivec2(ir_rvalue *uint_rval)
+   {
+      assert(uint_rval->type == glsl_type::uint_type);
+
+      if (!(op_mask & LOWER_PACK_USE_BFE)) {
+         return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
+                              constant(16u)),
+                       constant(16u));
+      }
+
+      ir_variable *i = factory.make_temp(glsl_type::int_type,
+                                         "tmp_unpack_uint_to_ivec2_i");
+      factory.emit(assign(i, u2i(uint_rval)));
+
+      /* ivec2 i2; */
+      ir_variable *i2 = factory.make_temp(glsl_type::ivec2_type,
+                                          "tmp_unpack_uint_to_ivec2_i2");
+
+      factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
+                          WRITEMASK_X));
+      factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
+                          WRITEMASK_Y));
+
+      return deref(i2).val;
+   }
+
    /**
     * \brief Unpack a uint32 into four uint8's.
     *
@@ -329,13 +363,23 @@ private:
       /* u4.x = u & 0xffu; */
       factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
 
-      /* u4.y = (u >> 8u) & 0xffu; */
-      factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
-                                      constant(0xffu)), WRITEMASK_Y));
+      if (op_mask & LOWER_PACK_USE_BFE) {
+         /* u4.y = bitfield_extract(u, 8, 8); */
+         factory.emit(assign(u4, bitfield_extract(u, constant(8), constant(8)),
+                             WRITEMASK_Y));
 
-      /* u4.z = (u >> 16u) & 0xffu; */
-      factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
-                                      constant(0xffu)), WRITEMASK_Z));
+         /* u4.z = bitfield_extract(u, 16, 8); */
+         factory.emit(assign(u4, bitfield_extract(u, constant(16), constant(8)),
+                             WRITEMASK_Z));
+      } else {
+         /* u4.y = (u >> 8u) & 0xffu; */
+         factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
+                                         constant(0xffu)), WRITEMASK_Y));
+
+         /* u4.z = (u >> 16u) & 0xffu; */
+         factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
+                                         constant(0xffu)), WRITEMASK_Z));
+      }
 
       /* u4.w = (u >> 24u) */
       factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
@@ -343,6 +387,43 @@ private:
       return deref(u4).val;
    }
 
+   /**
+    * \brief Unpack a uint32 into four int8's.
+    *
+    * Specifically each 8-bit value is sign-extended to the full width of an
+    * int32 on return.
+    */
+   ir_rvalue *
+   unpack_uint_to_ivec4(ir_rvalue *uint_rval)
+   {
+      assert(uint_rval->type == glsl_type::uint_type);
+
+      if (!(op_mask & LOWER_PACK_USE_BFE)) {
+         return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
+                              constant(24u)),
+                       constant(24u));
+      }
+
+      ir_variable *i = factory.make_temp(glsl_type::int_type,
+                                         "tmp_unpack_uint_to_ivec4_i");
+      factory.emit(assign(i, u2i(uint_rval)));
+
+      /* ivec4 i4; */
+      ir_variable *i4 = factory.make_temp(glsl_type::ivec4_type,
+                                          "tmp_unpack_uint_to_ivec4_i4");
+
+      factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
+                          WRITEMASK_X));
+      factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
+                          WRITEMASK_Y));
+      factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
+                          WRITEMASK_Z));
+      factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
+                          WRITEMASK_W));
+
+      return deref(i4).val;
+   }
+
    /**
     * \brief Lower a packSnorm2x16 expression.
     *
@@ -489,9 +570,7 @@ private:
       assert(uint_rval->type == glsl_type::uint_type);
 
       ir_rvalue *result =
-        clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
-                                    constant(16)),
-                             constant(16u))),
+        clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
                   constant(32767.0f)),
               constant(-1.0f),
               constant(1.0f));
@@ -548,9 +627,7 @@ private:
       assert(uint_rval->type == glsl_type::uint_type);
 
       ir_rvalue *result =
-        clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
-                                    constant(24u)),
-                             constant(24u))),
+        clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
                   constant(127.0f)),
               constant(-1.0f),
               constant(1.0f));
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 7a8c4e1b8fa..95a25c12fb4 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -6020,7 +6020,8 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
                                LOWER_UNPACK_HALF_2x16;
 
          if (ctx->Extensions.ARB_gpu_shader5)
-            lower_inst |= LOWER_PACK_USE_BFI;
+            lower_inst |= LOWER_PACK_USE_BFI |
+                          LOWER_PACK_USE_BFE;
 
          lower_packing_builtins(ir, lower_inst);
       }

From 8d6d0cc17d945317f44328a7761801e6781dc3fc Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Fri, 28 Aug 2015 09:57:04 +1000
Subject: [PATCH 55/82] gallium/util: fix debug_get_flags_option on 32-bit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

On 32-bit we need to use the PRI*64 format macros from <inttypes.h> for
printfs of 64-bit values; otherwise R600_DEBUG=help segfaults.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Cc: "11.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/util/u_debug.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_debug.c b/src/gallium/auxiliary/util/u_debug.c
index b4503deb8f6..5fe9e33e208 100644
--- a/src/gallium/auxiliary/util/u_debug.c
+++ b/src/gallium/auxiliary/util/u_debug.c
@@ -41,6 +41,7 @@
 #include "util/u_tile.h" 
 #include "util/u_prim.h"
 #include "util/u_surface.h"
+#include <inttypes.h>
 
 #include <stdio.h>
 #include <limits.h> /* CHAR_BIT */
@@ -275,7 +276,7 @@ debug_get_flags_option(const char *name,
       for (; flags->name; ++flags)
          namealign = MAX2(namealign, strlen(flags->name));
       for (flags = orig; flags->name; ++flags)
-         _debug_printf("| %*s [0x%0*lx]%s%s\n", namealign, flags->name,
+         _debug_printf("| %*s [0x%0*"PRIx64"]%s%s\n", namealign, flags->name,
                       (int)sizeof(uint64_t)*CHAR_BIT/4, flags->value,
                       flags->desc ? " " : "", flags->desc ? flags->desc : "");
    }
@@ -290,9 +291,9 @@ debug_get_flags_option(const char *name,
 
    if (debug_get_option_should_print()) {
       if (str) {
-         debug_printf("%s: %s = 0x%lx (%s)\n", __FUNCTION__, name, result, str);
+         debug_printf("%s: %s = 0x%"PRIx64" (%s)\n", __FUNCTION__, name, result, str);
       } else {
-         debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result);
+         debug_printf("%s: %s = 0x%"PRIx64"\n", __FUNCTION__, name, result);
       }
    }
 

From c149d84d458101e188386a92bbff1e071284d375 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Fri, 28 Aug 2015 09:58:15 +1000
Subject: [PATCH 56/82] r600g: use PRIi64 for some compute debug printfs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Otherwise this will crash on 32-bit, and it gets rid of
warnings building on 32-bit.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/compute_memory_pool.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/drivers/r600/compute_memory_pool.c b/src/gallium/drivers/r600/compute_memory_pool.c
index 413aa3d7c59..7c5113e9197 100644
--- a/src/gallium/drivers/r600/compute_memory_pool.c
+++ b/src/gallium/drivers/r600/compute_memory_pool.c
@@ -120,7 +120,7 @@ int64_t compute_memory_prealloc_chunk(
 
 	assert(size_in_dw <= pool->size_in_dw);
 
-	COMPUTE_DBG(pool->screen, "* compute_memory_prealloc_chunk() size_in_dw = %ld\n",
+	COMPUTE_DBG(pool->screen, "* compute_memory_prealloc_chunk() size_in_dw = %"PRIi64"\n",
 		size_in_dw);
 
 	LIST_FOR_EACH_ENTRY(item, pool->item_list, link) {
@@ -151,7 +151,7 @@ struct list_head *compute_memory_postalloc_chunk(
 	struct compute_memory_item *next;
 	struct list_head *next_link;
 
-	COMPUTE_DBG(pool->screen, "* compute_memory_postalloc_chunck() start_in_dw = %ld\n",
+	COMPUTE_DBG(pool->screen, "* compute_memory_postalloc_chunck() start_in_dw = %"PRIi64"\n",
 		start_in_dw);
 
 	/* Check if we can insert it in the front of the list */
@@ -568,7 +568,7 @@ void compute_memory_free(struct compute_memory_pool* pool, int64_t id)
 	struct pipe_screen *screen = (struct pipe_screen *)pool->screen;
 	struct pipe_resource *res;
 
-	COMPUTE_DBG(pool->screen, "* compute_memory_free() id + %ld \n", id);
+	COMPUTE_DBG(pool->screen, "* compute_memory_free() id + %"PRIi64" \n", id);
 
 	LIST_FOR_EACH_ENTRY_SAFE(item, next, pool->item_list, link) {
 
@@ -628,7 +628,7 @@ struct compute_memory_item* compute_memory_alloc(
 {
 	struct compute_memory_item *new_item = NULL;
 
-	COMPUTE_DBG(pool->screen, "* compute_memory_alloc() size_in_dw = %ld (%ld bytes)\n",
+	COMPUTE_DBG(pool->screen, "* compute_memory_alloc() size_in_dw = %"PRIi64" (%"PRIi64" bytes)\n",
 			size_in_dw, 4 * size_in_dw);
 
 	new_item = (struct compute_memory_item *)

From 6941883175612ae602a8745945153ba064f53a7a Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Fri, 28 Aug 2015 10:46:10 +1000
Subject: [PATCH 57/82] r600: port si_conv_prim_to_gs_out from radeonsi

This code was broken by the tess merge, and I totally missed it
until now. I'm not sure this fixes anything but it stops the assert.

Cc: "11.0" <mesa-stable@lists.freedesktop.org>
Reviewed-by: Glenn Kennard <glenn.kennard@gmail.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/r600_pipe.h | 31 ++++++++++++++--------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 384ba800a79..3247aba969e 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -939,21 +939,22 @@ static inline bool r600_can_read_depth(struct r600_texture *rtex)
 static inline unsigned r600_conv_prim_to_gs_out(unsigned mode)
 {
 	static const int prim_conv[] = {
-		V_028A6C_OUTPRIM_TYPE_POINTLIST,
-		V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		V_028A6C_OUTPRIM_TYPE_TRISTRIP
+		[PIPE_PRIM_POINTS]			= V_028A6C_OUTPRIM_TYPE_POINTLIST,
+		[PIPE_PRIM_LINES]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_LINE_LOOP]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_LINE_STRIP]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_TRIANGLES]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_TRIANGLE_STRIP]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_TRIANGLE_FAN]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_QUADS]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_QUAD_STRIP]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_POLYGON]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_LINES_ADJACENCY]		= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_LINE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_TRIANGLES_ADJACENCY]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_PATCHES]			= V_028A6C_OUTPRIM_TYPE_POINTLIST,
+		[R600_PRIM_RECTANGLE_LIST]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP
 	};
 	assert(mode < Elements(prim_conv));
 

From 0eac5990016dbc435ef3260cf602783bebf03e59 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Fri, 28 Aug 2015 18:15:13 +0200
Subject: [PATCH 58/82] nvc0: remove commented out code related to PCOUNTER
 queries

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 .../drivers/nouveau/nvc0/nvc0_screen.h        | 20 -------------------
 1 file changed, 20 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index d8826ae0c0d..41008d229a1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -173,26 +173,6 @@ nvc0_screen(struct pipe_screen *screen)
 #define NVE4_PM_QUERY_METRIC_MP_EFFICIENCY      47
 #define NVE4_PM_QUERY_METRIC_INST_REPLAY_OHEAD  48
 
-/*
-#define NVE4_PM_QUERY_GR_IDLE                   50
-#define NVE4_PM_QUERY_BSP_IDLE                  51
-#define NVE4_PM_QUERY_VP_IDLE                   52
-#define NVE4_PM_QUERY_PPP_IDLE                  53
-#define NVE4_PM_QUERY_CE0_IDLE                  54
-#define NVE4_PM_QUERY_CE1_IDLE                  55
-#define NVE4_PM_QUERY_CE2_IDLE                  56
-*/
-/* L2 queries (PCOUNTER) */
-/*
-#define NVE4_PM_QUERY_L2_SUBP_WRITE_L1_SECTOR_QUERIES 57
-...
-*/
-/* TEX queries (PCOUNTER) */
-/*
-#define NVE4_PM_QUERY_TEX0_CACHE_SECTOR_QUERIES 58
-...
-*/
-
 #define NVC0_PM_QUERY_COUNT 31
 #define NVC0_PM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i))
 #define NVC0_PM_QUERY_LAST   NVC0_PM_QUERY(NVC0_PM_QUERY_COUNT - 1)

From 981f46aa95b27ff1d139a6d5e059f9fd32cf83f0 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Fri, 28 Aug 2015 18:30:13 +0200
Subject: [PATCH 59/82] nvc0: use enumerations for driver queries

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 .../drivers/nouveau/nvc0/nvc0_screen.h        | 241 +++++++++---------
 1 file changed, 122 insertions(+), 119 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 41008d229a1..cc671ef363a 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -120,136 +120,139 @@ nvc0_screen(struct pipe_screen *screen)
 
 /* Performance counter queries:
  */
-#define NVE4_PM_QUERY_COUNT  49
 #define NVE4_PM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + (i))
 #define NVE4_PM_QUERY_LAST   NVE4_PM_QUERY(NVE4_PM_QUERY_COUNT - 1)
-#define NVE4_PM_QUERY_PROF_TRIGGER_0            0
-#define NVE4_PM_QUERY_PROF_TRIGGER_1            1
-#define NVE4_PM_QUERY_PROF_TRIGGER_2            2
-#define NVE4_PM_QUERY_PROF_TRIGGER_3            3
-#define NVE4_PM_QUERY_PROF_TRIGGER_4            4
-#define NVE4_PM_QUERY_PROF_TRIGGER_5            5
-#define NVE4_PM_QUERY_PROF_TRIGGER_6            6
-#define NVE4_PM_QUERY_PROF_TRIGGER_7            7
-#define NVE4_PM_QUERY_LAUNCHED_WARPS            8
-#define NVE4_PM_QUERY_LAUNCHED_THREADS          9
-#define NVE4_PM_QUERY_LAUNCHED_CTA              10
-#define NVE4_PM_QUERY_INST_ISSUED1              11
-#define NVE4_PM_QUERY_INST_ISSUED2              12
-#define NVE4_PM_QUERY_INST_EXECUTED             13
-#define NVE4_PM_QUERY_LD_LOCAL                  14
-#define NVE4_PM_QUERY_ST_LOCAL                  15
-#define NVE4_PM_QUERY_LD_SHARED                 16
-#define NVE4_PM_QUERY_ST_SHARED                 17
-#define NVE4_PM_QUERY_L1_LOCAL_LOAD_HIT         18
-#define NVE4_PM_QUERY_L1_LOCAL_LOAD_MISS        19
-#define NVE4_PM_QUERY_L1_LOCAL_STORE_HIT        20
-#define NVE4_PM_QUERY_L1_LOCAL_STORE_MISS       21
-#define NVE4_PM_QUERY_GLD_REQUEST               22
-#define NVE4_PM_QUERY_GST_REQUEST               23
-#define NVE4_PM_QUERY_L1_GLOBAL_LOAD_HIT        24
-#define NVE4_PM_QUERY_L1_GLOBAL_LOAD_MISS       25
-#define NVE4_PM_QUERY_GLD_TRANSACTIONS_UNCACHED 26
-#define NVE4_PM_QUERY_GST_TRANSACTIONS          27
-#define NVE4_PM_QUERY_BRANCH                    28
-#define NVE4_PM_QUERY_BRANCH_DIVERGENT          29
-#define NVE4_PM_QUERY_ACTIVE_WARPS              30
-#define NVE4_PM_QUERY_ACTIVE_CYCLES             31
-#define NVE4_PM_QUERY_INST_ISSUED               32
-#define NVE4_PM_QUERY_ATOM_COUNT                33
-#define NVE4_PM_QUERY_GRED_COUNT                34
-#define NVE4_PM_QUERY_LD_SHARED_REPLAY          35
-#define NVE4_PM_QUERY_ST_SHARED_REPLAY          36
-#define NVE4_PM_QUERY_LD_LOCAL_TRANSACTIONS     37
-#define NVE4_PM_QUERY_ST_LOCAL_TRANSACTIONS     38
-#define NVE4_PM_QUERY_L1_LD_SHARED_TRANSACTIONS 39
-#define NVE4_PM_QUERY_L1_ST_SHARED_TRANSACTIONS 40
-#define NVE4_PM_QUERY_GLD_MEM_DIV_REPLAY        41
-#define NVE4_PM_QUERY_GST_MEM_DIV_REPLAY        42
-#define NVE4_PM_QUERY_METRIC_IPC                43
-#define NVE4_PM_QUERY_METRIC_IPAC               44
-#define NVE4_PM_QUERY_METRIC_IPEC               45
-#define NVE4_PM_QUERY_METRIC_MP_OCCUPANCY       46
-#define NVE4_PM_QUERY_METRIC_MP_EFFICIENCY      47
-#define NVE4_PM_QUERY_METRIC_INST_REPLAY_OHEAD  48
+enum nve4_pm_queries
+{
+    NVE4_PM_QUERY_PROF_TRIGGER_0 = 0,
+    NVE4_PM_QUERY_PROF_TRIGGER_1,
+    NVE4_PM_QUERY_PROF_TRIGGER_2,
+    NVE4_PM_QUERY_PROF_TRIGGER_3,
+    NVE4_PM_QUERY_PROF_TRIGGER_4,
+    NVE4_PM_QUERY_PROF_TRIGGER_5,
+    NVE4_PM_QUERY_PROF_TRIGGER_6,
+    NVE4_PM_QUERY_PROF_TRIGGER_7,
+    NVE4_PM_QUERY_LAUNCHED_WARPS,
+    NVE4_PM_QUERY_LAUNCHED_THREADS,
+    NVE4_PM_QUERY_LAUNCHED_CTA,
+    NVE4_PM_QUERY_INST_ISSUED1,
+    NVE4_PM_QUERY_INST_ISSUED2,
+    NVE4_PM_QUERY_INST_EXECUTED,
+    NVE4_PM_QUERY_LD_LOCAL,
+    NVE4_PM_QUERY_ST_LOCAL,
+    NVE4_PM_QUERY_LD_SHARED,
+    NVE4_PM_QUERY_ST_SHARED,
+    NVE4_PM_QUERY_L1_LOCAL_LOAD_HIT,
+    NVE4_PM_QUERY_L1_LOCAL_LOAD_MISS,
+    NVE4_PM_QUERY_L1_LOCAL_STORE_HIT,
+    NVE4_PM_QUERY_L1_LOCAL_STORE_MISS,
+    NVE4_PM_QUERY_GLD_REQUEST,
+    NVE4_PM_QUERY_GST_REQUEST,
+    NVE4_PM_QUERY_L1_GLOBAL_LOAD_HIT,
+    NVE4_PM_QUERY_L1_GLOBAL_LOAD_MISS,
+    NVE4_PM_QUERY_GLD_TRANSACTIONS_UNCACHED,
+    NVE4_PM_QUERY_GST_TRANSACTIONS,
+    NVE4_PM_QUERY_BRANCH,
+    NVE4_PM_QUERY_BRANCH_DIVERGENT,
+    NVE4_PM_QUERY_ACTIVE_WARPS,
+    NVE4_PM_QUERY_ACTIVE_CYCLES,
+    NVE4_PM_QUERY_INST_ISSUED,
+    NVE4_PM_QUERY_ATOM_COUNT,
+    NVE4_PM_QUERY_GRED_COUNT,
+    NVE4_PM_QUERY_LD_SHARED_REPLAY,
+    NVE4_PM_QUERY_ST_SHARED_REPLAY,
+    NVE4_PM_QUERY_LD_LOCAL_TRANSACTIONS,
+    NVE4_PM_QUERY_ST_LOCAL_TRANSACTIONS,
+    NVE4_PM_QUERY_L1_LD_SHARED_TRANSACTIONS,
+    NVE4_PM_QUERY_L1_ST_SHARED_TRANSACTIONS,
+    NVE4_PM_QUERY_GLD_MEM_DIV_REPLAY,
+    NVE4_PM_QUERY_GST_MEM_DIV_REPLAY,
+    NVE4_PM_QUERY_METRIC_IPC,
+    NVE4_PM_QUERY_METRIC_IPAC,
+    NVE4_PM_QUERY_METRIC_IPEC,
+    NVE4_PM_QUERY_METRIC_MP_OCCUPANCY,
+    NVE4_PM_QUERY_METRIC_MP_EFFICIENCY,
+    NVE4_PM_QUERY_METRIC_INST_REPLAY_OHEAD,
+    NVE4_PM_QUERY_COUNT
+};
 
-#define NVC0_PM_QUERY_COUNT 31
 #define NVC0_PM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i))
 #define NVC0_PM_QUERY_LAST   NVC0_PM_QUERY(NVC0_PM_QUERY_COUNT - 1)
-#define NVC0_PM_QUERY_INST_EXECUTED             0
-#define NVC0_PM_QUERY_BRANCH                    1
-#define NVC0_PM_QUERY_BRANCH_DIVERGENT          2
-#define NVC0_PM_QUERY_ACTIVE_WARPS              3
-#define NVC0_PM_QUERY_ACTIVE_CYCLES             4
-#define NVC0_PM_QUERY_LAUNCHED_WARPS            5
-#define NVC0_PM_QUERY_LAUNCHED_THREADS          6
-#define NVC0_PM_QUERY_LD_SHARED                 7
-#define NVC0_PM_QUERY_ST_SHARED                 8
-#define NVC0_PM_QUERY_LD_LOCAL                  9
-#define NVC0_PM_QUERY_ST_LOCAL                  10
-#define NVC0_PM_QUERY_GRED_COUNT                11
-#define NVC0_PM_QUERY_ATOM_COUNT                12
-#define NVC0_PM_QUERY_GLD_REQUEST               13
-#define NVC0_PM_QUERY_GST_REQUEST               14
-#define NVC0_PM_QUERY_INST_ISSUED1_0            15
-#define NVC0_PM_QUERY_INST_ISSUED1_1            16
-#define NVC0_PM_QUERY_INST_ISSUED2_0            17
-#define NVC0_PM_QUERY_INST_ISSUED2_1            18
-#define NVC0_PM_QUERY_TH_INST_EXECUTED_0        19
-#define NVC0_PM_QUERY_TH_INST_EXECUTED_1        20
-#define NVC0_PM_QUERY_TH_INST_EXECUTED_2        21
-#define NVC0_PM_QUERY_TH_INST_EXECUTED_3        22
-#define NVC0_PM_QUERY_PROF_TRIGGER_0            23
-#define NVC0_PM_QUERY_PROF_TRIGGER_1            24
-#define NVC0_PM_QUERY_PROF_TRIGGER_2            25
-#define NVC0_PM_QUERY_PROF_TRIGGER_3            26
-#define NVC0_PM_QUERY_PROF_TRIGGER_4            27
-#define NVC0_PM_QUERY_PROF_TRIGGER_5            28
-#define NVC0_PM_QUERY_PROF_TRIGGER_6            29
-#define NVC0_PM_QUERY_PROF_TRIGGER_7            30
+enum nvc0_pm_queries
+{
+    NVC0_PM_QUERY_INST_EXECUTED = 0,
+    NVC0_PM_QUERY_BRANCH,
+    NVC0_PM_QUERY_BRANCH_DIVERGENT,
+    NVC0_PM_QUERY_ACTIVE_WARPS,
+    NVC0_PM_QUERY_ACTIVE_CYCLES,
+    NVC0_PM_QUERY_LAUNCHED_WARPS,
+    NVC0_PM_QUERY_LAUNCHED_THREADS,
+    NVC0_PM_QUERY_LD_SHARED,
+    NVC0_PM_QUERY_ST_SHARED,
+    NVC0_PM_QUERY_LD_LOCAL,
+    NVC0_PM_QUERY_ST_LOCAL,
+    NVC0_PM_QUERY_GRED_COUNT,
+    NVC0_PM_QUERY_ATOM_COUNT,
+    NVC0_PM_QUERY_GLD_REQUEST,
+    NVC0_PM_QUERY_GST_REQUEST,
+    NVC0_PM_QUERY_INST_ISSUED1_0,
+    NVC0_PM_QUERY_INST_ISSUED1_1,
+    NVC0_PM_QUERY_INST_ISSUED2_0,
+    NVC0_PM_QUERY_INST_ISSUED2_1,
+    NVC0_PM_QUERY_TH_INST_EXECUTED_0,
+    NVC0_PM_QUERY_TH_INST_EXECUTED_1,
+    NVC0_PM_QUERY_TH_INST_EXECUTED_2,
+    NVC0_PM_QUERY_TH_INST_EXECUTED_3,
+    NVC0_PM_QUERY_PROF_TRIGGER_0,
+    NVC0_PM_QUERY_PROF_TRIGGER_1,
+    NVC0_PM_QUERY_PROF_TRIGGER_2,
+    NVC0_PM_QUERY_PROF_TRIGGER_3,
+    NVC0_PM_QUERY_PROF_TRIGGER_4,
+    NVC0_PM_QUERY_PROF_TRIGGER_5,
+    NVC0_PM_QUERY_PROF_TRIGGER_6,
+    NVC0_PM_QUERY_PROF_TRIGGER_7,
+    NVC0_PM_QUERY_COUNT
+};
 
 /* Driver statistics queries:
  */
-#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
-
 #define NVC0_QUERY_DRV_STAT(i)    (PIPE_QUERY_DRIVER_SPECIFIC + 1024 + (i))
-#define NVC0_QUERY_DRV_STAT_COUNT  29
 #define NVC0_QUERY_DRV_STAT_LAST   NVC0_QUERY_DRV_STAT(NVC0_QUERY_DRV_STAT_COUNT - 1)
-#define NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_COUNT         0
-#define NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_BYTES         1
-#define NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_COUNT         2
-#define NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_VID     3
-#define NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_SYS     4
-#define NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_READ               5
-#define NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_WRITE              6
-#define NVC0_QUERY_DRV_STAT_TEX_COPY_COUNT                   7
-#define NVC0_QUERY_DRV_STAT_TEX_BLIT_COUNT                   8
-#define NVC0_QUERY_DRV_STAT_TEX_CACHE_FLUSH_COUNT            9
-#define NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_READ              10
-#define NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_WRITE             11
-#define NVC0_QUERY_DRV_STAT_BUF_READ_BYTES_STAGING_VID      12
-#define NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_DIRECT          13
-#define NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_VID     14
-#define NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_SYS     15
-#define NVC0_QUERY_DRV_STAT_BUF_COPY_BYTES                  16
-#define NVC0_QUERY_DRV_STAT_BUF_NON_KERNEL_FENCE_SYNC_COUNT 17
-#define NVC0_QUERY_DRV_STAT_ANY_NON_KERNEL_FENCE_SYNC_COUNT 18
-#define NVC0_QUERY_DRV_STAT_QUERY_SYNC_COUNT                19
-#define NVC0_QUERY_DRV_STAT_GPU_SERIALIZE_COUNT             20
-#define NVC0_QUERY_DRV_STAT_DRAW_CALLS_ARRAY                21
-#define NVC0_QUERY_DRV_STAT_DRAW_CALLS_INDEXED              22
-#define NVC0_QUERY_DRV_STAT_DRAW_CALLS_FALLBACK_COUNT       23
-#define NVC0_QUERY_DRV_STAT_USER_BUFFER_UPLOAD_BYTES        24
-#define NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_COUNT           25
-#define NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_BYTES           26
-#define NVC0_QUERY_DRV_STAT_PUSHBUF_COUNT                   27
-#define NVC0_QUERY_DRV_STAT_RESOURCE_VALIDATE_COUNT         28
-
-#else
-
-#define NVC0_QUERY_DRV_STAT_COUNT 0
-
+enum nvc0_drv_stats_queries
+{
+#ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
+    NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_COUNT = 0,
+    NVC0_QUERY_DRV_STAT_TEX_OBJECT_CURRENT_BYTES,
+    NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_COUNT,
+    NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_VID,
+    NVC0_QUERY_DRV_STAT_BUF_OBJECT_CURRENT_BYTES_SYS,
+    NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_READ,
+    NVC0_QUERY_DRV_STAT_TEX_TRANSFERS_WRITE,
+    NVC0_QUERY_DRV_STAT_TEX_COPY_COUNT,
+    NVC0_QUERY_DRV_STAT_TEX_BLIT_COUNT,
+    NVC0_QUERY_DRV_STAT_TEX_CACHE_FLUSH_COUNT,
+    NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_READ,
+    NVC0_QUERY_DRV_STAT_BUF_TRANSFERS_WRITE,
+    NVC0_QUERY_DRV_STAT_BUF_READ_BYTES_STAGING_VID,
+    NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_DIRECT,
+    NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_VID,
+    NVC0_QUERY_DRV_STAT_BUF_WRITE_BYTES_STAGING_SYS,
+    NVC0_QUERY_DRV_STAT_BUF_COPY_BYTES,
+    NVC0_QUERY_DRV_STAT_BUF_NON_KERNEL_FENCE_SYNC_COUNT,
+    NVC0_QUERY_DRV_STAT_ANY_NON_KERNEL_FENCE_SYNC_COUNT,
+    NVC0_QUERY_DRV_STAT_QUERY_SYNC_COUNT,
+    NVC0_QUERY_DRV_STAT_GPU_SERIALIZE_COUNT,
+    NVC0_QUERY_DRV_STAT_DRAW_CALLS_ARRAY,
+    NVC0_QUERY_DRV_STAT_DRAW_CALLS_INDEXED,
+    NVC0_QUERY_DRV_STAT_DRAW_CALLS_FALLBACK_COUNT,
+    NVC0_QUERY_DRV_STAT_USER_BUFFER_UPLOAD_BYTES,
+    NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_COUNT,
+    NVC0_QUERY_DRV_STAT_CONSTBUF_UPLOAD_BYTES,
+    NVC0_QUERY_DRV_STAT_PUSHBUF_COUNT,
+    NVC0_QUERY_DRV_STAT_RESOURCE_VALIDATE_COUNT,
 #endif
+    NVC0_QUERY_DRV_STAT_COUNT
+};
 
 int nvc0_screen_get_driver_query_info(struct pipe_screen *, unsigned,
                                       struct pipe_driver_query_info *);

From ebca85423cf18e987ffebadcde79e95fc2609291 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Fri, 28 Aug 2015 18:41:16 +0200
Subject: [PATCH 60/82] nvc0: make names of performance counter queries
 consistent

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_query.c | 56 +++++++++----------
 .../drivers/nouveau/nvc0/nvc0_screen.h        | 56 +++++++++----------
 2 files changed, 56 insertions(+), 56 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index f7b85a8e931..66720617733 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -891,39 +891,39 @@ static const struct nvc0_mp_pm_query_cfg nve4_mp_pm_queries[] =
    _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
    _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
    _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
-   _Q1A(LAUNCHED_WARPS,    0x0001, B6, LAUNCH, 0x00000004, 1, 1),
-   _Q1A(LAUNCHED_THREADS,  0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
-   _Q1B(LAUNCHED_CTA,      0x0001, B6, WARP, 0x0000001c, 1, 1),
+   _Q1A(WARPS_LAUNCHED,    0x0001, B6, LAUNCH, 0x00000004, 1, 1),
+   _Q1A(THREADS_LAUNCHED,  0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
+   _Q1B(SM_CTA_LAUNCHED,      0x0001, B6, WARP, 0x0000001c, 1, 1),
    _Q1A(INST_ISSUED1,  0x0001, B6, ISSUE, 0x00000004, 1, 1),
    _Q1A(INST_ISSUED2,  0x0001, B6, ISSUE, 0x00000008, 1, 1),
    _Q1A(INST_ISSUED,   0x0003, B6, ISSUE, 0x00000104, 1, 1),
    _Q1A(INST_EXECUTED, 0x0003, B6, EXEC,  0x00000398, 1, 1),
-   _Q1A(LD_SHARED,   0x0001, B6, LDST, 0x00000000, 1, 1),
-   _Q1A(ST_SHARED,   0x0001, B6, LDST, 0x00000004, 1, 1),
-   _Q1A(LD_LOCAL,    0x0001, B6, LDST, 0x00000008, 1, 1),
-   _Q1A(ST_LOCAL,    0x0001, B6, LDST, 0x0000000c, 1, 1),
+   _Q1A(SHARED_LD,   0x0001, B6, LDST, 0x00000000, 1, 1),
+   _Q1A(SHARED_ST,   0x0001, B6, LDST, 0x00000004, 1, 1),
+   _Q1A(LOCAL_LD,    0x0001, B6, LDST, 0x00000008, 1, 1),
+   _Q1A(LOCAL_ST,    0x0001, B6, LDST, 0x0000000c, 1, 1),
    _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
    _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
-   _Q1B(L1_LOCAL_LOAD_HIT,   0x0001, B6, L1, 0x00000000, 1, 1),
-   _Q1B(L1_LOCAL_LOAD_MISS,  0x0001, B6, L1, 0x00000004, 1, 1),
-   _Q1B(L1_LOCAL_STORE_HIT,  0x0001, B6, L1, 0x00000008, 1, 1),
-   _Q1B(L1_LOCAL_STORE_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
-   _Q1B(L1_GLOBAL_LOAD_HIT,  0x0001, B6, L1, 0x00000010, 1, 1),
-   _Q1B(L1_GLOBAL_LOAD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
-   _Q1B(GLD_TRANSACTIONS_UNCACHED, 0x0001, B6, MEM, 0x00000000, 1, 1),
+   _Q1B(L1_LOCAL_LD_HIT,   0x0001, B6, L1, 0x00000000, 1, 1),
+   _Q1B(L1_LOCAL_LD_MISS,  0x0001, B6, L1, 0x00000004, 1, 1),
+   _Q1B(L1_LOCAL_ST_HIT,  0x0001, B6, L1, 0x00000008, 1, 1),
+   _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
+   _Q1B(L1_GLD_HIT,  0x0001, B6, L1, 0x00000010, 1, 1),
+   _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
+   _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
    _Q1B(GST_TRANSACTIONS,          0x0001, B6, MEM, 0x00000004, 1, 1),
    _Q1A(BRANCH,           0x0001, B6, BRANCH, 0x0000000c, 1, 1),
-   _Q1A(BRANCH_DIVERGENT, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
+   _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
    _Q1B(ACTIVE_WARPS,  0x003f, B6, WARP, 0x31483104, 2, 1),
    _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
    _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
    _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
-   _Q1B(LD_SHARED_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
-   _Q1B(ST_SHARED_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
-   _Q1B(LD_LOCAL_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
-   _Q1B(ST_LOCAL_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
-   _Q1B(L1_LD_SHARED_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
-   _Q1B(L1_ST_SHARED_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
+   _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
+   _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
+   _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
+   _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
+   _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
+   _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
    _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
    _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
    _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
@@ -1032,15 +1032,15 @@ static const struct nvc0_mp_pm_query_cfg nvc0_mp_pm_queries[] =
 {
    _Q(INST_EXECUTED,       0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00),
    _Q(BRANCH,              0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00),
-   _Q(BRANCH_DIVERGENT,    0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00),
+   _Q(DIVERGENT_BRANCH,    0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00),
    _Q(ACTIVE_WARPS,        0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
    _Q(ACTIVE_CYCLES,       0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(LAUNCHED_WARPS,      0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(LAUNCHED_THREADS,    0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
-   _Q(LD_SHARED,           0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(ST_SHARED,           0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(LD_LOCAL,            0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(ST_LOCAL,            0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
+   _Q(WARPS_LAUNCHED,      0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
+   _Q(THREADS_LAUNCHED,    0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
+   _Q(SHARED_LD,           0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
+   _Q(SHARED_ST,           0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
+   _Q(LOCAL_LD,            0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
+   _Q(LOCAL_ST,            0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(GRED_COUNT,          0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(ATOM_COUNT,          0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(GLD_REQUEST,         0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index cc671ef363a..d689863df5c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -132,39 +132,39 @@ enum nve4_pm_queries
     NVE4_PM_QUERY_PROF_TRIGGER_5,
     NVE4_PM_QUERY_PROF_TRIGGER_6,
     NVE4_PM_QUERY_PROF_TRIGGER_7,
-    NVE4_PM_QUERY_LAUNCHED_WARPS,
-    NVE4_PM_QUERY_LAUNCHED_THREADS,
-    NVE4_PM_QUERY_LAUNCHED_CTA,
+    NVE4_PM_QUERY_WARPS_LAUNCHED,
+    NVE4_PM_QUERY_THREADS_LAUNCHED,
+    NVE4_PM_QUERY_SM_CTA_LAUNCHED,
     NVE4_PM_QUERY_INST_ISSUED1,
     NVE4_PM_QUERY_INST_ISSUED2,
     NVE4_PM_QUERY_INST_EXECUTED,
-    NVE4_PM_QUERY_LD_LOCAL,
-    NVE4_PM_QUERY_ST_LOCAL,
-    NVE4_PM_QUERY_LD_SHARED,
-    NVE4_PM_QUERY_ST_SHARED,
-    NVE4_PM_QUERY_L1_LOCAL_LOAD_HIT,
-    NVE4_PM_QUERY_L1_LOCAL_LOAD_MISS,
-    NVE4_PM_QUERY_L1_LOCAL_STORE_HIT,
-    NVE4_PM_QUERY_L1_LOCAL_STORE_MISS,
+    NVE4_PM_QUERY_LOCAL_LD,
+    NVE4_PM_QUERY_LOCAL_ST,
+    NVE4_PM_QUERY_SHARED_LD,
+    NVE4_PM_QUERY_SHARED_ST,
+    NVE4_PM_QUERY_L1_LOCAL_LD_HIT,
+    NVE4_PM_QUERY_L1_LOCAL_LD_MISS,
+    NVE4_PM_QUERY_L1_LOCAL_ST_HIT,
+    NVE4_PM_QUERY_L1_LOCAL_ST_MISS,
     NVE4_PM_QUERY_GLD_REQUEST,
     NVE4_PM_QUERY_GST_REQUEST,
-    NVE4_PM_QUERY_L1_GLOBAL_LOAD_HIT,
-    NVE4_PM_QUERY_L1_GLOBAL_LOAD_MISS,
-    NVE4_PM_QUERY_GLD_TRANSACTIONS_UNCACHED,
+    NVE4_PM_QUERY_L1_GLD_HIT,
+    NVE4_PM_QUERY_L1_GLD_MISS,
+    NVE4_PM_QUERY_UNCACHED_GLD_TRANSACTIONS,
     NVE4_PM_QUERY_GST_TRANSACTIONS,
     NVE4_PM_QUERY_BRANCH,
-    NVE4_PM_QUERY_BRANCH_DIVERGENT,
+    NVE4_PM_QUERY_DIVERGENT_BRANCH,
     NVE4_PM_QUERY_ACTIVE_WARPS,
     NVE4_PM_QUERY_ACTIVE_CYCLES,
     NVE4_PM_QUERY_INST_ISSUED,
     NVE4_PM_QUERY_ATOM_COUNT,
     NVE4_PM_QUERY_GRED_COUNT,
-    NVE4_PM_QUERY_LD_SHARED_REPLAY,
-    NVE4_PM_QUERY_ST_SHARED_REPLAY,
-    NVE4_PM_QUERY_LD_LOCAL_TRANSACTIONS,
-    NVE4_PM_QUERY_ST_LOCAL_TRANSACTIONS,
-    NVE4_PM_QUERY_L1_LD_SHARED_TRANSACTIONS,
-    NVE4_PM_QUERY_L1_ST_SHARED_TRANSACTIONS,
+    NVE4_PM_QUERY_SHARED_LD_REPLAY,
+    NVE4_PM_QUERY_SHARED_ST_REPLAY,
+    NVE4_PM_QUERY_LOCAL_LD_TRANSACTIONS,
+    NVE4_PM_QUERY_LOCAL_ST_TRANSACTIONS,
+    NVE4_PM_QUERY_L1_SHARED_LD_TRANSACTIONS,
+    NVE4_PM_QUERY_L1_SHARED_ST_TRANSACTIONS,
     NVE4_PM_QUERY_GLD_MEM_DIV_REPLAY,
     NVE4_PM_QUERY_GST_MEM_DIV_REPLAY,
     NVE4_PM_QUERY_METRIC_IPC,
@@ -182,15 +182,15 @@ enum nvc0_pm_queries
 {
     NVC0_PM_QUERY_INST_EXECUTED = 0,
     NVC0_PM_QUERY_BRANCH,
-    NVC0_PM_QUERY_BRANCH_DIVERGENT,
+    NVC0_PM_QUERY_DIVERGENT_BRANCH,
     NVC0_PM_QUERY_ACTIVE_WARPS,
     NVC0_PM_QUERY_ACTIVE_CYCLES,
-    NVC0_PM_QUERY_LAUNCHED_WARPS,
-    NVC0_PM_QUERY_LAUNCHED_THREADS,
-    NVC0_PM_QUERY_LD_SHARED,
-    NVC0_PM_QUERY_ST_SHARED,
-    NVC0_PM_QUERY_LD_LOCAL,
-    NVC0_PM_QUERY_ST_LOCAL,
+    NVC0_PM_QUERY_WARPS_LAUNCHED,
+    NVC0_PM_QUERY_THREADS_LAUNCHED,
+    NVC0_PM_QUERY_SHARED_LD,
+    NVC0_PM_QUERY_SHARED_ST,
+    NVC0_PM_QUERY_LOCAL_LD,
+    NVC0_PM_QUERY_LOCAL_ST,
     NVC0_PM_QUERY_GRED_COUNT,
     NVC0_PM_QUERY_ATOM_COUNT,
     NVC0_PM_QUERY_GLD_REQUEST,

From 21bdb4d8f381e4f33b7028a049162c71c2daff73 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Fri, 28 Aug 2015 19:09:33 +0200
Subject: [PATCH 61/82] nvc0: sort performance counter queries by name

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_query.c | 188 +++++++++---------
 .../drivers/nouveau/nvc0/nvc0_screen.h        |  96 ++++-----
 2 files changed, 142 insertions(+), 142 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index 66720617733..a2a4a5cb3c2 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -776,6 +776,33 @@ static const uint64_t nve4_read_mp_pm_counters_code[] =
 static const char *nve4_pm_query_names[] =
 {
    /* MP counters */
+   "active_cycles",
+   "active_warps",
+   "atom_count",
+   "branch",
+   "divergent_branch",
+   "gld_request",
+   "global_ld_mem_divergence_replays",
+   "global_store_transaction",
+   "global_st_mem_divergence_replays",
+   "gred_count",
+   "gst_request",
+   "inst_executed",
+   "inst_issued",
+   "inst_issued1",
+   "inst_issued2",
+   "l1_global_load_hit",
+   "l1_global_load_miss",
+   "l1_local_load_hit",
+   "l1_local_load_miss",
+   "l1_local_store_hit",
+   "l1_local_store_miss",
+   "l1_shared_load_transactions",
+   "l1_shared_store_transactions",
+   "local_load",
+   "local_load_transactions",
+   "local_store",
+   "local_store_transactions",
    "prof_trigger_00",
    "prof_trigger_01",
    "prof_trigger_02",
@@ -784,41 +811,14 @@ static const char *nve4_pm_query_names[] =
    "prof_trigger_05",
    "prof_trigger_06",
    "prof_trigger_07",
-   "warps_launched",
-   "threads_launched",
-   "sm_cta_launched",
-   "inst_issued1",
-   "inst_issued2",
-   "inst_executed",
-   "local_load",
-   "local_store",
    "shared_load",
-   "shared_store",
-   "l1_local_load_hit",
-   "l1_local_load_miss",
-   "l1_local_store_hit",
-   "l1_local_store_miss",
-   "gld_request",
-   "gst_request",
-   "l1_global_load_hit",
-   "l1_global_load_miss",
-   "uncached_global_load_transaction",
-   "global_store_transaction",
-   "branch",
-   "divergent_branch",
-   "active_warps",
-   "active_cycles",
-   "inst_issued",
-   "atom_count",
-   "gred_count",
    "shared_load_replay",
+   "shared_store",
    "shared_store_replay",
-   "local_load_transactions",
-   "local_store_transactions",
-   "l1_shared_load_transactions",
-   "l1_shared_store_transactions",
-   "global_ld_mem_divergence_replays",
-   "global_st_mem_divergence_replays",
+   "sm_cta_launched",
+   "threads_launched",
+   "uncached_global_load_transaction",
+   "warps_launched",
    /* metrics, i.e. functions of the MP counters */
    "metric-ipc",                   /* inst_executed, clock */
    "metric-ipac",                  /* inst_executed, active_cycles */
@@ -883,6 +883,33 @@ struct nvc0_mp_pm_query_cfg
  */
 static const struct nvc0_mp_pm_query_cfg nve4_mp_pm_queries[] =
 {
+   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
+   _Q1B(ACTIVE_WARPS,  0x003f, B6, WARP, 0x31483104, 2, 1),
+   _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
+   _Q1A(BRANCH,           0x0001, B6, BRANCH, 0x0000000c, 1, 1),
+   _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
+   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
+   _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
+   _Q1B(GST_TRANSACTIONS,          0x0001, B6, MEM, 0x00000004, 1, 1),
+   _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
+   _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
+   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
+   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC,  0x00000398, 1, 1),
+   _Q1A(INST_ISSUED,   0x0003, B6, ISSUE, 0x00000104, 1, 1),
+   _Q1A(INST_ISSUED1,  0x0001, B6, ISSUE, 0x00000004, 1, 1),
+   _Q1A(INST_ISSUED2,  0x0001, B6, ISSUE, 0x00000008, 1, 1),
+   _Q1B(L1_GLD_HIT,  0x0001, B6, L1, 0x00000010, 1, 1),
+   _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
+   _Q1B(L1_LOCAL_LD_HIT,   0x0001, B6, L1, 0x00000000, 1, 1),
+   _Q1B(L1_LOCAL_LD_MISS,  0x0001, B6, L1, 0x00000004, 1, 1),
+   _Q1B(L1_LOCAL_ST_HIT,  0x0001, B6, L1, 0x00000008, 1, 1),
+   _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
+   _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
+   _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
+   _Q1A(LOCAL_LD,    0x0001, B6, LDST, 0x00000008, 1, 1),
+   _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
+   _Q1A(LOCAL_ST,    0x0001, B6, LDST, 0x0000000c, 1, 1),
+   _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
    _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1),
    _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1),
    _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1),
@@ -891,41 +918,14 @@ static const struct nvc0_mp_pm_query_cfg nve4_mp_pm_queries[] =
    _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1),
    _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1),
    _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1),
-   _Q1A(WARPS_LAUNCHED,    0x0001, B6, LAUNCH, 0x00000004, 1, 1),
-   _Q1A(THREADS_LAUNCHED,  0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
-   _Q1B(SM_CTA_LAUNCHED,      0x0001, B6, WARP, 0x0000001c, 1, 1),
-   _Q1A(INST_ISSUED1,  0x0001, B6, ISSUE, 0x00000004, 1, 1),
-   _Q1A(INST_ISSUED2,  0x0001, B6, ISSUE, 0x00000008, 1, 1),
-   _Q1A(INST_ISSUED,   0x0003, B6, ISSUE, 0x00000104, 1, 1),
-   _Q1A(INST_EXECUTED, 0x0003, B6, EXEC,  0x00000398, 1, 1),
    _Q1A(SHARED_LD,   0x0001, B6, LDST, 0x00000000, 1, 1),
-   _Q1A(SHARED_ST,   0x0001, B6, LDST, 0x00000004, 1, 1),
-   _Q1A(LOCAL_LD,    0x0001, B6, LDST, 0x00000008, 1, 1),
-   _Q1A(LOCAL_ST,    0x0001, B6, LDST, 0x0000000c, 1, 1),
-   _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1),
-   _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1),
-   _Q1B(L1_LOCAL_LD_HIT,   0x0001, B6, L1, 0x00000000, 1, 1),
-   _Q1B(L1_LOCAL_LD_MISS,  0x0001, B6, L1, 0x00000004, 1, 1),
-   _Q1B(L1_LOCAL_ST_HIT,  0x0001, B6, L1, 0x00000008, 1, 1),
-   _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1),
-   _Q1B(L1_GLD_HIT,  0x0001, B6, L1, 0x00000010, 1, 1),
-   _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1),
-   _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
-   _Q1B(GST_TRANSACTIONS,          0x0001, B6, MEM, 0x00000004, 1, 1),
-   _Q1A(BRANCH,           0x0001, B6, BRANCH, 0x0000000c, 1, 1),
-   _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1),
-   _Q1B(ACTIVE_WARPS,  0x003f, B6, WARP, 0x31483104, 2, 1),
-   _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
-   _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1),
-   _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1),
    _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1),
+   _Q1A(SHARED_ST,   0x0001, B6, LDST, 0x00000004, 1, 1),
    _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1),
-   _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1),
-   _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1),
-   _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1),
-   _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1),
-   _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1),
-   _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1),
+   _Q1B(SM_CTA_LAUNCHED,      0x0001, B6, WARP, 0x0000001c, 1, 1),
+   _Q1A(THREADS_LAUNCHED,  0x003f, B6, LAUNCH, 0x398a4188, 1, 1),
+   _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1),
+   _Q1A(WARPS_LAUNCHED,    0x0001, B6, LAUNCH, 0x00000004, 1, 1),
    _M2AB(IPC, 0x3, B6, EXEC, 0x398, 0xffff, LOGOP, WARP, 0x0, DIV_SUM_M0, 10, 1),
    _M2AB(IPAC, 0x3, B6, EXEC, 0x398, 0x1, B6, WARP, 0x0, AVG_DIV_MM, 10, 1),
    _M2A(IPEC, 0x3, B6, EXEC, 0x398, 0xe, LOGOP, EXEC, 0x398, AVG_DIV_MM, 10, 1),
@@ -993,29 +993,21 @@ static const uint64_t nvc0_read_mp_pm_counters_code[] =
 static const char *nvc0_pm_query_names[] =
 {
    /* MP counters */
-   "inst_executed",
+   "active_cycles",
+   "active_warps",
+   "atom_count",
    "branch",
    "divergent_branch",
-   "active_warps",
-   "active_cycles",
-   "warps_launched",
-   "threads_launched",
-   "shared_load",
-   "shared_store",
-   "local_load",
-   "local_store",
-   "gred_count",
-   "atom_count",
    "gld_request",
+   "gred_count",
    "gst_request",
+   "inst_executed",
    "inst_issued1_0",
    "inst_issued1_1",
    "inst_issued2_0",
    "inst_issued2_1",
-   "thread_inst_executed_0",
-   "thread_inst_executed_1",
-   "thread_inst_executed_2",
-   "thread_inst_executed_3",
+   "local_load",
+   "local_store",
    "prof_trigger_00",
    "prof_trigger_01",
    "prof_trigger_02",
@@ -1024,35 +1016,35 @@ static const char *nvc0_pm_query_names[] =
    "prof_trigger_05",
    "prof_trigger_06",
    "prof_trigger_07",
+   "shared_load",
+   "shared_store",
+   "threads_launched",
+   "thread_inst_executed_0",
+   "thread_inst_executed_1",
+   "thread_inst_executed_2",
+   "thread_inst_executed_3",
+   "warps_launched",
 };
 
 #define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_PM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } }
 
 static const struct nvc0_mp_pm_query_cfg nvc0_mp_pm_queries[] =
 {
-   _Q(INST_EXECUTED,       0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00),
+   _Q(ACTIVE_CYCLES,       0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
+   _Q(ACTIVE_WARPS,        0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
+   _Q(ATOM_COUNT,          0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(BRANCH,              0xaaaa, LOGOP, 0x1a, 2, 0x00, 0x11, 0x00, 0x00, 0x00, 0x00),
    _Q(DIVERGENT_BRANCH,    0xaaaa, LOGOP, 0x19, 2, 0x20, 0x31, 0x00, 0x00, 0x00, 0x00),
-   _Q(ACTIVE_WARPS,        0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
-   _Q(ACTIVE_CYCLES,       0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(WARPS_LAUNCHED,      0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(THREADS_LAUNCHED,    0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
-   _Q(SHARED_LD,           0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(SHARED_ST,           0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(LOCAL_LD,            0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(LOCAL_ST,            0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(GRED_COUNT,          0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(ATOM_COUNT,          0xaaaa, LOGOP, 0x63, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(GLD_REQUEST,         0xaaaa, LOGOP, 0x64, 1, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00),
+   _Q(GRED_COUNT,          0xaaaa, LOGOP, 0x63, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(GST_REQUEST,         0xaaaa, LOGOP, 0x64, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
+   _Q(INST_EXECUTED,       0xaaaa, LOGOP, 0x2d, 3, 0x00, 0x11, 0x22, 0x00, 0x00, 0x00),
    _Q(INST_ISSUED1_0,      0xaaaa, LOGOP, 0x7e, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(INST_ISSUED1_1,      0xaaaa, LOGOP, 0x7e, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(INST_ISSUED2_0,      0xaaaa, LOGOP, 0x7e, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(INST_ISSUED2_1,      0xaaaa, LOGOP, 0x7e, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
-   _Q(TH_INST_EXECUTED_0,  0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
-   _Q(TH_INST_EXECUTED_1,  0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
-   _Q(TH_INST_EXECUTED_2,  0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
-   _Q(TH_INST_EXECUTED_3,  0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
+   _Q(LOCAL_LD,            0xaaaa, LOGOP, 0x64, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
+   _Q(LOCAL_ST,            0xaaaa, LOGOP, 0x64, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(PROF_TRIGGER_0,      0xaaaa, LOGOP, 0x01, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(PROF_TRIGGER_1,      0xaaaa, LOGOP, 0x01, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(PROF_TRIGGER_2,      0xaaaa, LOGOP, 0x01, 1, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00),
@@ -1061,6 +1053,14 @@ static const struct nvc0_mp_pm_query_cfg nvc0_mp_pm_queries[] =
    _Q(PROF_TRIGGER_5,      0xaaaa, LOGOP, 0x01, 1, 0x50, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(PROF_TRIGGER_6,      0xaaaa, LOGOP, 0x01, 1, 0x60, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(PROF_TRIGGER_7,      0xaaaa, LOGOP, 0x01, 1, 0x70, 0x00, 0x00, 0x00, 0x00, 0x00),
+   _Q(SHARED_LD,           0xaaaa, LOGOP, 0x64, 1, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00),
+   _Q(SHARED_ST,           0xaaaa, LOGOP, 0x64, 1, 0x40, 0x00, 0x00, 0x00, 0x00, 0x00),
+   _Q(THREADS_LAUNCHED,    0xaaaa, LOGOP, 0x26, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
+   _Q(TH_INST_EXECUTED_0,  0xaaaa, LOGOP, 0xa3, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
+   _Q(TH_INST_EXECUTED_1,  0xaaaa, LOGOP, 0xa5, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
+   _Q(TH_INST_EXECUTED_2,  0xaaaa, LOGOP, 0xa4, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
+   _Q(TH_INST_EXECUTED_3,  0xaaaa, LOGOP, 0xa6, 6, 0x00, 0x11, 0x22, 0x33, 0x44, 0x55),
+   _Q(WARPS_LAUNCHED,      0xaaaa, LOGOP, 0x26, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
 };
 
 #undef _Q
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index d689863df5c..531314f4200 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -124,7 +124,34 @@ nvc0_screen(struct pipe_screen *screen)
 #define NVE4_PM_QUERY_LAST   NVE4_PM_QUERY(NVE4_PM_QUERY_COUNT - 1)
 enum nve4_pm_queries
 {
-    NVE4_PM_QUERY_PROF_TRIGGER_0 = 0,
+    NVE4_PM_QUERY_ACTIVE_CYCLES = 0,
+    NVE4_PM_QUERY_ACTIVE_WARPS,
+    NVE4_PM_QUERY_ATOM_COUNT,
+    NVE4_PM_QUERY_BRANCH,
+    NVE4_PM_QUERY_DIVERGENT_BRANCH,
+    NVE4_PM_QUERY_GLD_REQUEST,
+    NVE4_PM_QUERY_GLD_MEM_DIV_REPLAY,
+    NVE4_PM_QUERY_GST_TRANSACTIONS,
+    NVE4_PM_QUERY_GST_MEM_DIV_REPLAY,
+    NVE4_PM_QUERY_GRED_COUNT,
+    NVE4_PM_QUERY_GST_REQUEST,
+    NVE4_PM_QUERY_INST_EXECUTED,
+    NVE4_PM_QUERY_INST_ISSUED,
+    NVE4_PM_QUERY_INST_ISSUED1,
+    NVE4_PM_QUERY_INST_ISSUED2,
+    NVE4_PM_QUERY_L1_GLD_HIT,
+    NVE4_PM_QUERY_L1_GLD_MISS,
+    NVE4_PM_QUERY_L1_LOCAL_LD_HIT,
+    NVE4_PM_QUERY_L1_LOCAL_LD_MISS,
+    NVE4_PM_QUERY_L1_LOCAL_ST_HIT,
+    NVE4_PM_QUERY_L1_LOCAL_ST_MISS,
+    NVE4_PM_QUERY_L1_SHARED_LD_TRANSACTIONS,
+    NVE4_PM_QUERY_L1_SHARED_ST_TRANSACTIONS,
+    NVE4_PM_QUERY_LOCAL_LD,
+    NVE4_PM_QUERY_LOCAL_LD_TRANSACTIONS,
+    NVE4_PM_QUERY_LOCAL_ST,
+    NVE4_PM_QUERY_LOCAL_ST_TRANSACTIONS,
+    NVE4_PM_QUERY_PROF_TRIGGER_0,
     NVE4_PM_QUERY_PROF_TRIGGER_1,
     NVE4_PM_QUERY_PROF_TRIGGER_2,
     NVE4_PM_QUERY_PROF_TRIGGER_3,
@@ -132,41 +159,14 @@ enum nve4_pm_queries
     NVE4_PM_QUERY_PROF_TRIGGER_5,
     NVE4_PM_QUERY_PROF_TRIGGER_6,
     NVE4_PM_QUERY_PROF_TRIGGER_7,
-    NVE4_PM_QUERY_WARPS_LAUNCHED,
-    NVE4_PM_QUERY_THREADS_LAUNCHED,
-    NVE4_PM_QUERY_SM_CTA_LAUNCHED,
-    NVE4_PM_QUERY_INST_ISSUED1,
-    NVE4_PM_QUERY_INST_ISSUED2,
-    NVE4_PM_QUERY_INST_EXECUTED,
-    NVE4_PM_QUERY_LOCAL_LD,
-    NVE4_PM_QUERY_LOCAL_ST,
     NVE4_PM_QUERY_SHARED_LD,
-    NVE4_PM_QUERY_SHARED_ST,
-    NVE4_PM_QUERY_L1_LOCAL_LD_HIT,
-    NVE4_PM_QUERY_L1_LOCAL_LD_MISS,
-    NVE4_PM_QUERY_L1_LOCAL_ST_HIT,
-    NVE4_PM_QUERY_L1_LOCAL_ST_MISS,
-    NVE4_PM_QUERY_GLD_REQUEST,
-    NVE4_PM_QUERY_GST_REQUEST,
-    NVE4_PM_QUERY_L1_GLD_HIT,
-    NVE4_PM_QUERY_L1_GLD_MISS,
-    NVE4_PM_QUERY_UNCACHED_GLD_TRANSACTIONS,
-    NVE4_PM_QUERY_GST_TRANSACTIONS,
-    NVE4_PM_QUERY_BRANCH,
-    NVE4_PM_QUERY_DIVERGENT_BRANCH,
-    NVE4_PM_QUERY_ACTIVE_WARPS,
-    NVE4_PM_QUERY_ACTIVE_CYCLES,
-    NVE4_PM_QUERY_INST_ISSUED,
-    NVE4_PM_QUERY_ATOM_COUNT,
-    NVE4_PM_QUERY_GRED_COUNT,
     NVE4_PM_QUERY_SHARED_LD_REPLAY,
+    NVE4_PM_QUERY_SHARED_ST,
     NVE4_PM_QUERY_SHARED_ST_REPLAY,
-    NVE4_PM_QUERY_LOCAL_LD_TRANSACTIONS,
-    NVE4_PM_QUERY_LOCAL_ST_TRANSACTIONS,
-    NVE4_PM_QUERY_L1_SHARED_LD_TRANSACTIONS,
-    NVE4_PM_QUERY_L1_SHARED_ST_TRANSACTIONS,
-    NVE4_PM_QUERY_GLD_MEM_DIV_REPLAY,
-    NVE4_PM_QUERY_GST_MEM_DIV_REPLAY,
+    NVE4_PM_QUERY_SM_CTA_LAUNCHED,
+    NVE4_PM_QUERY_THREADS_LAUNCHED,
+    NVE4_PM_QUERY_UNCACHED_GLD_TRANSACTIONS,
+    NVE4_PM_QUERY_WARPS_LAUNCHED,
     NVE4_PM_QUERY_METRIC_IPC,
     NVE4_PM_QUERY_METRIC_IPAC,
     NVE4_PM_QUERY_METRIC_IPEC,
@@ -180,29 +180,21 @@ enum nve4_pm_queries
 #define NVC0_PM_QUERY_LAST   NVC0_PM_QUERY(NVC0_PM_QUERY_COUNT - 1)
 enum nvc0_pm_queries
 {
-    NVC0_PM_QUERY_INST_EXECUTED = 0,
+    NVC0_PM_QUERY_ACTIVE_CYCLES = 0,
+    NVC0_PM_QUERY_ACTIVE_WARPS,
+    NVC0_PM_QUERY_ATOM_COUNT,
     NVC0_PM_QUERY_BRANCH,
     NVC0_PM_QUERY_DIVERGENT_BRANCH,
-    NVC0_PM_QUERY_ACTIVE_WARPS,
-    NVC0_PM_QUERY_ACTIVE_CYCLES,
-    NVC0_PM_QUERY_WARPS_LAUNCHED,
-    NVC0_PM_QUERY_THREADS_LAUNCHED,
-    NVC0_PM_QUERY_SHARED_LD,
-    NVC0_PM_QUERY_SHARED_ST,
-    NVC0_PM_QUERY_LOCAL_LD,
-    NVC0_PM_QUERY_LOCAL_ST,
-    NVC0_PM_QUERY_GRED_COUNT,
-    NVC0_PM_QUERY_ATOM_COUNT,
     NVC0_PM_QUERY_GLD_REQUEST,
+    NVC0_PM_QUERY_GRED_COUNT,
     NVC0_PM_QUERY_GST_REQUEST,
+    NVC0_PM_QUERY_INST_EXECUTED,
     NVC0_PM_QUERY_INST_ISSUED1_0,
     NVC0_PM_QUERY_INST_ISSUED1_1,
     NVC0_PM_QUERY_INST_ISSUED2_0,
     NVC0_PM_QUERY_INST_ISSUED2_1,
-    NVC0_PM_QUERY_TH_INST_EXECUTED_0,
-    NVC0_PM_QUERY_TH_INST_EXECUTED_1,
-    NVC0_PM_QUERY_TH_INST_EXECUTED_2,
-    NVC0_PM_QUERY_TH_INST_EXECUTED_3,
+    NVC0_PM_QUERY_LOCAL_LD,
+    NVC0_PM_QUERY_LOCAL_ST,
     NVC0_PM_QUERY_PROF_TRIGGER_0,
     NVC0_PM_QUERY_PROF_TRIGGER_1,
     NVC0_PM_QUERY_PROF_TRIGGER_2,
@@ -211,6 +203,14 @@ enum nvc0_pm_queries
     NVC0_PM_QUERY_PROF_TRIGGER_5,
     NVC0_PM_QUERY_PROF_TRIGGER_6,
     NVC0_PM_QUERY_PROF_TRIGGER_7,
+    NVC0_PM_QUERY_SHARED_LD,
+    NVC0_PM_QUERY_SHARED_ST,
+    NVC0_PM_QUERY_THREADS_LAUNCHED,
+    NVC0_PM_QUERY_TH_INST_EXECUTED_0,
+    NVC0_PM_QUERY_TH_INST_EXECUTED_1,
+    NVC0_PM_QUERY_TH_INST_EXECUTED_2,
+    NVC0_PM_QUERY_TH_INST_EXECUTED_3,
+    NVC0_PM_QUERY_WARPS_LAUNCHED,
     NVC0_PM_QUERY_COUNT
 };
 

From c8a61ea4fbcb09215a95dc569dba335b766e5d4d Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Sat, 29 Aug 2015 10:58:49 +0200
Subject: [PATCH 62/82] nvc0: change prefix of MP performance counters to HW_SM

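According to NVIDIA, local performance counters (MP) are prefixed
with SM, while global performance counters (PCOUNTER) are called PM.
Rename the driver's MP counter query identifiers (*_PM_QUERY_*,
*_mp_pm_*) to *_HW_SM_QUERY_* and *_hw_sm_* accordingly.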

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_query.c | 124 ++++++-------
 .../drivers/nouveau/nvc0/nvc0_screen.h        | 174 +++++++++---------
 2 files changed, 149 insertions(+), 149 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index a2a4a5cb3c2..b13df6a9485 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -56,10 +56,10 @@ struct nvc0_query {
 
 #define NVC0_QUERY_ALLOC_SPACE 256
 
-static boolean nvc0_mp_pm_query_begin(struct nvc0_context *,
+static boolean nvc0_hw_sm_query_begin(struct nvc0_context *,
                                       struct nvc0_query *);
-static void nvc0_mp_pm_query_end(struct nvc0_context *, struct nvc0_query *);
-static boolean nvc0_mp_pm_query_result(struct nvc0_context *,
+static void nvc0_hw_sm_query_end(struct nvc0_context *, struct nvc0_query *);
+static boolean nvc0_hw_sm_query_result(struct nvc0_context *,
                                        struct nvc0_query *, void *, boolean);
 
 static inline struct nvc0_query *
@@ -159,7 +159,7 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
       } else
 #endif
       if (nvc0->screen->base.device->drm_version >= 0x01000101) {
-         if (type >= NVE4_PM_QUERY(0) && type <= NVE4_PM_QUERY_LAST) {
+         if (type >= NVE4_HW_SM_QUERY(0) && type <= NVE4_HW_SM_QUERY_LAST) {
             /* for each MP:
              * [00] = WS0.C0
              * [04] = WS0.C1
@@ -189,7 +189,7 @@ nvc0_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
             space = (4 * 4 + 4 + 4) * nvc0->screen->mp_count * sizeof(uint32_t);
             break;
          } else
-         if (type >= NVC0_PM_QUERY(0) && type <= NVC0_PM_QUERY_LAST) {
+         if (type >= NVC0_HW_SM_QUERY(0) && type <= NVC0_HW_SM_QUERY_LAST) {
             /* for each MP:
              * [00] = MP.C0
              * [04] = MP.C1
@@ -327,9 +327,9 @@ nvc0_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
             q->u.value = 0;
       } else
 #endif
-      if ((q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) ||
-          (q->type >= NVC0_PM_QUERY(0) && q->type <= NVC0_PM_QUERY_LAST)) {
-         ret = nvc0_mp_pm_query_begin(nvc0, q);
+      if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) ||
+          (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) {
+         ret = nvc0_hw_sm_query_begin(nvc0, q);
       }
       break;
    }
@@ -412,9 +412,9 @@ nvc0_query_end(struct pipe_context *pipe, struct pipe_query *pq)
          return;
       } else
 #endif
-      if ((q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) ||
-          (q->type >= NVC0_PM_QUERY(0) && q->type <= NVC0_PM_QUERY_LAST)) {
-         nvc0_mp_pm_query_end(nvc0, q);
+      if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) ||
+          (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) {
+         nvc0_hw_sm_query_end(nvc0, q);
       }
       break;
    }
@@ -453,9 +453,9 @@ nvc0_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       return true;
    } else
 #endif
-   if ((q->type >= NVE4_PM_QUERY(0) && q->type <= NVE4_PM_QUERY_LAST) ||
-       (q->type >= NVC0_PM_QUERY(0) && q->type <= NVC0_PM_QUERY_LAST)) {
-      return nvc0_mp_pm_query_result(nvc0, q, result, wait);
+   if ((q->type >= NVE4_HW_SM_QUERY(0) && q->type <= NVE4_HW_SM_QUERY_LAST) ||
+       (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST)) {
+      return nvc0_hw_sm_query_result(nvc0, q, result, wait);
    }
 
    if (q->state != NVC0_QUERY_STATE_READY)
@@ -692,7 +692,7 @@ static const char *nvc0_drv_stat_names[] =
  * We could add a kernel interface for it, but reading the counters like this
  * has the advantage of being async (if get_result isn't called immediately).
  */
-static const uint64_t nve4_read_mp_pm_counters_code[] =
+static const uint64_t nve4_read_hw_sm_counters_code[] =
 {
    /* sched 0x20 0x20 0x20 0x20 0x20 0x20 0x20
     * mov b32 $r8 $tidx
@@ -852,7 +852,7 @@ struct nvc0_mp_counter_cfg
 #define NVC0_COUNTER_OP2_AVG_DIV_MM     5 /* avg(ctr0 / ctr1) */
 #define NVC0_COUNTER_OP2_AVG_DIV_M0     6 /* avg(ctr0) / ctr1 of MP[0]) */
 
-struct nvc0_mp_pm_query_cfg
+struct nvc0_hw_sm_query_cfg
 {
    struct nvc0_mp_counter_cfg ctr[4];
    uint8_t num_counters;
@@ -860,17 +860,17 @@ struct nvc0_mp_pm_query_cfg
    uint8_t norm[2]; /* normalization num,denom */
 };
 
-#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
-#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_PM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
-#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
+#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
+#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, s }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { nu, dn } }
+#define _M2A(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
    { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
    { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g1, s1 }, \
    {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
-#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
+#define _M2B(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
    { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g0, s0 }, \
    { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
    {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
-#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_PM_QUERY_METRIC_##n] = { { \
+#define _M2AB(n, f0, m0, g0, s0, f1, m1, g1, s1, o, nu, dn) [NVE4_HW_SM_QUERY_METRIC_##n] = { { \
    { f0, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m0, 0, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g0, s0 }, \
    { f1, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m1, 0, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g1, s1 }, \
    {}, {}, }, 2, NVC0_COUNTER_OP2_##o, { nu, dn } }
@@ -881,7 +881,7 @@ struct nvc0_mp_pm_query_cfg
  * metric-ipXc: we simply multiply by 4 to account for the 4 warp schedulers;
  *  this is inaccurate !
  */
-static const struct nvc0_mp_pm_query_cfg nve4_mp_pm_queries[] =
+static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] =
 {
    _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1),
    _Q1B(ACTIVE_WARPS,  0x003f, B6, WARP, 0x31483104, 2, 1),
@@ -940,7 +940,7 @@ static const struct nvc0_mp_pm_query_cfg nve4_mp_pm_queries[] =
 #undef _M2B
 
 /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */
-static const uint64_t nvc0_read_mp_pm_counters_code[] =
+static const uint64_t nvc0_read_hw_sm_counters_code[] =
 {
    /* mov b32 $r8 $tidx
     * mov b32 $r9 $physid
@@ -1026,9 +1026,9 @@ static const char *nvc0_pm_query_names[] =
    "warps_launched",
 };
 
-#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_PM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } }
+#define _Q(n, f, m, g, c, s0, s1, s2, s3, s4, s5) [NVC0_HW_SM_QUERY_##n] = { { { f, NVC0_COMPUTE_MP_PM_OP_MODE_##m, c, 0, g, s0|(s1 << 8)|(s2 << 16)|(s3 << 24)|(s4##ULL << 32)|(s5##ULL << 40) }, {}, {}, {} }, 1, NVC0_COUNTER_OPn_SUM, { 1, 1 } }
 
-static const struct nvc0_mp_pm_query_cfg nvc0_mp_pm_queries[] =
+static const struct nvc0_hw_sm_query_cfg nvc0_hw_sm_queries[] =
 {
    _Q(ACTIVE_CYCLES,       0xaaaa, LOGOP, 0x11, 1, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00),
    _Q(ACTIVE_WARPS,        0xaaaa, LOGOP, 0x24, 6, 0x10, 0x21, 0x32, 0x43, 0x54, 0x65),
@@ -1065,34 +1065,34 @@ static const struct nvc0_mp_pm_query_cfg nvc0_mp_pm_queries[] =
 
 #undef _Q
 
-static const struct nvc0_mp_pm_query_cfg *
-nvc0_mp_pm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_query *q)
+static const struct nvc0_hw_sm_query_cfg *
+nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_query *q)
 {
    struct nvc0_screen *screen = nvc0->screen;
 
    if (screen->base.class_3d >= NVE4_3D_CLASS)
-      return &nve4_mp_pm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
-   return &nvc0_mp_pm_queries[q->type - NVC0_PM_QUERY(0)];
+      return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC];
+   return &nvc0_hw_sm_queries[q->type - NVC0_HW_SM_QUERY(0)];
 }
 
 boolean
-nvc0_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
+nvc0_hw_sm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
 {
    struct nvc0_screen *screen = nvc0->screen;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    const bool is_nve4 = screen->base.class_3d >= NVE4_3D_CLASS;
-   const struct nvc0_mp_pm_query_cfg *cfg;
+   const struct nvc0_hw_sm_query_cfg *cfg;
    unsigned i, c;
    unsigned num_ab[2] = { 0, 0 };
 
-   cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);
+   cfg = nvc0_hw_sm_query_get_cfg(nvc0, q);
 
    /* check if we have enough free counter slots */
    for (i = 0; i < cfg->num_counters; ++i)
       num_ab[cfg->ctr[i].sig_dom]++;
 
-   if (screen->pm.num_mp_pm_active[0] + num_ab[0] > 4 ||
-       screen->pm.num_mp_pm_active[1] + num_ab[1] > 4) {
+   if (screen->pm.num_hw_sm_active[0] + num_ab[0] > 4 ||
+       screen->pm.num_hw_sm_active[1] + num_ab[1] > 4) {
       NOUVEAU_ERR("Not enough free MP counter slots !\n");
       return false;
    }
@@ -1113,14 +1113,14 @@ nvc0_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
    for (i = 0; i < cfg->num_counters; ++i) {
       const unsigned d = cfg->ctr[i].sig_dom;
 
-      if (!screen->pm.num_mp_pm_active[d]) {
+      if (!screen->pm.num_hw_sm_active[d]) {
          uint32_t m = (1 << 22) | (1 << (7 + (8 * !d)));
-         if (screen->pm.num_mp_pm_active[!d])
+         if (screen->pm.num_hw_sm_active[!d])
             m |= 1 << (7 + (8 * d));
          BEGIN_NVC0(push, SUBC_SW(0x0600), 1);
          PUSH_DATA (push, m);
       }
-      screen->pm.num_mp_pm_active[d]++;
+      screen->pm.num_hw_sm_active[d]++;
 
       for (c = d * 4; c < (d * 4 + 4); ++c) {
          if (!screen->pm.mp_counter[c]) {
@@ -1163,7 +1163,7 @@ nvc0_mp_pm_query_begin(struct nvc0_context *nvc0, struct nvc0_query *q)
 }
 
 static void
-nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
+nvc0_hw_sm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
 {
    struct nvc0_screen *screen = nvc0->screen;
    struct pipe_context *pipe = &nvc0->base.pipe;
@@ -1174,9 +1174,9 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
    const uint block[3] = { 32, is_nve4 ? 4 : 1, 1 };
    const uint grid[3] = { screen->mp_count, 1, 1 };
    unsigned c;
-   const struct nvc0_mp_pm_query_cfg *cfg;
+   const struct nvc0_hw_sm_query_cfg *cfg;
 
-   cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);
+   cfg = nvc0_hw_sm_query_get_cfg(nvc0, q);
 
    if (unlikely(!screen->pm.prog)) {
       struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program);
@@ -1185,11 +1185,11 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
       prog->num_gprs = 14;
       prog->parm_size = 12;
       if (is_nve4) {
-         prog->code = (uint32_t *)nve4_read_mp_pm_counters_code;
-         prog->code_size = sizeof(nve4_read_mp_pm_counters_code);
+         prog->code = (uint32_t *)nve4_read_hw_sm_counters_code;
+         prog->code_size = sizeof(nve4_read_hw_sm_counters_code);
       } else {
-         prog->code = (uint32_t *)nvc0_read_mp_pm_counters_code;
-         prog->code_size = sizeof(nvc0_read_mp_pm_counters_code);
+         prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code;
+         prog->code_size = sizeof(nvc0_read_hw_sm_counters_code);
       }
       screen->pm.prog = prog;
    }
@@ -1207,7 +1207,7 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
    /* release counters for this query */
    for (c = 0; c < 8; ++c) {
       if (nvc0_query(screen->pm.mp_counter[c]) == q) {
-         screen->pm.num_mp_pm_active[c / 4]--;
+         screen->pm.num_hw_sm_active[c / 4]--;
          screen->pm.mp_counter[c] = NULL;
       }
    }
@@ -1234,7 +1234,7 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
       q = nvc0_query(screen->pm.mp_counter[c]);
       if (!q)
          continue;
-      cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);
+      cfg = nvc0_hw_sm_query_get_cfg(nvc0, q);
       for (i = 0; i < cfg->num_counters; ++i) {
          if (mask & (1 << q->ctr[i]))
             break;
@@ -1250,10 +1250,10 @@ nvc0_mp_pm_query_end(struct nvc0_context *nvc0, struct nvc0_query *q)
 }
 
 static inline bool
-nvc0_mp_pm_query_read_data(uint32_t count[32][4],
+nvc0_hw_sm_query_read_data(uint32_t count[32][4],
                            struct nvc0_context *nvc0, bool wait,
                            struct nvc0_query *q,
-                           const struct nvc0_mp_pm_query_cfg *cfg,
+                           const struct nvc0_hw_sm_query_cfg *cfg,
                            unsigned mp_count)
 {
    unsigned p, c;
@@ -1275,10 +1275,10 @@ nvc0_mp_pm_query_read_data(uint32_t count[32][4],
 }
 
 static inline bool
-nve4_mp_pm_query_read_data(uint32_t count[32][4],
+nve4_hw_sm_query_read_data(uint32_t count[32][4],
                            struct nvc0_context *nvc0, bool wait,
                            struct nvc0_query *q,
-                           const struct nvc0_mp_pm_query_cfg *cfg,
+                           const struct nvc0_hw_sm_query_cfg *cfg,
                            unsigned mp_count)
 {
    unsigned p, c, d;
@@ -1317,22 +1317,22 @@ nve4_mp_pm_query_read_data(uint32_t count[32][4],
  * NOTE: Interpretation of IPC requires knowledge of MP count.
  */
 static boolean
-nvc0_mp_pm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
+nvc0_hw_sm_query_result(struct nvc0_context *nvc0, struct nvc0_query *q,
                         void *result, boolean wait)
 {
    uint32_t count[32][4];
    uint64_t value = 0;
    unsigned mp_count = MIN2(nvc0->screen->mp_count_compute, 32);
    unsigned p, c;
-   const struct nvc0_mp_pm_query_cfg *cfg;
+   const struct nvc0_hw_sm_query_cfg *cfg;
    bool ret;
 
-   cfg = nvc0_mp_pm_query_get_cfg(nvc0, q);
+   cfg = nvc0_hw_sm_query_get_cfg(nvc0, q);
 
    if (nvc0->screen->base.class_3d >= NVE4_3D_CLASS)
-      ret = nve4_mp_pm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
+      ret = nve4_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
    else
-      ret = nvc0_mp_pm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
+      ret = nvc0_hw_sm_query_read_data(count, nvc0, wait, q, cfg, mp_count);
    if (!ret)
       return false;
 
@@ -1410,11 +1410,11 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
    if (screen->base.device->drm_version >= 0x01000101) {
       if (screen->compute) {
          if (screen->base.class_3d == NVE4_3D_CLASS) {
-            count += NVE4_PM_QUERY_COUNT;
+            count += NVE4_HW_SM_QUERY_COUNT;
          } else
          if (screen->base.class_3d < NVE4_3D_CLASS) {
             /* NVC0_COMPUTE is not always enabled */
-            count += NVC0_PM_QUERY_COUNT;
+            count += NVC0_HW_SM_QUERY_COUNT;
          }
       }
    }
@@ -1444,15 +1444,15 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
       if (screen->compute) {
          if (screen->base.class_3d == NVE4_3D_CLASS) {
             info->name = nve4_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
-            info->query_type = NVE4_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
+            info->query_type = NVE4_HW_SM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
             info->max_value.u64 =
-               (id < NVE4_PM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100;
+               (id < NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100;
             info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
             return 1;
          } else
          if (screen->base.class_3d < NVE4_3D_CLASS) {
             info->name = nvc0_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
-            info->query_type = NVC0_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
+            info->query_type = NVC0_HW_SM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
             info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
             return 1;
          }
@@ -1494,7 +1494,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
          info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU;
 
          if (screen->base.class_3d == NVE4_3D_CLASS) {
-            info->num_queries = NVE4_PM_QUERY_COUNT;
+            info->num_queries = NVE4_HW_SM_QUERY_COUNT;
 
              /* On NVE4+, each multiprocessor have 8 hardware counters separated
               * in two distinct domains, but we allow only one active query
@@ -1504,7 +1504,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
              return 1;
          } else
          if (screen->base.class_3d < NVE4_3D_CLASS) {
-            info->num_queries = NVC0_PM_QUERY_COUNT;
+            info->num_queries = NVC0_HW_SM_QUERY_COUNT;
 
             /* On NVC0:NVE4, each multiprocessor have 8 hardware counters
              * in a single domain. */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 531314f4200..f57a316f01e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -95,7 +95,7 @@ struct nvc0_screen {
    struct {
       struct nvc0_program *prog; /* compute state object to read MP counters */
       struct pipe_query *mp_counter[8]; /* counter to query allocation */
-      uint8_t num_mp_pm_active[2];
+      uint8_t num_hw_sm_active[2];
       bool mp_counters_enabled;
    } pm;
 
@@ -120,98 +120,98 @@ nvc0_screen(struct pipe_screen *screen)
 
 /* Performance counter queries:
  */
-#define NVE4_PM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + (i))
-#define NVE4_PM_QUERY_LAST   NVE4_PM_QUERY(NVE4_PM_QUERY_COUNT - 1)
+#define NVE4_HW_SM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + (i))
+#define NVE4_HW_SM_QUERY_LAST   NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT - 1)
 enum nve4_pm_queries
 {
-    NVE4_PM_QUERY_ACTIVE_CYCLES = 0,
-    NVE4_PM_QUERY_ACTIVE_WARPS,
-    NVE4_PM_QUERY_ATOM_COUNT,
-    NVE4_PM_QUERY_BRANCH,
-    NVE4_PM_QUERY_DIVERGENT_BRANCH,
-    NVE4_PM_QUERY_GLD_REQUEST,
-    NVE4_PM_QUERY_GLD_MEM_DIV_REPLAY,
-    NVE4_PM_QUERY_GST_TRANSACTIONS,
-    NVE4_PM_QUERY_GST_MEM_DIV_REPLAY,
-    NVE4_PM_QUERY_GRED_COUNT,
-    NVE4_PM_QUERY_GST_REQUEST,
-    NVE4_PM_QUERY_INST_EXECUTED,
-    NVE4_PM_QUERY_INST_ISSUED,
-    NVE4_PM_QUERY_INST_ISSUED1,
-    NVE4_PM_QUERY_INST_ISSUED2,
-    NVE4_PM_QUERY_L1_GLD_HIT,
-    NVE4_PM_QUERY_L1_GLD_MISS,
-    NVE4_PM_QUERY_L1_LOCAL_LD_HIT,
-    NVE4_PM_QUERY_L1_LOCAL_LD_MISS,
-    NVE4_PM_QUERY_L1_LOCAL_ST_HIT,
-    NVE4_PM_QUERY_L1_LOCAL_ST_MISS,
-    NVE4_PM_QUERY_L1_SHARED_LD_TRANSACTIONS,
-    NVE4_PM_QUERY_L1_SHARED_ST_TRANSACTIONS,
-    NVE4_PM_QUERY_LOCAL_LD,
-    NVE4_PM_QUERY_LOCAL_LD_TRANSACTIONS,
-    NVE4_PM_QUERY_LOCAL_ST,
-    NVE4_PM_QUERY_LOCAL_ST_TRANSACTIONS,
-    NVE4_PM_QUERY_PROF_TRIGGER_0,
-    NVE4_PM_QUERY_PROF_TRIGGER_1,
-    NVE4_PM_QUERY_PROF_TRIGGER_2,
-    NVE4_PM_QUERY_PROF_TRIGGER_3,
-    NVE4_PM_QUERY_PROF_TRIGGER_4,
-    NVE4_PM_QUERY_PROF_TRIGGER_5,
-    NVE4_PM_QUERY_PROF_TRIGGER_6,
-    NVE4_PM_QUERY_PROF_TRIGGER_7,
-    NVE4_PM_QUERY_SHARED_LD,
-    NVE4_PM_QUERY_SHARED_LD_REPLAY,
-    NVE4_PM_QUERY_SHARED_ST,
-    NVE4_PM_QUERY_SHARED_ST_REPLAY,
-    NVE4_PM_QUERY_SM_CTA_LAUNCHED,
-    NVE4_PM_QUERY_THREADS_LAUNCHED,
-    NVE4_PM_QUERY_UNCACHED_GLD_TRANSACTIONS,
-    NVE4_PM_QUERY_WARPS_LAUNCHED,
-    NVE4_PM_QUERY_METRIC_IPC,
-    NVE4_PM_QUERY_METRIC_IPAC,
-    NVE4_PM_QUERY_METRIC_IPEC,
-    NVE4_PM_QUERY_METRIC_MP_OCCUPANCY,
-    NVE4_PM_QUERY_METRIC_MP_EFFICIENCY,
-    NVE4_PM_QUERY_METRIC_INST_REPLAY_OHEAD,
-    NVE4_PM_QUERY_COUNT
+    NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0,
+    NVE4_HW_SM_QUERY_ACTIVE_WARPS,
+    NVE4_HW_SM_QUERY_ATOM_COUNT,
+    NVE4_HW_SM_QUERY_BRANCH,
+    NVE4_HW_SM_QUERY_DIVERGENT_BRANCH,
+    NVE4_HW_SM_QUERY_GLD_REQUEST,
+    NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY,
+    NVE4_HW_SM_QUERY_GST_TRANSACTIONS,
+    NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY,
+    NVE4_HW_SM_QUERY_GRED_COUNT,
+    NVE4_HW_SM_QUERY_GST_REQUEST,
+    NVE4_HW_SM_QUERY_INST_EXECUTED,
+    NVE4_HW_SM_QUERY_INST_ISSUED,
+    NVE4_HW_SM_QUERY_INST_ISSUED1,
+    NVE4_HW_SM_QUERY_INST_ISSUED2,
+    NVE4_HW_SM_QUERY_L1_GLD_HIT,
+    NVE4_HW_SM_QUERY_L1_GLD_MISS,
+    NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT,
+    NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS,
+    NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT,
+    NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS,
+    NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS,
+    NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS,
+    NVE4_HW_SM_QUERY_LOCAL_LD,
+    NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS,
+    NVE4_HW_SM_QUERY_LOCAL_ST,
+    NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS,
+    NVE4_HW_SM_QUERY_PROF_TRIGGER_0,
+    NVE4_HW_SM_QUERY_PROF_TRIGGER_1,
+    NVE4_HW_SM_QUERY_PROF_TRIGGER_2,
+    NVE4_HW_SM_QUERY_PROF_TRIGGER_3,
+    NVE4_HW_SM_QUERY_PROF_TRIGGER_4,
+    NVE4_HW_SM_QUERY_PROF_TRIGGER_5,
+    NVE4_HW_SM_QUERY_PROF_TRIGGER_6,
+    NVE4_HW_SM_QUERY_PROF_TRIGGER_7,
+    NVE4_HW_SM_QUERY_SHARED_LD,
+    NVE4_HW_SM_QUERY_SHARED_LD_REPLAY,
+    NVE4_HW_SM_QUERY_SHARED_ST,
+    NVE4_HW_SM_QUERY_SHARED_ST_REPLAY,
+    NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED,
+    NVE4_HW_SM_QUERY_THREADS_LAUNCHED,
+    NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS,
+    NVE4_HW_SM_QUERY_WARPS_LAUNCHED,
+    NVE4_HW_SM_QUERY_METRIC_IPC,
+    NVE4_HW_SM_QUERY_METRIC_IPAC,
+    NVE4_HW_SM_QUERY_METRIC_IPEC,
+    NVE4_HW_SM_QUERY_METRIC_MP_OCCUPANCY,
+    NVE4_HW_SM_QUERY_METRIC_MP_EFFICIENCY,
+    NVE4_HW_SM_QUERY_METRIC_INST_REPLAY_OHEAD,
+    NVE4_HW_SM_QUERY_COUNT
 };
 
-#define NVC0_PM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i))
-#define NVC0_PM_QUERY_LAST   NVC0_PM_QUERY(NVC0_PM_QUERY_COUNT - 1)
+#define NVC0_HW_SM_QUERY(i)    (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i))
+#define NVC0_HW_SM_QUERY_LAST   NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1)
 enum nvc0_pm_queries
 {
-    NVC0_PM_QUERY_ACTIVE_CYCLES = 0,
-    NVC0_PM_QUERY_ACTIVE_WARPS,
-    NVC0_PM_QUERY_ATOM_COUNT,
-    NVC0_PM_QUERY_BRANCH,
-    NVC0_PM_QUERY_DIVERGENT_BRANCH,
-    NVC0_PM_QUERY_GLD_REQUEST,
-    NVC0_PM_QUERY_GRED_COUNT,
-    NVC0_PM_QUERY_GST_REQUEST,
-    NVC0_PM_QUERY_INST_EXECUTED,
-    NVC0_PM_QUERY_INST_ISSUED1_0,
-    NVC0_PM_QUERY_INST_ISSUED1_1,
-    NVC0_PM_QUERY_INST_ISSUED2_0,
-    NVC0_PM_QUERY_INST_ISSUED2_1,
-    NVC0_PM_QUERY_LOCAL_LD,
-    NVC0_PM_QUERY_LOCAL_ST,
-    NVC0_PM_QUERY_PROF_TRIGGER_0,
-    NVC0_PM_QUERY_PROF_TRIGGER_1,
-    NVC0_PM_QUERY_PROF_TRIGGER_2,
-    NVC0_PM_QUERY_PROF_TRIGGER_3,
-    NVC0_PM_QUERY_PROF_TRIGGER_4,
-    NVC0_PM_QUERY_PROF_TRIGGER_5,
-    NVC0_PM_QUERY_PROF_TRIGGER_6,
-    NVC0_PM_QUERY_PROF_TRIGGER_7,
-    NVC0_PM_QUERY_SHARED_LD,
-    NVC0_PM_QUERY_SHARED_ST,
-    NVC0_PM_QUERY_THREADS_LAUNCHED,
-    NVC0_PM_QUERY_TH_INST_EXECUTED_0,
-    NVC0_PM_QUERY_TH_INST_EXECUTED_1,
-    NVC0_PM_QUERY_TH_INST_EXECUTED_2,
-    NVC0_PM_QUERY_TH_INST_EXECUTED_3,
-    NVC0_PM_QUERY_WARPS_LAUNCHED,
-    NVC0_PM_QUERY_COUNT
+    NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0,
+    NVC0_HW_SM_QUERY_ACTIVE_WARPS,
+    NVC0_HW_SM_QUERY_ATOM_COUNT,
+    NVC0_HW_SM_QUERY_BRANCH,
+    NVC0_HW_SM_QUERY_DIVERGENT_BRANCH,
+    NVC0_HW_SM_QUERY_GLD_REQUEST,
+    NVC0_HW_SM_QUERY_GRED_COUNT,
+    NVC0_HW_SM_QUERY_GST_REQUEST,
+    NVC0_HW_SM_QUERY_INST_EXECUTED,
+    NVC0_HW_SM_QUERY_INST_ISSUED1_0,
+    NVC0_HW_SM_QUERY_INST_ISSUED1_1,
+    NVC0_HW_SM_QUERY_INST_ISSUED2_0,
+    NVC0_HW_SM_QUERY_INST_ISSUED2_1,
+    NVC0_HW_SM_QUERY_LOCAL_LD,
+    NVC0_HW_SM_QUERY_LOCAL_ST,
+    NVC0_HW_SM_QUERY_PROF_TRIGGER_0,
+    NVC0_HW_SM_QUERY_PROF_TRIGGER_1,
+    NVC0_HW_SM_QUERY_PROF_TRIGGER_2,
+    NVC0_HW_SM_QUERY_PROF_TRIGGER_3,
+    NVC0_HW_SM_QUERY_PROF_TRIGGER_4,
+    NVC0_HW_SM_QUERY_PROF_TRIGGER_5,
+    NVC0_HW_SM_QUERY_PROF_TRIGGER_6,
+    NVC0_HW_SM_QUERY_PROF_TRIGGER_7,
+    NVC0_HW_SM_QUERY_SHARED_LD,
+    NVC0_HW_SM_QUERY_SHARED_ST,
+    NVC0_HW_SM_QUERY_THREADS_LAUNCHED,
+    NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0,
+    NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1,
+    NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2,
+    NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3,
+    NVC0_HW_SM_QUERY_WARPS_LAUNCHED,
+    NVC0_HW_SM_QUERY_COUNT
 };
 
 /* Driver statistics queries:

From 58e24b4761ec8c348bf6825c2355a6e047599306 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 24 Aug 2015 23:31:00 -0400
Subject: [PATCH 63/82] freedreno/a3xx: add basic clip plane support

The hardware is capable of dealing with GL1-style user clip planes.
No clip vertex, no clip distances. Fixes a number of ucp tests, as well
as neverball.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "11.0" <mesa-stable@lists.freedesktop.org>
---
 src/gallium/drivers/freedreno/a3xx/fd3_emit.c | 19 +++++++++++++++++++
 .../drivers/freedreno/freedreno_context.h     |  2 ++
 .../drivers/freedreno/freedreno_state.c       |  4 +++-
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 752e7f88cb9..6f514ed05df 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -563,10 +563,29 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		val |= COND(fp->writes_pos, A3XX_GRAS_CL_CLIP_CNTL_ZCLIP_DISABLE);
 		val |= COND(fp->frag_coord, A3XX_GRAS_CL_CLIP_CNTL_ZCOORD |
 				A3XX_GRAS_CL_CLIP_CNTL_WCOORD);
+		/* TODO only use if prog doesn't use clipvertex/clipdist */
+		val |= MIN2(util_bitcount(ctx->rasterizer->clip_plane_enable), 6) << 26;
 		OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1);
 		OUT_RING(ring, val);
 	}
 
+	if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_UCP)) {
+		uint32_t planes = ctx->rasterizer->clip_plane_enable;
+		int count = 0;
+
+		while (planes && count < 6) {
+			int i = ffs(planes) - 1;
+
+			planes &= ~(1U << i);
+			fd_wfi(ctx, ring);
+			OUT_PKT0(ring, REG_A3XX_GRAS_CL_USER_PLANE(count++), 4);
+			OUT_RING(ring, fui(ctx->ucp.ucp[i][0]));
+			OUT_RING(ring, fui(ctx->ucp.ucp[i][1]));
+			OUT_RING(ring, fui(ctx->ucp.ucp[i][2]));
+			OUT_RING(ring, fui(ctx->ucp.ucp[i][3]));
+		}
+	}
+
 	/* NOTE: since primitive_restart is not actually part of any
 	 * state object, we need to make sure that we always emit
 	 * PRIM_VTX_CNTL.. either that or be more clever and detect
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index 509a90fdf23..3486c2fd1b7 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -334,6 +334,7 @@ struct fd_context {
 		FD_DIRTY_INDEXBUF    = (1 << 16),
 		FD_DIRTY_SCISSOR     = (1 << 17),
 		FD_DIRTY_STREAMOUT   = (1 << 18),
+		FD_DIRTY_UCP         = (1 << 19),
 	} dirty;
 
 	struct pipe_blend_state *blend;
@@ -355,6 +356,7 @@ struct fd_context {
 	struct fd_constbuf_stateobj constbuf[PIPE_SHADER_TYPES];
 	struct pipe_index_buffer indexbuf;
 	struct fd_streamout_stateobj streamout;
+	struct pipe_clip_state ucp;
 
 	/* GMEM/tile handling fxns: */
 	void (*emit_tile_init)(struct fd_context *ctx);
diff --git a/src/gallium/drivers/freedreno/freedreno_state.c b/src/gallium/drivers/freedreno/freedreno_state.c
index 7bf8bdb4507..e75865a9387 100644
--- a/src/gallium/drivers/freedreno/freedreno_state.c
+++ b/src/gallium/drivers/freedreno/freedreno_state.c
@@ -65,7 +65,9 @@ static void
 fd_set_clip_state(struct pipe_context *pctx,
 		const struct pipe_clip_state *clip)
 {
-	DBG("TODO: ");
+	struct fd_context *ctx = fd_context(pctx);
+	ctx->ucp = *clip;             /* store a by-value copy of the clip state */
+	ctx->dirty |= FD_DIRTY_UCP;   /* flag the user clip planes as changed */
 }
 
 static void

From a5a96118ed728969c5a41e643cf6ffd0c42461f0 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 26 Aug 2015 00:11:23 -0400
Subject: [PATCH 64/82] freedreno/a3xx: implement half-z clipping

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/freedreno/a3xx/a3xx.xml.h       | 1 +
 src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c | 3 ++-
 src/gallium/drivers/freedreno/freedreno_screen.c    | 2 +-
 3 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index 441bfec5756..a157dc33db9 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -680,6 +680,7 @@ static inline uint32_t REG_A3XX_CP_PROTECT_REG(uint32_t i0) { return 0x00000460
 #define A3XX_GRAS_CL_CLIP_CNTL_VP_CLIP_CODE_IGNORE		0x00080000
 #define A3XX_GRAS_CL_CLIP_CNTL_VP_XFORM_DISABLE			0x00100000
 #define A3XX_GRAS_CL_CLIP_CNTL_PERSP_DIVISION_DISABLE		0x00200000
+#define A3XX_GRAS_CL_CLIP_CNTL_ZERO_GB_SCALE_Z			0x00400000
 #define A3XX_GRAS_CL_CLIP_CNTL_ZCOORD				0x00800000
 #define A3XX_GRAS_CL_CLIP_CNTL_WCOORD				0x01000000
 #define A3XX_GRAS_CL_CLIP_CNTL_ZCLIP_DISABLE			0x02000000
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c
index 583caaa806f..260eacd301a 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_rasterizer.c
@@ -65,7 +65,8 @@ fd3_rasterizer_state_create(struct pipe_context *pctx,
 	if (cso->multisample)
 		TODO
 */
-	so->gras_cl_clip_cntl = A3XX_GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTER; /* ??? */
+	so->gras_cl_clip_cntl = A3XX_GRAS_CL_CLIP_CNTL_IJ_PERSP_CENTER /* ??? */ |
+		COND(cso->clip_halfz, A3XX_GRAS_CL_CLIP_CNTL_ZERO_GB_SCALE_Z);
 	so->gras_su_point_minmax =
 			A3XX_GRAS_SU_POINT_MINMAX_MIN(psize_min) |
 			A3XX_GRAS_SU_POINT_MINMAX_MAX(psize_max);
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 86e9a21da2f..17dd47c71ab 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -191,6 +191,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 		return 16383;
 
 	case PIPE_CAP_DEPTH_CLIP_DISABLE:
+	case PIPE_CAP_CLIP_HALFZ:
 	case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE:
 		return is_a3xx(screen);
 
@@ -228,7 +229,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
-	case PIPE_CAP_CLIP_HALFZ:
 	case PIPE_CAP_POLYGON_OFFSET_CLAMP:
 	case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:

From e321596e9f66207cc679b4ddbee13d4c8cdb896f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sat, 22 Aug 2015 18:05:37 +0200
Subject: [PATCH 65/82] winsys/radeon: handle non-zero finite timeout when
 waiting for buffers

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/winsys/radeon/drm/radeon_drm_bo.c | 54 +++++++++++++------
 src/gallium/winsys/radeon/drm/radeon_drm_cs.c | 25 +--------
 2 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
index 3a9ac445b24..600ced924ba 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c
@@ -101,30 +101,54 @@ static struct radeon_bo *get_radeon_bo(struct pb_buffer *_buf)
     return bo;
 }
 
+static bool radeon_bo_is_busy(struct radeon_bo *bo)
+{
+    struct drm_radeon_gem_busy args = {0};
+    /* Any non-zero return value of the GEM_BUSY ioctl is treated as busy. */
+    args.handle = bo->handle;
+    return drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_BUSY,
+                               &args, sizeof(args)) != 0;
+}
+
+static void radeon_bo_wait_idle(struct radeon_bo *bo)
+{
+    struct drm_radeon_gem_wait_idle args = {0};
+    /* Reissue the GEM_WAIT_IDLE ioctl for as long as it returns -EBUSY. */
+    args.handle = bo->handle;
+    while (drmCommandWrite(bo->rws->fd, DRM_RADEON_GEM_WAIT_IDLE,
+                           &args, sizeof(args)) == -EBUSY);
+}
+
 static bool radeon_bo_wait(struct pb_buffer *_buf, uint64_t timeout,
                            enum radeon_bo_usage usage)
 {
-   struct radeon_bo *bo = get_radeon_bo(_buf);
+    struct radeon_bo *bo = get_radeon_bo(_buf);
+    int64_t abs_timeout;
 
-   /* Wait if any ioctl is being submitted with this buffer. */
-   if (!os_wait_until_zero(&bo->num_active_ioctls, timeout))
-      return false;
+    /* Zero timeout: only query whether the buffer is idle right now. */
+    if (timeout == 0)
+        return !bo->num_active_ioctls && !radeon_bo_is_busy(bo);
 
-   /* TODO: handle arbitrary timeout */
-    if (!timeout) {
-        struct drm_radeon_gem_busy args = {0};
+    abs_timeout = os_time_get_absolute_timeout(timeout);
 
-        args.handle = bo->handle;
-        return drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_BUSY,
-                                   &args, sizeof(args)) == 0;
-    } else {
-        struct drm_radeon_gem_wait_idle args = {0};
+    /* Wait if any ioctl is being submitted with this buffer. */
+    if (!os_wait_until_zero_abs_timeout(&bo->num_active_ioctls, abs_timeout))
+        return false;
 
-        args.handle = bo->handle;
-        while (drmCommandWrite(bo->rws->fd, DRM_RADEON_GEM_WAIT_IDLE,
-                               &args, sizeof(args)) == -EBUSY);
+    /* Infinite timeout: wait via radeon_bo_wait_idle(), with no deadline. */
+    if (abs_timeout == PIPE_TIMEOUT_INFINITE) {
+        radeon_bo_wait_idle(bo);
         return true;
     }
+
+    /* Finite timeout: poll the busy state until idle or the deadline passes. */
+    while (radeon_bo_is_busy(bo)) {
+       if (os_time_get_nano() >= abs_timeout)
+          return false;
+       os_time_sleep(10);
+    }
+
+    return true;
 }
 
 static enum radeon_bo_domain get_valid_domain(enum radeon_bo_domain domain)
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
index f04a696988a..341af55df8b 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_cs.c
@@ -645,29 +645,8 @@ static bool radeon_fence_wait(struct radeon_winsys *ws,
                               struct pipe_fence_handle *fence,
                               uint64_t timeout)
 {
-    struct pb_buffer *rfence = (struct pb_buffer*)fence;
-
-    if (timeout == 0)
-        return ws->buffer_wait(rfence, 0, RADEON_USAGE_READWRITE);
-
-    if (timeout != PIPE_TIMEOUT_INFINITE) {
-        int64_t start_time = os_time_get();
-
-        /* Convert to microseconds. */
-        timeout /= 1000;
-
-        /* Wait in a loop. */
-        while (!ws->buffer_wait(rfence, 0, RADEON_USAGE_READWRITE)) {
-            if (os_time_get() - start_time >= timeout) {
-                return FALSE;
-            }
-            os_time_sleep(10);
-        }
-        return TRUE;
-    }
-
-    ws->buffer_wait(rfence, PIPE_TIMEOUT_INFINITE, RADEON_USAGE_READWRITE);
-    return TRUE;
+    return ws->buffer_wait((struct pb_buffer*)fence, timeout,
+                           RADEON_USAGE_READWRITE);
 }
 
 static void radeon_fence_reference(struct pipe_fence_handle **dst,

From 437cb1e3f482570447501526927df4d80c845bf5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 23 Aug 2015 12:57:09 +0200
Subject: [PATCH 66/82] gallium/radeon: fix the ADDRESS_HI mask for EVENT_WRITE
 CIK packets

Cc: mesa-stable@lists.freedesktop.org
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeon/r600_query.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 7057aa19a7c..65339bbb66f 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -197,7 +197,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
 		radeon_emit(cs, va);
-		radeon_emit(cs, (va >> 32UL) & 0xFF);
+		radeon_emit(cs, (va >> 32) & 0xFFFF);
 		break;
 	case PIPE_QUERY_PRIMITIVES_EMITTED:
 	case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -206,13 +206,13 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
 		radeon_emit(cs, va);
-		radeon_emit(cs, (va >> 32UL) & 0xFF);
+		radeon_emit(cs, (va >> 32) & 0xFFFF);
 		break;
 	case PIPE_QUERY_TIME_ELAPSED:
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5));
 		radeon_emit(cs, va);
-		radeon_emit(cs, (3 << 29) | ((va >> 32UL) & 0xFF));
+		radeon_emit(cs, (3 << 29) | ((va >> 32) & 0xFFFF));
 		radeon_emit(cs, 0);
 		radeon_emit(cs, 0);
 		break;
@@ -220,7 +220,7 @@ static void r600_emit_query_begin(struct r600_common_context *ctx, struct r600_q
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
 		radeon_emit(cs, va);
-		radeon_emit(cs, (va >> 32UL) & 0xFF);
+		radeon_emit(cs, (va >> 32) & 0xFFFF);
 		break;
 	default:
 		assert(0);
@@ -254,7 +254,7 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_ZPASS_DONE) | EVENT_INDEX(1));
 		radeon_emit(cs, va);
-		radeon_emit(cs, (va >> 32UL) & 0xFF);
+		radeon_emit(cs, (va >> 32) & 0xFFFF);
 		break;
 	case PIPE_QUERY_PRIMITIVES_EMITTED:
 	case PIPE_QUERY_PRIMITIVES_GENERATED:
@@ -264,7 +264,7 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(event_type_for_stream(query)) | EVENT_INDEX(3));
 		radeon_emit(cs, va);
-		radeon_emit(cs, (va >> 32UL) & 0xFF);
+		radeon_emit(cs, (va >> 32) & 0xFFFF);
 		break;
 	case PIPE_QUERY_TIME_ELAPSED:
 		va += query->buffer.results_end + query->result_size/2;
@@ -273,7 +273,7 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE_EOP, 4, 0));
 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_CACHE_FLUSH_AND_INV_TS_EVENT) | EVENT_INDEX(5));
 		radeon_emit(cs, va);
-		radeon_emit(cs, (3 << 29) | ((va >> 32UL) & 0xFF));
+		radeon_emit(cs, (3 << 29) | ((va >> 32) & 0xFFFF));
 		radeon_emit(cs, 0);
 		radeon_emit(cs, 0);
 		break;
@@ -282,7 +282,7 @@ static void r600_emit_query_end(struct r600_common_context *ctx, struct r600_que
 		radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 2, 0));
 		radeon_emit(cs, EVENT_TYPE(EVENT_TYPE_SAMPLE_PIPELINESTAT) | EVENT_INDEX(2));
 		radeon_emit(cs, va);
-		radeon_emit(cs, (va >> 32UL) & 0xFF);
+		radeon_emit(cs, (va >> 32) & 0xFFFF);
 		break;
 	default:
 		assert(0);

From 379e3382e8631fdd15c28c6643512205483e7b99 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 23 Aug 2015 13:05:53 +0200
Subject: [PATCH 67/82] radeonsi: remove no-op 32-bit masking

Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
---
 src/gallium/drivers/radeon/r600_query.c       | 4 ++--
 src/gallium/drivers/radeonsi/si_compute.c     | 2 +-
 src/gallium/drivers/radeonsi/si_descriptors.c | 2 +-
 src/gallium/drivers/radeonsi/si_dma.c         | 4 ++--
 src/gallium/drivers/radeonsi/si_shader.c      | 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index 65339bbb66f..deeae0a6a65 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -341,8 +341,8 @@ static void r600_emit_query_predication(struct r600_common_context *ctx, struct
 	
 			while (results_base < qbuf->results_end) {
 				radeon_emit(cs, PKT3(PKT3_SET_PREDICATION, 1, 0));
-				radeon_emit(cs, (va + results_base) & 0xFFFFFFFFUL);
-				radeon_emit(cs, op | (((va + results_base) >> 32UL) & 0xFF));
+				radeon_emit(cs, va + results_base);
+				radeon_emit(cs, op | (((va + results_base) >> 32) & 0xFF));
 				r600_emit_reloc(ctx, &ctx->rings.gfx, qbuf->buf, RADEON_USAGE_READ,
 						RADEON_PRIO_MIN);
 				results_base += query->result_size;
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index d4fe5653687..0cdecd6da79 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -362,7 +362,7 @@ static void si_launch_grid(
 	shader_va += pc;
 #endif
 	si_pm4_add_bo(pm4, shader->bo, RADEON_USAGE_READ, RADEON_PRIO_SHADER_DATA);
-	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, (shader_va >> 8) & 0xffffffff);
+	si_pm4_set_reg(pm4, R_00B830_COMPUTE_PGM_LO, shader_va >> 8);
 	si_pm4_set_reg(pm4, R_00B834_COMPUTE_PGM_HI, shader_va >> 40);
 
 	si_pm4_set_reg(pm4, R_00B848_COMPUTE_PGM_RSRC1,
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index 890be071596..b74c893c7d5 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -426,7 +426,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 		va = rbuffer->gpu_address + offset;
 
 		/* Fill in T# buffer resource description */
-		desc[0] = va & 0xFFFFFFFF;
+		desc[0] = va;
 		desc[1] = S_008F04_BASE_ADDRESS_HI(va >> 32) |
 			  S_008F04_STRIDE(vb->stride);
 
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index 7a0076e7aa9..1a7eeaecf9e 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -86,8 +86,8 @@ static void si_dma_copy_buffer(struct si_context *ctx,
 	for (i = 0; i < ncopy; i++) {
 		csize = size < max_csize ? size : max_csize;
 		cs->buf[cs->cdw++] = SI_DMA_PACKET(SI_DMA_PACKET_COPY, sub_cmd, csize);
-		cs->buf[cs->cdw++] = dst_offset & 0xffffffff;
-		cs->buf[cs->cdw++] = src_offset & 0xffffffff;
+		cs->buf[cs->cdw++] = dst_offset;
+		cs->buf[cs->cdw++] = src_offset;
 		cs->buf[cs->cdw++] = (dst_offset >> 32UL) & 0xff;
 		cs->buf[cs->cdw++] = (src_offset >> 32UL) & 0xff;
 		dst_offset += csize << shift;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 98b42890f7d..ab5b3ee9ce9 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -3781,7 +3781,7 @@ void si_shader_apply_scratch_relocs(struct si_context *sctx,
 			uint64_t scratch_va)
 {
 	unsigned i;
-	uint32_t scratch_rsrc_dword0 = scratch_va & 0xffffffff;
+	uint32_t scratch_rsrc_dword0 = scratch_va;
 	uint32_t scratch_rsrc_dword1 =
 		S_008F04_BASE_ADDRESS_HI(scratch_va >> 32)
 		|  S_008F04_STRIDE(shader->scratch_bytes_per_wave / 64);

From 7dc8a3497fdf0fbd8ff4381712a54c2cd94bfbfc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Tue, 25 Aug 2015 19:21:38 +0200
Subject: [PATCH 68/82] radeonsi: don't use the emit qt keyword in si_init_atom

It confuses my editor.
---
 src/gallium/drivers/radeonsi/si_state.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index c923ea7e154..806ab5f0e22 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -35,10 +35,10 @@
 #include "util/u_pstipple.h"
 
 static void si_init_atom(struct r600_atom *atom, struct r600_atom **list_elem,
-			 void (*emit)(struct si_context *ctx, struct r600_atom *state),
+			 void (*emit_func)(struct si_context *ctx, struct r600_atom *state),
 			 unsigned num_dw)
 {
-	atom->emit = (void*)emit;
+	atom->emit = (void*)emit_func;
 	atom->num_dw = num_dw;
 	atom->dirty = false;
 	*list_elem = atom;

From 7b6369eb69e688e5bf739dbe60452950a794f55d Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sat, 29 Aug 2015 18:31:06 +1000
Subject: [PATCH 69/82] r600g: Remove dead assignment to 'gs_input_prim' in
 shader state
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Note that 'geometry shader properties' should be carried in the
selector state rather than in the shader state in any case.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/r600/r600_shader.c | 3 ---
 src/gallium/drivers/r600/r600_shader.h | 1 -
 2 files changed, 4 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 4c4b6005981..44e41fb30a6 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -2009,9 +2009,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
 				/* we don't need this one */
 				break;
-			case TGSI_PROPERTY_GS_INPUT_PRIM:
-				shader->gs_input_prim = property->u[0].Data;
-				break;
 			case TGSI_PROPERTY_GS_OUTPUT_PRIM:
 				shader->gs_output_prim = property->u[0].Data;
 				break;
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index 927bac57673..2b99b22a6b2 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -79,7 +79,6 @@ struct r600_shader {
 	boolean			uses_index_registers;
 
 	/* geometry shader properties */
-	unsigned		gs_input_prim;
 	unsigned		gs_output_prim;
 	unsigned		gs_max_out_vertices;
 	unsigned		gs_num_invocations;

From b4dee1b6360a91117c7a754ed70f359f6000a0de Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sat, 29 Aug 2015 18:31:07 +1000
Subject: [PATCH 70/82] r600g: Move geometry properties state from shader to
 selector
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/r600/evergreen_state.c   | 16 ++++++++--------
 src/gallium/drivers/r600/r600_pipe.h         |  5 +++++
 src/gallium/drivers/r600/r600_shader.c       |  6 +++---
 src/gallium/drivers/r600/r600_shader.h       |  4 ----
 src/gallium/drivers/r600/r600_state.c        | 12 ++++++------
 src/gallium/drivers/r600/r600_state_common.c |  2 +-
 6 files changed, 23 insertions(+), 22 deletions(-)

diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 6a91d4709f4..7c82390ba40 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -2143,11 +2143,11 @@ static void evergreen_emit_shader_stages(struct r600_context *rctx, struct r600_
 	if (state->geom_enable) {
 		uint32_t cut_val;
 
-		if (rctx->gs_shader->current->shader.gs_max_out_vertices <= 128)
+		if (rctx->gs_shader->gs_max_out_vertices <= 128)
 			cut_val = V_028A40_GS_CUT_128;
-		else if (rctx->gs_shader->current->shader.gs_max_out_vertices <= 256)
+		else if (rctx->gs_shader->gs_max_out_vertices <= 256)
 			cut_val = V_028A40_GS_CUT_256;
-		else if (rctx->gs_shader->current->shader.gs_max_out_vertices <= 512)
+		else if (rctx->gs_shader->gs_max_out_vertices <= 512)
 			cut_val = V_028A40_GS_CUT_512;
 		else
 			cut_val = V_028A40_GS_CUT_1024;
@@ -3013,7 +3013,7 @@ void evergreen_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader
 	struct r600_shader *rshader = &shader->shader;
 	struct r600_shader *cp_shader = &shader->gs_copy_shader->shader;
 	unsigned gsvs_itemsize =
-			(cp_shader->ring_item_size * rshader->gs_max_out_vertices) >> 2;
+			(cp_shader->ring_item_size * shader->selector->gs_max_out_vertices) >> 2;
 
 	r600_init_command_buffer(cb, 64);
 
@@ -3022,14 +3022,14 @@ void evergreen_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader
 	r600_store_context_reg(cb, R_028AB8_VGT_VTX_CNT_EN, 1);
 
 	r600_store_context_reg(cb, R_028B38_VGT_GS_MAX_VERT_OUT,
-			       S_028B38_MAX_VERT_OUT(rshader->gs_max_out_vertices));
+			       S_028B38_MAX_VERT_OUT(shader->selector->gs_max_out_vertices));
 	r600_store_context_reg(cb, R_028A6C_VGT_GS_OUT_PRIM_TYPE,
-			       r600_conv_prim_to_gs_out(rshader->gs_output_prim));
+			       r600_conv_prim_to_gs_out(shader->selector->gs_output_prim));
 
 	if (rctx->screen->b.info.drm_minor >= 35) {
 		r600_store_context_reg(cb, R_028B90_VGT_GS_INSTANCE_CNT,
-				S_028B90_CNT(MIN2(rshader->gs_num_invocations, 127)) |
-				S_028B90_ENABLE(rshader->gs_num_invocations > 0));
+				S_028B90_CNT(MIN2(shader->selector->gs_num_invocations, 127)) |
+				S_028B90_ENABLE(shader->selector->gs_num_invocations > 0));
 	}
 	r600_store_context_reg_seq(cb, R_02891C_SQ_GS_VERT_ITEMSIZE, 4);
 	r600_store_value(cb, cp_shader->ring_item_size >> 2);
diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 3247aba969e..eb7036048e5 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -311,6 +311,11 @@ struct r600_pipe_shader_selector {
 	/* PIPE_SHADER_[VERTEX|FRAGMENT|...] */
 	unsigned	type;
 
+	/* geometry shader properties */
+	unsigned	gs_output_prim;
+	unsigned	gs_max_out_vertices;
+	unsigned	gs_num_invocations;
+
 	unsigned	nr_ps_max_color_exports;
 };
 
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 44e41fb30a6..f0b794c809e 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -2010,13 +2010,13 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 				/* we don't need this one */
 				break;
 			case TGSI_PROPERTY_GS_OUTPUT_PRIM:
-				shader->gs_output_prim = property->u[0].Data;
+				pipeshader->selector->gs_output_prim = property->u[0].Data;
 				break;
 			case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES:
-				shader->gs_max_out_vertices = property->u[0].Data;
+				pipeshader->selector->gs_max_out_vertices = property->u[0].Data;
 				break;
 			case TGSI_PROPERTY_GS_INVOCATIONS:
-				shader->gs_num_invocations = property->u[0].Data;
+				pipeshader->selector->gs_num_invocations = property->u[0].Data;
 				break;
 			}
 			break;
diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h
index 2b99b22a6b2..f5ca9d67f1e 100644
--- a/src/gallium/drivers/r600/r600_shader.h
+++ b/src/gallium/drivers/r600/r600_shader.h
@@ -78,10 +78,6 @@ struct r600_shader {
 	/* Temporarily workaround SB not handling CF_INDEX_[01] index registers */
 	boolean			uses_index_registers;
 
-	/* geometry shader properties */
-	unsigned		gs_output_prim;
-	unsigned		gs_max_out_vertices;
-	unsigned		gs_num_invocations;
 	/* size in bytes of a data item in the ring (single vertex data) */
 	unsigned		ring_item_size;
 
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index 5cc2283792d..51527631efd 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -1951,11 +1951,11 @@ static void r600_emit_shader_stages(struct r600_context *rctx, struct r600_atom
 	if (state->geom_enable) {
 		uint32_t cut_val;
 
-		if (rctx->gs_shader->current->shader.gs_max_out_vertices <= 128)
+		if (rctx->gs_shader->gs_max_out_vertices <= 128)
 			cut_val = V_028A40_GS_CUT_128;
-		else if (rctx->gs_shader->current->shader.gs_max_out_vertices <= 256)
+		else if (rctx->gs_shader->gs_max_out_vertices <= 256)
 			cut_val = V_028A40_GS_CUT_256;
-		else if (rctx->gs_shader->current->shader.gs_max_out_vertices <= 512)
+		else if (rctx->gs_shader->gs_max_out_vertices <= 512)
 			cut_val = V_028A40_GS_CUT_512;
 		else
 			cut_val = V_028A40_GS_CUT_1024;
@@ -2650,7 +2650,7 @@ void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha
 	struct r600_shader *rshader = &shader->shader;
 	struct r600_shader *cp_shader = &shader->gs_copy_shader->shader;
 	unsigned gsvs_itemsize =
-			(cp_shader->ring_item_size * rshader->gs_max_out_vertices) >> 2;
+			(cp_shader->ring_item_size * shader->selector->gs_max_out_vertices) >> 2;
 
 	r600_init_command_buffer(cb, 64);
 
@@ -2659,10 +2659,10 @@ void r600_update_gs_state(struct pipe_context *ctx, struct r600_pipe_shader *sha
 
 	if (rctx->b.chip_class >= R700) {
 		r600_store_context_reg(cb, R_028B38_VGT_GS_MAX_VERT_OUT,
-				       S_028B38_MAX_VERT_OUT(rshader->gs_max_out_vertices));
+				       S_028B38_MAX_VERT_OUT(shader->selector->gs_max_out_vertices));
 	}
 	r600_store_context_reg(cb, R_028A6C_VGT_GS_OUT_PRIM_TYPE,
-			       r600_conv_prim_to_gs_out(rshader->gs_output_prim));
+			       r600_conv_prim_to_gs_out(shader->selector->gs_output_prim));
 
 	r600_store_context_reg(cb, R_0288C8_SQ_GS_VERT_ITEMSIZE,
 	                       cp_shader->ring_item_size >> 2);
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index a05dd8352c7..63746b55502 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -1524,7 +1524,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info
 		unsigned prim = info.mode;
 
 		if (rctx->gs_shader) {
-			prim = rctx->gs_shader->current->shader.gs_output_prim;
+			prim = rctx->gs_shader->gs_output_prim;
 		}
 		prim = r600_conv_prim_to_gs_out(prim); /* decrease the number of types to 3 */
 

From 3eed81a97b2fa1f98a2ae577b8b6e04cb144f31a Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sat, 29 Aug 2015 18:31:08 +1000
Subject: [PATCH 71/82] r600g: Set geometry properties in
 r600_create_shader_state()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The selector is shared by all shader variants, so the
individual shaders shouldn't change it. Use tgsi_shader_scan()
results to set geometry properties within a
r600_create_shader_state() call and treat said properties in
the selector as read-only within r600_shader_from_tgsi().

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/r600/r600_pipe.h         |  3 ++
 src/gallium/drivers/r600/r600_shader.c       | 31 ++++----------------
 src/gallium/drivers/r600/r600_state_common.c | 14 +++++++++
 3 files changed, 23 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index eb7036048e5..4bd3d7cf75b 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -36,6 +36,8 @@
 #include "util/list.h"
 #include "util/u_transfer.h"
 
+#include "tgsi/tgsi_scan.h"
+
 #define R600_NUM_ATOMS 75
 
 #define R600_MAX_VIEWPORTS 16
@@ -305,6 +307,7 @@ struct r600_pipe_shader_selector {
 
 	struct tgsi_token       *tokens;
 	struct pipe_stream_output_info  so;
+	struct tgsi_shader_info		info;
 
 	unsigned	num_shaders;
 
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index f0b794c809e..a265fb81225 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -1809,7 +1809,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	struct tgsi_token *tokens = pipeshader->selector->tokens;
 	struct pipe_stream_output_info so = pipeshader->selector->so;
 	struct tgsi_full_immediate *immediate;
-	struct tgsi_full_property *property;
 	struct r600_shader_ctx ctx;
 	struct r600_bytecode_output output[32];
 	unsigned output_done, noutput;
@@ -1968,6 +1967,12 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	ctx.nliterals = 0;
 	ctx.literals = NULL;
 	shader->fs_write_all = FALSE;
+	if (ctx.info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
+		shader->fs_write_all = TRUE;
+
+	shader->vs_position_window_space = FALSE;
+	if (ctx.info.properties[TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION])
+		shader->vs_position_window_space = TRUE;
 
 	if (shader->vs_as_gs_a)
 		vs_add_primid_output(&ctx, key.vs.prim_id_out);
@@ -1994,31 +1999,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 				goto out_err;
 			break;
 		case TGSI_TOKEN_TYPE_INSTRUCTION:
-			break;
 		case TGSI_TOKEN_TYPE_PROPERTY:
-			property = &ctx.parse.FullToken.FullProperty;
-			switch (property->Property.PropertyName) {
-			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
-				if (property->u[0].Data == 1)
-					shader->fs_write_all = TRUE;
-				break;
-			case TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION:
-				if (property->u[0].Data == 1)
-					shader->vs_position_window_space = TRUE;
-				break;
-			case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
-				/* we don't need this one */
-				break;
-			case TGSI_PROPERTY_GS_OUTPUT_PRIM:
-				pipeshader->selector->gs_output_prim = property->u[0].Data;
-				break;
-			case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES:
-				pipeshader->selector->gs_max_out_vertices = property->u[0].Data;
-				break;
-			case TGSI_PROPERTY_GS_INVOCATIONS:
-				pipeshader->selector->gs_num_invocations = property->u[0].Data;
-				break;
-			}
 			break;
 		default:
 			R600_ERR("unsupported token type %d\n", ctx.parse.FullToken.Token.Type);
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 63746b55502..d9cf736b043 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -34,6 +34,7 @@
 #include "util/u_upload_mgr.h"
 #include "util/u_math.h"
 #include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
 
 void r600_init_command_buffer(struct r600_command_buffer *cb, unsigned num_dw)
 {
@@ -818,6 +819,19 @@ static void *r600_create_shader_state(struct pipe_context *ctx,
 	sel->type = pipe_shader_type;
 	sel->tokens = tgsi_dup_tokens(state->tokens);
 	sel->so = state->stream_output;
+	tgsi_scan_shader(state->tokens, &sel->info);
+
+	switch (pipe_shader_type) {
+	case PIPE_SHADER_GEOMETRY:
+		sel->gs_output_prim =
+			sel->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM];
+		sel->gs_max_out_vertices =
+			sel->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
+		sel->gs_num_invocations =
+			sel->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
+		break;
+	}
+
 	return sel;
 }
 

From 0d19dc302f21d00b88ccb7b70e5a110d17ea4fdf Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sat, 29 Aug 2015 18:31:09 +1000
Subject: [PATCH 72/82] r600g: Use TGSI parse results instead of manually
 exfiltrating
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This makes better use of the work that the TGSI API has done for
us.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/r600/r600_shader.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index a265fb81225..b7d7828a9c2 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -1839,7 +1839,7 @@ static int r600_shader_from_tgsi(struct r600_context *rctx,
 	shader->indirect_files = ctx.info.indirect_files;
 	indirect_gprs = ctx.info.indirect_files & ~(1 << TGSI_FILE_CONSTANT);
 	tgsi_parse_init(&ctx.parse, tokens);
-	ctx.type = ctx.parse.FullHeader.Processor.Processor;
+	ctx.type = ctx.info.processor;
 	shader->processor_type = ctx.type;
 	ctx.bc->type = shader->processor_type;
 

From c8bc8d723598ec87bbce9a2439075dfe1612a359 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <t_arceri@yahoo.com.au>
Date: Sun, 9 Aug 2015 16:25:50 +1000
Subject: [PATCH 73/82] glsl: remove special case subroutine type counting

Unlike samplers we can get the correct value for subroutines from
component_slots()

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/glsl/link_uniforms.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index 254086dc050..a0cb6182925 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -47,10 +47,9 @@
 static unsigned
 values_for_type(const glsl_type *type)
 {
-   if (type->is_sampler() || type->is_subroutine()) {
+   if (type->is_sampler()) {
       return 1;
-   } else if (type->is_array() && (type->fields.array->is_sampler() ||
-                                   type->fields.array->is_subroutine())) {
+   } else if (type->is_array() && type->fields.array->is_sampler()) {
       return type->array_size();
    } else {
       return type->component_slots();

From 03b7ec877843cd622717b01c1047e08baf34facf Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Sun, 30 Aug 2015 20:40:31 +1000
Subject: [PATCH 74/82] r600: move prim convert from geom shader to function.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This should avoid C++ build failures when including this header.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/r600_pipe.h         | 26 +-------------------
 src/gallium/drivers/r600/r600_state_common.c | 25 +++++++++++++++++++
 2 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/src/gallium/drivers/r600/r600_pipe.h b/src/gallium/drivers/r600/r600_pipe.h
index 4bd3d7cf75b..ee3e928861b 100644
--- a/src/gallium/drivers/r600/r600_pipe.h
+++ b/src/gallium/drivers/r600/r600_pipe.h
@@ -944,29 +944,5 @@ static inline bool r600_can_read_depth(struct r600_texture *rtex)
 #define     V_028A6C_OUTPRIM_TYPE_LINESTRIP            1
 #define     V_028A6C_OUTPRIM_TYPE_TRISTRIP             2
 
-static inline unsigned r600_conv_prim_to_gs_out(unsigned mode)
-{
-	static const int prim_conv[] = {
-		[PIPE_PRIM_POINTS]			= V_028A6C_OUTPRIM_TYPE_POINTLIST,
-		[PIPE_PRIM_LINES]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		[PIPE_PRIM_LINE_LOOP]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		[PIPE_PRIM_LINE_STRIP]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		[PIPE_PRIM_TRIANGLES]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_TRIANGLE_STRIP]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_TRIANGLE_FAN]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_QUADS]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_QUAD_STRIP]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_POLYGON]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_LINES_ADJACENCY]		= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		[PIPE_PRIM_LINE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
-		[PIPE_PRIM_TRIANGLES_ADJACENCY]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
-		[PIPE_PRIM_PATCHES]			= V_028A6C_OUTPRIM_TYPE_POINTLIST,
-		[R600_PRIM_RECTANGLE_LIST]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP
-	};
-	assert(mode < Elements(prim_conv));
-
-	return prim_conv[mode];
-}
-
+unsigned r600_conv_prim_to_gs_out(unsigned mode);
 #endif
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index d9cf736b043..a65064945cf 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -124,6 +124,31 @@ static unsigned r600_conv_pipe_prim(unsigned prim)
 	return prim_conv[prim];
 }
 
+unsigned r600_conv_prim_to_gs_out(unsigned mode)
+{
+	static const int prim_conv[] = {
+		[PIPE_PRIM_POINTS]			= V_028A6C_OUTPRIM_TYPE_POINTLIST,
+		[PIPE_PRIM_LINES]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_LINE_LOOP]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_LINE_STRIP]			= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_TRIANGLES]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_TRIANGLE_STRIP]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_TRIANGLE_FAN]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_QUADS]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_QUAD_STRIP]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_POLYGON]			= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_LINES_ADJACENCY]		= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_LINE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_LINESTRIP,
+		[PIPE_PRIM_TRIANGLES_ADJACENCY]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]	= V_028A6C_OUTPRIM_TYPE_TRISTRIP,
+		[PIPE_PRIM_PATCHES]			= V_028A6C_OUTPRIM_TYPE_POINTLIST,
+		[R600_PRIM_RECTANGLE_LIST]		= V_028A6C_OUTPRIM_TYPE_TRISTRIP
+	};
+	assert(mode < Elements(prim_conv));
+
+	return prim_conv[mode];
+}
+
 /* common state between evergreen and r600 */
 
 static void r600_bind_blend_state_internal(struct r600_context *rctx,

From 46968c114003b30be335adccbc30445aca9b5dea Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 27 Aug 2015 01:01:00 +0100
Subject: [PATCH 75/82] st/mesa: cache tgsi opcode info in the instruction
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Instead of looking this up repeatedly, let's just cache it in the instruction
translation up front. I just noticed this function was high in a profile
of shader-db on radeonsi.

Reviewed-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 41 +++++++++-------------
 1 file changed, 17 insertions(+), 24 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 95a25c12fb4..db6a5a417ea 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -262,6 +262,7 @@ public:
    int dead_mask; /**< Used in dead code elimination */
 
    class function_entry *function; /* Set on TGSI_OPCODE_CAL or TGSI_OPCODE_BGNSUB */
+   const struct tgsi_opcode_info *info;
 };
 
 class variable_storage : public exec_node {
@@ -530,25 +531,16 @@ swizzle_for_size(int size)
    return size_swizzles[size - 1];
 }
 
-static bool
-is_tex_instruction(unsigned opcode)
+static unsigned
+num_inst_dst_regs(const glsl_to_tgsi_instruction *op)
 {
-   const tgsi_opcode_info* info = tgsi_get_opcode_info(opcode);
-   return info->is_tex;
+   return op->info->num_dst;
 }
 
 static unsigned
-num_inst_dst_regs(unsigned opcode)
+num_inst_src_regs(const glsl_to_tgsi_instruction *op)
 {
-   const tgsi_opcode_info* info = tgsi_get_opcode_info(opcode);
-   return info->num_dst;
-}
-
-static unsigned
-num_inst_src_regs(unsigned opcode)
-{
-   const tgsi_opcode_info* info = tgsi_get_opcode_info(opcode);
-   return info->is_tex ? info->num_src - 1 : info->num_src;
+   return op->info->is_tex ? op->info->num_src - 1 : op->info->num_src;
 }
 
 glsl_to_tgsi_instruction *
@@ -592,6 +584,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
    assert(num_reladdr == 0);
 
    inst->op = op;
+   inst->info = tgsi_get_opcode_info(op);
    inst->dst[0] = dst;
    inst->dst[1] = dst1;
    inst->src[0] = src0;
@@ -3564,7 +3557,7 @@ count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
    v->samplers_used = 0;
 
    foreach_in_list(glsl_to_tgsi_instruction, inst, &v->instructions) {
-      if (is_tex_instruction(inst->op)) {
+      if (inst->info->is_tex) {
          for (int i = 0; i < inst->sampler_array_size; i++) {
             unsigned idx = inst->sampler.index + i;
             v->samplers_used |= 1 << idx;
@@ -3699,7 +3692,7 @@ glsl_to_tgsi_visitor::rename_temp_register(int index, int new_index)
    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
       unsigned j;
 
-      for (j = 0; j < num_inst_src_regs(inst->op); j++) {
+      for (j = 0; j < num_inst_src_regs(inst); j++) {
          if (inst->src[j].file == PROGRAM_TEMPORARY &&
              inst->src[j].index == index) {
             inst->src[j].index = new_index;
@@ -3713,7 +3706,7 @@ glsl_to_tgsi_visitor::rename_temp_register(int index, int new_index)
          }
       }
 
-      for (j = 0; j < num_inst_dst_regs(inst->op); j++) {
+      for (j = 0; j < num_inst_dst_regs(inst); j++) {
          if (inst->dst[j].file == PROGRAM_TEMPORARY && inst->dst[j].index == index) {
             inst->dst[j].index = new_index;
          }
@@ -3729,7 +3722,7 @@ glsl_to_tgsi_visitor::get_first_temp_read(int index)
    unsigned i = 0, j;
 
    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
-      for (j = 0; j < num_inst_src_regs(inst->op); j++) {
+      for (j = 0; j < num_inst_src_regs(inst); j++) {
          if (inst->src[j].file == PROGRAM_TEMPORARY &&
              inst->src[j].index == index) {
             return (depth == 0) ? i : loop_start;
@@ -3763,7 +3756,7 @@ glsl_to_tgsi_visitor::get_first_temp_write(int index)
    unsigned j;
 
    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
-      for (j = 0; j < num_inst_dst_regs(inst->op); j++) {
+      for (j = 0; j < num_inst_dst_regs(inst); j++) {
          if (inst->dst[j].file == PROGRAM_TEMPORARY && inst->dst[j].index == index) {
             return (depth == 0) ? i : loop_start;
          }
@@ -3789,7 +3782,7 @@ glsl_to_tgsi_visitor::get_last_temp_read(int index)
    unsigned i = 0, j;
 
    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
-      for (j = 0; j < num_inst_src_regs(inst->op); j++) {
+      for (j = 0; j < num_inst_src_regs(inst); j++) {
          if (inst->src[j].file == PROGRAM_TEMPORARY &&
              inst->src[j].index == index) {
             last = (depth == 0) ? i : -2;
@@ -3821,7 +3814,7 @@ glsl_to_tgsi_visitor::get_last_temp_write(int index)
    unsigned j;
 
    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
-      for (j = 0; j < num_inst_dst_regs(inst->op); j++) {
+      for (j = 0; j < num_inst_dst_regs(inst); j++) {
          if (inst->dst[j].file == PROGRAM_TEMPORARY && inst->dst[j].index == index)
             last = (depth == 0) ? i : -2;
       }
@@ -4219,7 +4212,7 @@ glsl_to_tgsi_visitor::merge_two_dsts(void)
    foreach_in_list_safe(glsl_to_tgsi_instruction, inst, &this->instructions) {
       glsl_to_tgsi_instruction *inst2;
       bool merged;
-      if (num_inst_dst_regs(inst->op) != 2)
+      if (num_inst_dst_regs(inst) != 2)
          continue;
 
       if (inst->dst[0].file != PROGRAM_UNDEFINED &&
@@ -4995,8 +4988,8 @@ compile_tgsi_instruction(struct st_translate *t,
    unsigned num_src;
    unsigned tex_target;
 
-   num_dst = num_inst_dst_regs(inst->op);
-   num_src = num_inst_src_regs(inst->op);
+   num_dst = num_inst_dst_regs(inst);
+   num_src = num_inst_src_regs(inst);
 
    for (i = 0; i < num_dst; i++)
       dst[i] = translate_dst(t,

From aee73f2942eff2ffb4a0497ac81f01a3b00294b8 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 27 Aug 2015 01:46:33 +0100
Subject: [PATCH 76/82] st/mesa: reduce time spent in calculating temp
 read/writes
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The glsl->tgsi converter does some temporary register reduction;
however, in profiling shader-db this shows up quite highly.

So optimise things to reduce the number of loops through
all the instructions we do. This drops merge_registers
from 4-5% on the profile to 1%. I think this can be reduced
further by possibly optimising the renumber pass.

Acked-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 151 +++++++++++----------
 1 file changed, 78 insertions(+), 73 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index db6a5a417ea..9174b41d8c9 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -480,10 +480,9 @@ public:
    void simplify_cmp(void);
 
    void rename_temp_register(int index, int new_index);
-   int get_first_temp_read(int index);
-   int get_first_temp_write(int index);
-   int get_last_temp_read(int index);
-   int get_last_temp_write(int index);
+   void get_first_temp_read(int *first_reads);
+   void get_last_temp_read_first_temp_write(int *last_reads, int *first_writes);
+   void get_last_temp_write(int *last_writes);
 
    void copy_propagate(void);
    int eliminate_dead_code(void);
@@ -3714,8 +3713,8 @@ glsl_to_tgsi_visitor::rename_temp_register(int index, int new_index)
    }
 }
 
-int
-glsl_to_tgsi_visitor::get_first_temp_read(int index)
+void
+glsl_to_tgsi_visitor::get_first_temp_read(int *first_reads)
 {
    int depth = 0; /* loop depth */
    int loop_start = -1; /* index of the first active BGNLOOP (if any) */
@@ -3723,15 +3722,15 @@ glsl_to_tgsi_visitor::get_first_temp_read(int index)
 
    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
       for (j = 0; j < num_inst_src_regs(inst); j++) {
-         if (inst->src[j].file == PROGRAM_TEMPORARY &&
-             inst->src[j].index == index) {
-            return (depth == 0) ? i : loop_start;
+         if (inst->src[j].file == PROGRAM_TEMPORARY) {
+            if (first_reads[inst->src[j].index] == -1)
+                first_reads[inst->src[j].index] = (depth == 0) ? i : loop_start;
          }
       }
       for (j = 0; j < inst->tex_offset_num_offset; j++) {
-         if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY &&
-             inst->tex_offsets[j].index == index) {
-            return (depth == 0) ? i : loop_start;
+         if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY) {
+            if (first_reads[inst->tex_offsets[j].index] == -1)
+               first_reads[inst->tex_offsets[j].index] = (depth == 0) ? i : loop_start;
          }
       }
       if (inst->op == TGSI_OPCODE_BGNLOOP) {
@@ -3744,91 +3743,73 @@ glsl_to_tgsi_visitor::get_first_temp_read(int index)
       assert(depth >= 0);
       i++;
    }
-   return -1;
 }
 
-int
-glsl_to_tgsi_visitor::get_first_temp_write(int index)
+void
+glsl_to_tgsi_visitor::get_last_temp_read_first_temp_write(int *last_reads, int *first_writes)
 {
    int depth = 0; /* loop depth */
    int loop_start = -1; /* index of the first active BGNLOOP (if any) */
-   int i = 0;
-   unsigned j;
-
+   unsigned i = 0, j;
+   int k;
    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
+      for (j = 0; j < num_inst_src_regs(inst); j++) {
+         if (inst->src[j].file == PROGRAM_TEMPORARY)
+            last_reads[inst->src[j].index] = (depth == 0) ? i : -2;
+      }
       for (j = 0; j < num_inst_dst_regs(inst); j++) {
-         if (inst->dst[j].file == PROGRAM_TEMPORARY && inst->dst[j].index == index) {
-            return (depth == 0) ? i : loop_start;
-         }
+         if (inst->dst[j].file == PROGRAM_TEMPORARY)
+            if (first_writes[inst->dst[j].index] == -1)
+               first_writes[inst->dst[j].index] = (depth == 0) ? i : loop_start;
+      }
+      for (j = 0; j < inst->tex_offset_num_offset; j++) {
+         if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY)
+            last_reads[inst->tex_offsets[j].index] = (depth == 0) ? i : -2;
       }
       if (inst->op == TGSI_OPCODE_BGNLOOP) {
          if(depth++ == 0)
             loop_start = i;
       } else if (inst->op == TGSI_OPCODE_ENDLOOP) {
-         if (--depth == 0)
+         if (--depth == 0) {
             loop_start = -1;
-      }
-      assert(depth >= 0);
-      i++;
-   }
-   return -1;
-}
-
-int
-glsl_to_tgsi_visitor::get_last_temp_read(int index)
-{
-   int depth = 0; /* loop depth */
-   int last = -1; /* index of last instruction that reads the temporary */
-   unsigned i = 0, j;
-
-   foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
-      for (j = 0; j < num_inst_src_regs(inst); j++) {
-         if (inst->src[j].file == PROGRAM_TEMPORARY &&
-             inst->src[j].index == index) {
-            last = (depth == 0) ? i : -2;
+            for (k = 0; k < this->next_temp; k++) {
+               if (last_reads[k] == -2) {
+                  last_reads[k] = i;
+               }
+            }
          }
       }
-      for (j = 0; j < inst->tex_offset_num_offset; j++) {
-          if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY &&
-              inst->tex_offsets[j].index == index)
-              last = (depth == 0) ? i : -2;
-      }
-      if (inst->op == TGSI_OPCODE_BGNLOOP)
-         depth++;
-      else if (inst->op == TGSI_OPCODE_ENDLOOP)
-         if (--depth == 0 && last == -2)
-            last = i;
       assert(depth >= 0);
       i++;
    }
-   assert(last >= -1);
-   return last;
 }
 
-int
-glsl_to_tgsi_visitor::get_last_temp_write(int index)
+void
+glsl_to_tgsi_visitor::get_last_temp_write(int *last_writes)
 {
    int depth = 0; /* loop depth */
-   int last = -1; /* index of last instruction that writes to the temporary */
-   int i = 0;
+   int i = 0, k;
    unsigned j;
 
    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
       for (j = 0; j < num_inst_dst_regs(inst); j++) {
-         if (inst->dst[j].file == PROGRAM_TEMPORARY && inst->dst[j].index == index)
-            last = (depth == 0) ? i : -2;
+         if (inst->dst[j].file == PROGRAM_TEMPORARY)
+            last_writes[inst->dst[j].index] = (depth == 0) ? i : -2;
       }
 
       if (inst->op == TGSI_OPCODE_BGNLOOP)
          depth++;
       else if (inst->op == TGSI_OPCODE_ENDLOOP)
-         if (--depth == 0 && last == -2)
-            last = i;
+         if (--depth == 0) {
+            for (k = 0; k < this->next_temp; k++) {
+               if (last_writes[k] == -2) {
+                  last_writes[k] = i;
+               }
+            }
+         }
       assert(depth >= 0);
       i++;
    }
-   assert(last >= -1);
-   return last;
 }
 
 /*
@@ -4264,9 +4245,10 @@ glsl_to_tgsi_visitor::merge_registers(void)
     * into an array so that we don't have to traverse the instruction list as
     * much. */
    for (i = 0; i < this->next_temp; i++) {
-      last_reads[i] = get_last_temp_read(i);
-      first_writes[i] = get_first_temp_write(i);
+      last_reads[i] = -1;
+      first_writes[i] = -1;
    }
+   get_last_temp_read_first_temp_write(last_reads, first_writes);
 
    /* Start looking for registers with non-overlapping usages that can be
     * merged together. */
@@ -4307,15 +4289,21 @@ glsl_to_tgsi_visitor::renumber_registers(void)
 {
    int i = 0;
    int new_index = 0;
+   int *first_reads = rzalloc_array(mem_ctx, int, this->next_temp);
+
+   for (i = 0; i < this->next_temp; i++)
+      first_reads[i] = -1;
+   get_first_temp_read(first_reads);
 
    for (i = 0; i < this->next_temp; i++) {
-      if (get_first_temp_read(i) < 0) continue;
+      if (first_reads[i] < 0) continue;
       if (i != new_index)
          rename_temp_register(i, new_index);
       new_index++;
    }
 
    this->next_temp = new_index;
+   ralloc_free(first_reads);
 }
 
 /**
@@ -5790,14 +5778,31 @@ get_mesa_program(struct gl_context *ctx,
 #if 0
    /* Print out some information (for debugging purposes) used by the
     * optimization passes. */
-   for (i = 0; i < v->next_temp; i++) {
-      int fr = v->get_first_temp_read(i);
-      int fw = v->get_first_temp_write(i);
-      int lr = v->get_last_temp_read(i);
-      int lw = v->get_last_temp_write(i);
+   {
+      int i;
+      int *first_writes = rzalloc_array(v->mem_ctx, int, v->next_temp);
+      int *first_reads = rzalloc_array(v->mem_ctx, int, v->next_temp);
+      int *last_writes = rzalloc_array(v->mem_ctx, int, v->next_temp);
+      int *last_reads = rzalloc_array(v->mem_ctx, int, v->next_temp);
 
-      printf("Temp %d: FR=%3d FW=%3d LR=%3d LW=%3d\n", i, fr, fw, lr, lw);
-      assert(fw <= fr);
+      for (i = 0; i < v->next_temp; i++) {
+         first_writes[i] = -1;
+         first_reads[i] = -1;
+         last_writes[i] = -1;
+         last_reads[i] = -1;
+      }
+      v->get_first_temp_read(first_reads);
+      v->get_last_temp_read_first_temp_write(last_reads, first_writes);
+      v->get_last_temp_write(last_writes);
+      for (i = 0; i < v->next_temp; i++)
+         printf("Temp %d: FR=%3d FW=%3d LR=%3d LW=%3d\n", i, first_reads[i],
+                first_writes[i],
+                last_reads[i],
+                last_writes[i]);
+      ralloc_free(first_writes);
+      ralloc_free(first_reads);
+      ralloc_free(last_writes);
+      ralloc_free(last_reads);
    }
 #endif
 

From 78027c965a50719959df821b7f545db191574724 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 27 Aug 2015 02:13:14 +0100
Subject: [PATCH 77/82] st/mesa: move to renumbering registers in a group
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This can be done with a single pass for the instruction base,
and takes renumber_registers out of its spot on the profile.

Acked-by: Marek Olšák <marek.olsak@amd.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 57 ++++++++++++++--------
 1 file changed, 38 insertions(+), 19 deletions(-)

diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 9174b41d8c9..6c9f9477a17 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -336,6 +336,11 @@ struct array_decl {
    unsigned array_size;
 };
 
+struct rename_reg_pair {
+   int old_reg;
+   int new_reg;
+};
+
 struct glsl_to_tgsi_visitor : public ir_visitor {
 public:
    glsl_to_tgsi_visitor();
@@ -479,7 +484,7 @@ public:
 
    void simplify_cmp(void);
 
-   void rename_temp_register(int index, int new_index);
+   void rename_temp_registers(int num_renames, struct rename_reg_pair *renames);
    void get_first_temp_read(int *first_reads);
    void get_last_temp_read_first_temp_write(int *last_reads, int *first_writes);
    void get_last_temp_write(int *last_writes);
@@ -3686,29 +3691,30 @@ glsl_to_tgsi_visitor::simplify_cmp(void)
 
 /* Replaces all references to a temporary register index with another index. */
 void
-glsl_to_tgsi_visitor::rename_temp_register(int index, int new_index)
+glsl_to_tgsi_visitor::rename_temp_registers(int num_renames, struct rename_reg_pair *renames)
 {
    foreach_in_list(glsl_to_tgsi_instruction, inst, &this->instructions) {
       unsigned j;
-
+      int k;
       for (j = 0; j < num_inst_src_regs(inst); j++) {
-         if (inst->src[j].file == PROGRAM_TEMPORARY &&
-             inst->src[j].index == index) {
-            inst->src[j].index = new_index;
-         }
+         if (inst->src[j].file == PROGRAM_TEMPORARY)
+            for (k = 0; k < num_renames; k++)
+               if (inst->src[j].index == renames[k].old_reg)
+                  inst->src[j].index = renames[k].new_reg;
       }
 
       for (j = 0; j < inst->tex_offset_num_offset; j++) {
-         if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY &&
-             inst->tex_offsets[j].index == index) {
-            inst->tex_offsets[j].index = new_index;
-         }
+         if (inst->tex_offsets[j].file == PROGRAM_TEMPORARY)
+            for (k = 0; k < num_renames; k++)
+               if (inst->tex_offsets[j].index == renames[k].old_reg)
+                  inst->tex_offsets[j].index = renames[k].new_reg;
       }
 
       for (j = 0; j < num_inst_dst_regs(inst); j++) {
-         if (inst->dst[j].file == PROGRAM_TEMPORARY && inst->dst[j].index == index) {
-            inst->dst[j].index = new_index;
-         }
+         if (inst->dst[j].file == PROGRAM_TEMPORARY)
+             for (k = 0; k < num_renames; k++)
+                if (inst->dst[j].index == renames[k].old_reg)
+                   inst->dst[j].index = renames[k].new_reg;
       }
    }
 }
@@ -4239,7 +4245,9 @@ glsl_to_tgsi_visitor::merge_registers(void)
 {
    int *last_reads = rzalloc_array(mem_ctx, int, this->next_temp);
    int *first_writes = rzalloc_array(mem_ctx, int, this->next_temp);
+   struct rename_reg_pair *renames = rzalloc_array(mem_ctx, struct rename_reg_pair, this->next_temp);
    int i, j;
+   int num_renames = 0;
 
    /* Read the indices of the last read and first write to each temp register
     * into an array so that we don't have to traverse the instruction list as
@@ -4266,7 +4274,9 @@ glsl_to_tgsi_visitor::merge_registers(void)
           * as the register at index j. */
          if (first_writes[i] <= first_writes[j] &&
              last_reads[i] <= first_writes[j]) {
-            rename_temp_register(j, i); /* Replace all references to j with i.*/
+            renames[num_renames].old_reg = j;
+            renames[num_renames].new_reg = i;
+            num_renames++;
 
             /* Update the first_writes and last_reads arrays with the new
              * values for the merged register index, and mark the newly unused
@@ -4278,6 +4288,8 @@ glsl_to_tgsi_visitor::merge_registers(void)
       }
    }
 
+   rename_temp_registers(num_renames, renames);
+   ralloc_free(renames);
    ralloc_free(last_reads);
    ralloc_free(first_writes);
 }
@@ -4290,19 +4302,26 @@ glsl_to_tgsi_visitor::renumber_registers(void)
    int i = 0;
    int new_index = 0;
    int *first_reads = rzalloc_array(mem_ctx, int, this->next_temp);
-
-   for (i = 0; i < this->next_temp; i++)
+   struct rename_reg_pair *renames = rzalloc_array(mem_ctx, struct rename_reg_pair, this->next_temp);
+   int num_renames = 0;
+   for (i = 0; i < this->next_temp; i++) {
       first_reads[i] = -1;
+   }
    get_first_temp_read(first_reads);
 
    for (i = 0; i < this->next_temp; i++) {
       if (first_reads[i] < 0) continue;
-      if (i != new_index)
-         rename_temp_register(i, new_index);
+      if (i != new_index) {
+         renames[num_renames].old_reg = i;
+         renames[num_renames].new_reg = new_index;
+         num_renames++;
+      }
       new_index++;
    }
 
+   rename_temp_registers(num_renames, renames);
    this->next_temp = new_index;
+   ralloc_free(renames);
    ralloc_free(first_reads);
 }
 

From d2e3638ef9e2ddf7e02b9fbe3fa8d40c63ebe5da Mon Sep 17 00:00:00 2001
From: Ben Widawsky <benjamin.widawsky@intel.com>
Date: Wed, 8 Jul 2015 17:04:10 -0700
Subject: [PATCH 78/82] i965/chv|skl: Apply sampler bypass w/a

Certain compressed formats require this setting. The docs don't go into much
detail as to why it's needed exactly.

This patch introduces no piglit regressions on gen9 (bsw is untested). Note that
the SKL "regressions" are fixed tests, and the egl_khr_gl_colorspace tests are
WTF. The patch also fixes nothing I can find.
http://otc-mesa-ci.jf.intel.com/job/Leeroy/127820/

v2:
Reworded commit message (Matt); Added piglit results link.
Restructured condition (Matt)
Moved check out to function (Nanley). I left the setting of the bit in the
  surface state open coded because it seems to go better with the existing code.

v3:
Use and inline function only in gen8_emit_texture_surface_state() (Matt).

Cc: Matt Turner <mattst88@gmail.com>
Cc: Nanley Chery <nanleychery@gmail.com>
Signed-off-by: Ben Widawsky <ben@bwidawsk.net>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_defines.h        |  1 +
 src/mesa/drivers/dri/i965/gen8_surface_state.c | 14 ++++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index cb5c82a002d..07fe1983ef2 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -276,6 +276,7 @@
 #define GEN8_SURFACE_TILING_W                       (1 << 12)
 #define GEN8_SURFACE_TILING_X                       (2 << 12)
 #define GEN8_SURFACE_TILING_Y                       (3 << 12)
+#define GEN8_SURFACE_SAMPLER_L2_BYPASS_DISABLE      (1 << 9)
 #define BRW_SURFACE_RC_READ_WRITE	(1 << 8)
 #define BRW_SURFACE_MIPLAYOUT_SHIFT	10
 #define BRW_SURFACE_MIPMAPLAYOUT_BELOW   0
diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index 6c4d3e197a5..d2f333fd4dd 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -238,6 +238,20 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
       surf[0] |= BRW_SURFACE_CUBEFACE_ENABLES;
    }
 
+   /* From the CHV PRM, Volume 2d, page 321 (RENDER_SURFACE_STATE dword 0
+    * bit 9 "Sampler L2 Bypass Mode Disable" Programming Notes):
+    *
+    *    This bit must be set for the following surface types: BC2_UNORM
+    *    BC3_UNORM BC5_UNORM BC5_SNORM BC7_UNORM
+    */
+   if ((brw->gen >= 9 || brw->is_cherryview) &&
+       (format == BRW_SURFACEFORMAT_BC2_UNORM ||
+        format == BRW_SURFACEFORMAT_BC3_UNORM ||
+        format == BRW_SURFACEFORMAT_BC5_UNORM ||
+        format == BRW_SURFACEFORMAT_BC5_SNORM ||
+        format == BRW_SURFACEFORMAT_BC7_UNORM))
+      surf[0] |= GEN8_SURFACE_SAMPLER_L2_BYPASS_DISABLE;
+
    if (_mesa_is_array_texture(target) || target == GL_TEXTURE_CUBE_MAP)
       surf[0] |= GEN8_SURFACE_IS_ARRAY;
 

From a4ba41638d41865ef34bf36a525efcf8102c01ee Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Fri, 28 Aug 2015 17:10:00 -0700
Subject: [PATCH 79/82] i965/fs: Use greater-equal cmod to implement maximum.

The docs specifically call out SEL with .l and .ge as the
implementations of MIN and MAX respectively. Among other things,
SEL with these conditional mods are commutative.

See commit 3b7f683f.

Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_fs_builder.h           | 2 ++
 src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp | 8 ++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
index 34545eaa0fb..df10a9de293 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_builder.h
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -372,6 +372,8 @@ namespace brw {
       emit_minmax(const dst_reg &dst, const src_reg &src0,
                   const src_reg &src1, brw_conditional_mod mod) const
       {
+         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
+
          if (shader->devinfo->gen >= 6) {
             set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
                                  fix_unsigned_negate(src1)));
diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
index 50e0acd05f5..727e8d1b82a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -686,7 +686,7 @@ namespace {
                if (is_signed)
                   bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
                                   fs_reg(-(int)scale(widths[c] - s) - 1),
-                                  BRW_CONDITIONAL_G);
+                                  BRW_CONDITIONAL_GE);
             }
          }
 
@@ -717,7 +717,7 @@ namespace {
                if (is_signed)
                   bld.emit_minmax(offset(dst, bld, c),
                                   offset(dst, bld, c), fs_reg(-1.0f),
-                                  BRW_CONDITIONAL_G);
+                                  BRW_CONDITIONAL_GE);
             }
          }
          return dst;
@@ -741,7 +741,7 @@ namespace {
                /* Clamp the normalized floating-point argument. */
                if (is_signed) {
                   bld.emit_minmax(offset(fdst, bld, c), offset(src, bld, c),
-                                  fs_reg(-1.0f), BRW_CONDITIONAL_G);
+                                  fs_reg(-1.0f), BRW_CONDITIONAL_GE);
 
                   bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
                                   fs_reg(1.0f), BRW_CONDITIONAL_L);
@@ -812,7 +812,7 @@ namespace {
                /* Clamp to the minimum value. */
                if (widths[c] < 16)
                   bld.emit_minmax(offset(fdst, bld, c), offset(fdst, bld, c),
-                                  fs_reg(0.0f), BRW_CONDITIONAL_G);
+                                  fs_reg(0.0f), BRW_CONDITIONAL_GE);
 
                /* Convert to 16-bit floating-point. */
                bld.F32TO16(offset(dst, bld, c), offset(fdst, bld, c));

From 3063913f77cd2db1a263cb824a5c8c3dcc1a51a0 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 31 Aug 2015 14:22:23 +1000
Subject: [PATCH 80/82] r600/sb: update last_cf for finalize if.

As Glenn did for finalize_loop, we need to update last_cf when we
add a POP at the end of a shader.

I think this fixes one of the earlier shader going off end
of memory problems we've stopped.

Reviewed-by: Glenn Kennard <glenn.kennard@gmail.com>
Cc: "10.6" "11.0" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/r600/sb/sb_bc_finalize.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
index 8c2cd1460e5..dadee456a1f 100644
--- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
+++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp
@@ -199,6 +199,9 @@ void bc_finalizer::finalize_if(region_node* r) {
 		cf_node *if_jump = sh.create_cf(CF_OP_JUMP);
 		cf_node *if_pop = sh.create_cf(CF_OP_POP);
 
+		if (!last_cf || last_cf->get_parent_region() == r) {
+			last_cf = if_pop;
+		}
 		if_pop->bc.pop_count = 1;
 		if_pop->jump_after(if_pop);
 

From 01024ded1e791b33353ffa09d4e3dfb5b638179d Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Wed, 26 Aug 2015 12:01:38 -0700
Subject: [PATCH 81/82] mesa/texcompress: correct mapping of S3TC formats in
 conversion function

MESA_FORMAT_RGBA_DXT5 should actually be reserved for GL_RGBA[4]_DXT5_S3TC.
Also, Gallium and other dri drivers (radeon and nouveau) follow this mapping
scheme.

Reviewed-by: Chad Versace <chad.versace@intel.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/main/texcompress.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/texcompress.c b/src/mesa/main/texcompress.c
index bb94137a940..394c8bab214 100644
--- a/src/mesa/main/texcompress.c
+++ b/src/mesa/main/texcompress.c
@@ -400,15 +400,15 @@ _mesa_glenum_to_compressed_format(GLenum format)
 
    case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
    case GL_RGB_S3TC:
+   case GL_RGB4_S3TC:
       return MESA_FORMAT_RGB_DXT1;
    case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-   case GL_RGB4_S3TC:
       return MESA_FORMAT_RGBA_DXT1;
    case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
    case GL_RGBA_S3TC:
+   case GL_RGBA4_S3TC:
       return MESA_FORMAT_RGBA_DXT3;
    case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-   case GL_RGBA4_S3TC:
       return MESA_FORMAT_RGBA_DXT5;
 
    case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT:

From 76f17266ec68c644b2609d355018329636ae8e75 Mon Sep 17 00:00:00 2001
From: Nanley Chery <nanley.g.chery@intel.com>
Date: Wed, 12 Aug 2015 14:41:50 -0700
Subject: [PATCH 82/82] mesa/texformat: use format conversion function in
 _mesa_choose_tex_format

This function's cases for non-generic compressed formats duplicate
the GL to MESA translation in _mesa_glenum_to_compressed_format().
This patch replaces the switch cases with a call to the translation
function. This change teaches this function about ASTC, thus enabling
ASTC for glTex*Storage*() calls.

Reviewed-by: Chad Versace <chad.versace@intel.com>
Signed-off-by: Nanley Chery <nanley.g.chery@intel.com>
---
 src/mesa/main/texformat.c | 94 ++++++---------------------------------
 1 file changed, 13 insertions(+), 81 deletions(-)

diff --git a/src/mesa/main/texformat.c b/src/mesa/main/texformat.c
index f4d17e1bdb5..fd9f335a767 100644
--- a/src/mesa/main/texformat.c
+++ b/src/mesa/main/texformat.c
@@ -38,6 +38,7 @@
 #include "mtypes.h"
 #include "texcompress.h"
 #include "texformat.h"
+#include "glformats.h"
 
 #define RETURN_IF_SUPPORTED(f) do {		\
    if (ctx->TextureFormatSupported[f])		\
@@ -276,87 +277,6 @@ _mesa_choose_tex_format(struct gl_context *ctx, GLenum target,
          RETURN_IF_SUPPORTED(MESA_FORMAT_YCBCR_REV);
       break;
 
-   /* For non-generic compressed format we assert two things:
-    *
-    * 1. The format has already been validated against the set of available
-    *    extensions.
-    *
-    * 2. The driver only enables the extension if it supports all of the
-    *    formats that are part of that extension.
-    */
-   case GL_COMPRESSED_RGB_FXT1_3DFX:
-      return MESA_FORMAT_RGB_FXT1;
-   case GL_COMPRESSED_RGBA_FXT1_3DFX:
-      return MESA_FORMAT_RGBA_FXT1;
-   case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-   case GL_RGB_S3TC:
-   case GL_RGB4_S3TC:
-      return MESA_FORMAT_RGB_DXT1;
-   case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-      return MESA_FORMAT_RGBA_DXT1;
-   case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
-   case GL_RGBA_S3TC:
-   case GL_RGBA4_S3TC:
-      return MESA_FORMAT_RGBA_DXT3;
-   case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-      return MESA_FORMAT_RGBA_DXT5;
-   case GL_COMPRESSED_RED_RGTC1:
-      return MESA_FORMAT_R_RGTC1_UNORM;
-   case GL_COMPRESSED_SIGNED_RED_RGTC1:
-      return MESA_FORMAT_R_RGTC1_SNORM;
-   case GL_COMPRESSED_RG_RGTC2:
-      return MESA_FORMAT_RG_RGTC2_UNORM;
-   case GL_COMPRESSED_SIGNED_RG_RGTC2:
-      return MESA_FORMAT_RG_RGTC2_SNORM;
-   case GL_COMPRESSED_LUMINANCE_LATC1_EXT:
-      return MESA_FORMAT_L_LATC1_UNORM;
-   case GL_COMPRESSED_SIGNED_LUMINANCE_LATC1_EXT:
-      return MESA_FORMAT_L_LATC1_SNORM;
-   case GL_COMPRESSED_LUMINANCE_ALPHA_LATC2_EXT:
-      return MESA_FORMAT_LA_LATC2_UNORM;
-   case GL_COMPRESSED_SIGNED_LUMINANCE_ALPHA_LATC2_EXT:
-      return MESA_FORMAT_LA_LATC2_SNORM;
-   case GL_COMPRESSED_LUMINANCE_ALPHA_3DC_ATI:
-      return MESA_FORMAT_LA_LATC2_UNORM;
-   case GL_ETC1_RGB8_OES:
-      return MESA_FORMAT_ETC1_RGB8;
-   case GL_COMPRESSED_RGB8_ETC2:
-      return MESA_FORMAT_ETC2_RGB8;
-   case GL_COMPRESSED_SRGB8_ETC2:
-      return MESA_FORMAT_ETC2_SRGB8;
-   case GL_COMPRESSED_RGBA8_ETC2_EAC:
-      return MESA_FORMAT_ETC2_RGBA8_EAC;
-   case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC:
-      return MESA_FORMAT_ETC2_SRGB8_ALPHA8_EAC;
-   case GL_COMPRESSED_R11_EAC:
-      return MESA_FORMAT_ETC2_R11_EAC;
-   case GL_COMPRESSED_RG11_EAC:
-      return MESA_FORMAT_ETC2_RG11_EAC;
-   case GL_COMPRESSED_SIGNED_R11_EAC:
-      return MESA_FORMAT_ETC2_SIGNED_R11_EAC;
-   case GL_COMPRESSED_SIGNED_RG11_EAC:
-      return MESA_FORMAT_ETC2_SIGNED_RG11_EAC;
-   case GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
-      return MESA_FORMAT_ETC2_RGB8_PUNCHTHROUGH_ALPHA1;
-   case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
-      return MESA_FORMAT_ETC2_SRGB8_PUNCHTHROUGH_ALPHA1;
-   case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT:
-      return MESA_FORMAT_SRGB_DXT1;
-   case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT:
-      return MESA_FORMAT_SRGBA_DXT1;
-   case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT:
-      return MESA_FORMAT_SRGBA_DXT3;
-   case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT:
-      return MESA_FORMAT_SRGBA_DXT5;
-   case GL_COMPRESSED_RGBA_BPTC_UNORM:
-      return MESA_FORMAT_BPTC_RGBA_UNORM;
-   case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM:
-      return MESA_FORMAT_BPTC_SRGB_ALPHA_UNORM;
-   case GL_COMPRESSED_RGB_BPTC_SIGNED_FLOAT:
-      return MESA_FORMAT_BPTC_RGB_SIGNED_FLOAT;
-   case GL_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT:
-      return MESA_FORMAT_BPTC_RGB_UNSIGNED_FLOAT;
-
    case GL_ALPHA16F_ARB:
       RETURN_IF_SUPPORTED(MESA_FORMAT_A_FLOAT16);
       RETURN_IF_SUPPORTED(MESA_FORMAT_A_FLOAT32);
@@ -844,6 +764,18 @@ _mesa_choose_tex_format(struct gl_context *ctx, GLenum target,
    case GL_BGRA:
       RETURN_IF_SUPPORTED(MESA_FORMAT_B8G8R8A8_UNORM);
       break;
+
+   default:
+      /* For non-generic compressed format we assert two things:
+       *
+       * 1. The format has already been validated against the set of available
+       *    extensions.
+       *
+       * 2. The driver only enables the extension if it supports all of the
+       *    formats that are part of that extension.
+       */
+      if (_mesa_is_compressed_format(ctx, internalFormat))
+         return _mesa_glenum_to_compressed_format(internalFormat);
    }
 
    _mesa_problem(ctx, "unexpected format %s in _mesa_choose_tex_format()",