From 511ce2925baf90c1d93d3e6a389d31e8e7549493 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 23 Mar 2016 22:35:41 -0700
Subject: [PATCH 001/238] mesa: Check glReadBuffer enums against the ES3 table.

From the ES 3.2 spec, section 16.1.1 (Selecting Buffers for Reading):

   "An INVALID_ENUM error is generated if src is not BACK or one of
    the values from table 15.5."

Table 15.5 contains NONE and COLOR_ATTACHMENTi.

Mesa properly returned INVALID_ENUM for unknown enums, but it decided
what was known by using read_buffer_enum_to_index, which handles all
enums in every API.  So enums that were valid in GL were making it
past the "valid enum" check.  Such targets would then be classified
as unsupported, and we'd raise INVALID_OPERATION, but that's technically
the wrong error code.

Fixes dEQP-GLES31's
functional.debug.negative_coverage.get_error.buffer.read_buffer

v2: Only call read_buffer_enum_to_index when required (Eduardo).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Eduardo Lima Mitev <elima@igalia.com>
---
 src/mesa/main/buffers.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c
index 26dafd1b786..a28c5831576 100644
--- a/src/mesa/main/buffers.c
+++ b/src/mesa/main/buffers.c
@@ -222,6 +222,12 @@ read_buffer_enum_to_index(GLenum buffer)
    }
 }
 
+static bool
+is_legal_es3_readbuffer_enum(GLenum buf)
+{
+   return buf == GL_BACK || buf == GL_NONE ||
+          (buf >= GL_COLOR_ATTACHMENT0 && buf <= GL_COLOR_ATTACHMENT31);
+}
 
 /**
  * Called by glDrawBuffer() and glNamedFramebufferDrawBuffer().
@@ -715,7 +721,11 @@ read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
    }
    else {
       /* general case / window-system framebuffer */
-      srcBuffer = read_buffer_enum_to_index(buffer);
+      if (_mesa_is_gles3(ctx) && !is_legal_es3_readbuffer_enum(buffer))
+         srcBuffer = -1;
+      else
+         srcBuffer = read_buffer_enum_to_index(buffer);
+
       if (srcBuffer == -1) {
          _mesa_error(ctx, GL_INVALID_ENUM,
                      "%s(invalid buffer %s)", caller,

From f5e24b19e883281452952ecce3e811cda1f7946c Mon Sep 17 00:00:00 2001
From: Sonny Jiang <sonny.jiang@amd.com>
Date: Tue, 3 Nov 2015 11:46:38 -0500
Subject: [PATCH 002/238] winsys/amdgpu: addrlib - add Polaris support (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: fix indentation as noted by Michel

Signed-off-by: Sonny Jiang <sonny.jiang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
---
 .../winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp       |  8 +++++++-
 src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h |  2 ++
 src/gallium/winsys/amdgpu/drm/amdgpu_id.h              | 10 +++++++++-
 3 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
index 570216241d1..7c5d29a2166 100644
--- a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
@@ -351,6 +351,8 @@ AddrChipFamily CIAddrLib::HwlConvertChipFamily(
             m_settings.isIceland         = ASICREV_IS_ICELAND_M(uChipRevision);
             m_settings.isTonga           = ASICREV_IS_TONGA_P(uChipRevision);
             m_settings.isFiji            = ASICREV_IS_FIJI_P(uChipRevision);
+            m_settings.isPolaris10       = ASICREV_IS_POLARIS10_P(uChipRevision);
+            m_settings.isPolaris11       = ASICREV_IS_POLARIS11_M(uChipRevision);
             break;
         case FAMILY_CZ:
             m_settings.isCarrizo         = 1;
@@ -403,7 +405,7 @@ BOOL_32 CIAddrLib::HwlInitGlobalParams(
 
     // @todo: VI
     // Move this to VI code path once created
-    if (m_settings.isTonga)
+    if (m_settings.isTonga || m_settings.isPolaris10)
     {
         m_pipes = 8;
     }
@@ -415,6 +417,10 @@ BOOL_32 CIAddrLib::HwlInitGlobalParams(
     {
         m_pipes = 16;
     }
+    else if (m_settings.isPolaris11)
+    {
+        m_pipes = 4;
+    }
 
     if (valid)
     {
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
index 4cbe9706baa..de995fa4058 100644
--- a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
@@ -60,6 +60,8 @@ struct CIChipSettings
         UINT_32 isIceland         : 1;
         UINT_32 isTonga           : 1;
         UINT_32 isFiji            : 1;
+        UINT_32 isPolaris10       : 1;
+        UINT_32 isPolaris11       : 1;
         // VI fusion (Carrizo)
         UINT_32 isCarrizo         : 1;
     };
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
index 90fe0cd50f1..40b835c2248 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
@@ -138,6 +138,10 @@ enum {
 
 	VI_FIJI_P_A0      = 60,
 
+	VI_POLARIS10_P_A0 = 80,
+
+	VI_POLARIS11_M_A0 = 90,
+
 	VI_UNKNOWN        = 0xFF
 };
 
@@ -147,7 +151,11 @@ enum {
 #define ASICREV_IS_TONGA_P(eChipRev)	\
 	((eChipRev >= VI_TONGA_P_A0) && (eChipRev < VI_FIJI_P_A0))
 #define ASICREV_IS_FIJI_P(eChipRev)	\
-	(eChipRev >= VI_FIJI_P_A0)
+	((eChipRev >= VI_FIJI_P_A0)  && (eChipRev < VI_POLARIS10_P_A0))
+#define ASICREV_IS_POLARIS10_P(eChipRev)\
+	((eChipRev >= VI_POLARIS10_P_A0) && (eChipRev < VI_POLARIS11_M_A0))
+#define ASICREV_IS_POLARIS11_M(eChipRev)   \
+	(eChipRev >= VI_POLARIS11_M_A0)
 
 /* CZ specific rev IDs */
 enum {

From 42e442d888ce2d3dcb95350d17c298791f5d76cc Mon Sep 17 00:00:00 2001
From: Sonny Jiang <sonny.jiang@amd.com>
Date: Wed, 4 Nov 2015 16:13:07 -0500
Subject: [PATCH 003/238] radeonsi: add support for Polaris (v2)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: Polaris chips should be defined after Stoney

Signed-off-by: Sonny Jiang <sonny.jiang@amd.com> (v1)
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com> (v1)
Signed-off-by: Leo Liu <leo.liu@amd.com> (v2 diff)
Reviewed-by: Alex Deucher <alexander.deucher@amd.com> (v2 diff)
---
 src/gallium/drivers/radeon/r600_pipe_common.c | 9 +++++++++
 src/gallium/drivers/radeon/radeon_winsys.h    | 2 ++
 src/gallium/drivers/radeonsi/si_pipe.c        | 2 ++
 src/gallium/drivers/radeonsi/si_state.c       | 8 ++++++++
 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 8 ++++++++
 5 files changed, 29 insertions(+)

diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index eed9d83ee49..720fc06ece2 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -467,6 +467,8 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
 	case CHIP_ICELAND: return "AMD ICELAND";
 	case CHIP_CARRIZO: return "AMD CARRIZO";
 	case CHIP_FIJI: return "AMD FIJI";
+	case CHIP_POLARIS10: return "AMD POLARIS10";
+	case CHIP_POLARIS11: return "AMD POLARIS11";
 	case CHIP_STONEY: return "AMD STONEY";
 	default: return "AMD unknown";
 	}
@@ -597,6 +599,13 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 #else
 	case CHIP_FIJI: return "fiji";
 	case CHIP_STONEY: return "stoney";
+#endif
+#if HAVE_LLVM <= 0x0308
+	case CHIP_POLARIS10: return "tonga";
+	case CHIP_POLARIS11: return "tonga";
+#else
+	case CHIP_POLARIS10: return "polaris10";
+	case CHIP_POLARIS11: return "polaris11";
 #endif
 	default: return "";
 	}
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index d35e963133e..baecca72383 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -124,6 +124,8 @@ enum radeon_family {
     CHIP_CARRIZO,
     CHIP_FIJI,
     CHIP_STONEY,
+    CHIP_POLARIS10,
+    CHIP_POLARIS11,
     CHIP_LAST,
 };
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index dd1103eed06..ed84dc224ff 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -598,6 +598,8 @@ static bool si_init_gs_info(struct si_screen *sscreen)
 	case CHIP_HAWAII:
 	case CHIP_TONGA:
 	case CHIP_FIJI:
+	case CHIP_POLARIS10:
+	case CHIP_POLARIS11:
 		sscreen->gs_table_depth = 32;
 		return true;
 	default:
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 1245f56c08a..a2b0da90ec9 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -3946,6 +3946,14 @@ static void si_init_config(struct si_context *sctx)
 			raster_config_1 = 0x0000002e;
 		}
 		break;
+	case CHIP_POLARIS10:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x0000002a;
+		break;
+	case CHIP_POLARIS11:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x00000000;
+		break;
 	case CHIP_TONGA:
 		raster_config = 0x16000012;
 		raster_config_1 = 0x0000002a;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 938b9c244b2..87d9a6aebec 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -237,6 +237,14 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws, int fd)
       ws->family = FAMILY_VI;
       ws->rev_id = VI_FIJI_P_A0;
       break;
+   case CHIP_POLARIS10:
+      ws->family = FAMILY_VI;
+      ws->rev_id = VI_POLARIS10_P_A0;
+      break;
+   case CHIP_POLARIS11:
+      ws->family = FAMILY_VI;
+      ws->rev_id = VI_POLARIS11_M_A0;
+      break;
    default:
       fprintf(stderr, "amdgpu: Unknown family.\n");
       goto fail;

From 0c5477465f08502fd81783ce17c449330537eb00 Mon Sep 17 00:00:00 2001
From: Sonny Jiang <sonny.jiang@amd.com>
Date: Tue, 15 Dec 2015 15:16:29 -0500
Subject: [PATCH 004/238] radeon/vce: add Polaris11 VCE firmware support

Signed-off-by: Sonny Jiang <sonny.jiang@amd.com>
---
 src/gallium/drivers/radeon/radeon_vce.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 2ab74e9eb6c..6584393f430 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -50,6 +50,7 @@
 #define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8))
 #define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8))
 #define FW_52_0_3 ((52 << 24) | (0 << 16) | (3 << 8))
+#define FW_52_4_3 ((52 << 24) | (4 << 16) | (3 << 8))
 
 /**
  * flush commands to the hardware
@@ -482,6 +483,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 		break;
 
 	case FW_52_0_3:
+	case FW_52_4_3:
 		radeon_vce_52_init(enc);
 		break;
 
@@ -514,6 +516,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
 	case FW_50_10_2:
 	case FW_50_17_3:
 	case FW_52_0_3:
+	case FW_52_4_3:
 		return true;
 	default:
 		return false;

From f87ed903fb6fd1bdb0cfa7a4dd5b9d00a9f38e31 Mon Sep 17 00:00:00 2001
From: Sonny Jiang <sonny.jiang@amd.com>
Date: Tue, 15 Dec 2015 15:33:40 -0500
Subject: [PATCH 005/238] radeon/vce: disable two pipe mode for Polaris11

Signed-off-by: Sonny Jiang <sonny.jiang@amd.com>
Reviewed-by: Leo Liu <leo.liu@amd.com>
---
 src/gallium/drivers/radeon/radeon_vce.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 6584393f430..99b82ca9409 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -409,7 +409,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
             rscreen->info.drm_major == 3)
 		enc->use_vui = true;
 	if (rscreen->info.family >= CHIP_TONGA &&
-             rscreen->info.family != CHIP_STONEY)
+	    rscreen->info.family != CHIP_STONEY &&
+	    rscreen->info.family != CHIP_POLARIS11)
 		enc->dual_pipe = true;
 	/* TODO enable B frame with dual instance */
 	if ((rscreen->info.family >= CHIP_TONGA) &&

From f00c840578a70e479ffb99f6b64c73dc420179fa Mon Sep 17 00:00:00 2001
From: Sonny Jiang <sonny.jiang@amd.com>
Date: Wed, 4 Nov 2015 11:01:33 -0500
Subject: [PATCH 006/238] radeonsi: add Polaris PCI IDs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Sonny Jiang <sonny.jiang@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com> (Polaris10)
Reviewed-by: Michel Dänzer <michel.daenzer@amd.com> (Polaris11)
---
 include/pci_ids/radeonsi_pci_ids.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/include/pci_ids/radeonsi_pci_ids.h b/include/pci_ids/radeonsi_pci_ids.h
index bcf15a186c6..4df8e9d83a9 100644
--- a/include/pci_ids/radeonsi_pci_ids.h
+++ b/include/pci_ids/radeonsi_pci_ids.h
@@ -182,4 +182,14 @@ CHIPSET(0x9877, CARRIZO_, CARRIZO)
 
 CHIPSET(0x7300, FIJI_, FIJI)
 
+CHIPSET(0x67E0, POLARIS11_, POLARIS11)
+CHIPSET(0x67E1, POLARIS11_, POLARIS11)
+CHIPSET(0x67E8, POLARIS11_, POLARIS11)
+CHIPSET(0x67E9, POLARIS11_, POLARIS11)
+CHIPSET(0x67EB, POLARIS11_, POLARIS11)
+CHIPSET(0x67FF, POLARIS11_, POLARIS11)
+
+CHIPSET(0x67C0, POLARIS10_, POLARIS10)
+CHIPSET(0x67DF, POLARIS10_, POLARIS10)
+
 CHIPSET(0x98E4, STONEY_, STONEY)

From 3a251859903dd567ba81d86f06f5c86933a010af Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Tue, 16 Feb 2016 17:32:34 -0600
Subject: [PATCH 007/238] swr: [rasterizer] Add string knob type

---
 src/gallium/drivers/swr/rasterizer/core/knobs_init.h        | 5 +++++
 .../drivers/swr/rasterizer/scripts/templates/knobs.template | 6 +++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
index 3f19555557f..adf738c1bed 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
@@ -80,6 +80,11 @@ static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue)
     }
 }
 
+static inline void ConvertEnvToKnob(const char* pOverride, std::string& knobValue)
+{
+    knobValue = pOverride;
+}
+
 template <typename T>
 static inline void InitKnob(T& knob)
 {
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
index 922117e7e16..66c8e84b827 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
+++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
@@ -77,7 +77,11 @@ struct GlobalKnobs
     % for line in knob[1]['desc']:
     // ${line}
     % endfor
+    % if knob[1]['type'] == 'std::string':
+    DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, "${repr(knob[1]['default'])[1:-1]}");
+    % else:
     DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']});
+    % endif
 
     % endfor
     GlobalKnobs();
@@ -125,7 +129,7 @@ std::string GlobalKnobs::ToString(const char* optPerLinePrefix)
     str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}";
     % if knob[1]['type'] == 'bool':
     str << (KNOB_${knob[0]} ? "+\n" : "-\n");
-    % elif knob[1]['type'] != 'float':
+    % elif knob[1]['type'] != 'float' and knob[1]['type'] != 'std::string':
     str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]};
     str << std::dec << KNOB_${knob[0]} << "\n";
     % else:

From 45a4afa634bdca2b1147a76b74ed15a690b7a014 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Wed, 17 Feb 2016 17:55:59 -0600
Subject: [PATCH 008/238] swr: [rasterizer core] Split all RECT_LIST draws into
 1 RECT per draw

Needed until proper RECT_LIST PrimAssembly code is written.
---
 src/gallium/drivers/swr/rasterizer/core/api.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index fccccab503c..c70b4fafedd 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -1005,6 +1005,11 @@ uint32_t MaxVertsPerDraw(
         }
         break;
 
+    // The Primitive Assembly code can only handle 1 RECT at a time.
+    case TOP_RECT_LIST:
+        vertsPerDraw = 3;
+        break;
+
     default:
         // We are not splitting up draws for other topologies.
         break;

From dd0f9eed8cda45ed1373abebad7dd2398a16e9a7 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 18 Feb 2016 19:00:30 -0600
Subject: [PATCH 009/238] swr: [rasterizer] switch assert uses to SWR_ASSERT

---
 src/gallium/drivers/swr/rasterizer/core/tilemgr.h | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index 9137941bad4..b5eaaab63a3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -299,8 +299,8 @@ public:
         uint32_t x, y;
         MacroTileMgr::getTileIndices(macroID, x, y);
 
-        assert(x < KNOB_NUM_HOT_TILES_X);
-        assert(y < KNOB_NUM_HOT_TILES_Y);
+        SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
+        SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
 
         HotTileSet &tile = mHotTiles[x][y];
         HOTTILE& hotTile = tile.Attachment[attachment];
@@ -326,7 +326,7 @@ public:
             {
                 // tile should be either uninitialized or resolved if we're deleting and switching to a 
                 // new sample count
-                assert((hotTile.state == HOTTILE_INVALID) ||
+                SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
                        (hotTile.state == HOTTILE_RESOLVED) || 
                        (hotTile.state == HOTTILE_CLEAR));
                 _aligned_free(hotTile.pBuffer);
@@ -377,8 +377,8 @@ public:
     {
         uint32_t x, y;
         MacroTileMgr::getTileIndices(macroID, x, y);
-        assert(x < KNOB_NUM_HOT_TILES_X);
-        assert(y < KNOB_NUM_HOT_TILES_Y);
+        SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
+        SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
 
         return mHotTiles[x][y];
     }

From 45f0ce168ce21a7a95f48d3164e42a947732b896 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Fri, 19 Feb 2016 17:55:23 -0600
Subject: [PATCH 010/238] swr: [rasterizer core] RingBuffer class for DC/DS

Use head/tail ring buffer indices for thread synchronization.

1. SwrWaitForIdle loops until ring is empty. (head == tail)
2. GetDrawContext waits until the ring is not full; the ring is full when
   (head - tail) == Ring Size.
3. Draw enqueues by incrementing head.
4. Last worker thread to move past a DC dequeues by incrementing tail.

Todo: To reduce contention we can cache the tail in the API thread. For
example, if you know you have 64 free entries in the ring then you don't
need to keep checking the tail until you have used those 64 entries.
---
 src/gallium/drivers/swr/Makefile.sources-arch |   1 +
 .../drivers/swr/rasterizer/common/os.h        |   1 +
 .../drivers/swr/rasterizer/core/api.cpp       | 153 ++++++------------
 .../drivers/swr/rasterizer/core/context.h     |  19 +--
 .../drivers/swr/rasterizer/core/ringbuffer.h  | 102 ++++++++++++
 .../drivers/swr/rasterizer/core/threads.cpp   |  40 ++---
 6 files changed, 180 insertions(+), 136 deletions(-)
 create mode 100644 src/gallium/drivers/swr/rasterizer/core/ringbuffer.h

diff --git a/src/gallium/drivers/swr/Makefile.sources-arch b/src/gallium/drivers/swr/Makefile.sources-arch
index 6c105f46199..7544f8efccc 100644
--- a/src/gallium/drivers/swr/Makefile.sources-arch
+++ b/src/gallium/drivers/swr/Makefile.sources-arch
@@ -83,6 +83,7 @@ CORE_CXX_SOURCES := \
 	rasterizer/core/rasterizer.h \
 	rasterizer/core/rdtsc_core.cpp \
 	rasterizer/core/rdtsc_core.h \
+	rasterizer/core/ringbuffer.h \
 	rasterizer/core/state.h \
 	rasterizer/core/threads.cpp \
 	rasterizer/core/threads.h \
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 522ae0dd65f..265b879e1cb 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -192,6 +192,7 @@ unsigned int _mm_popcnt_u32(unsigned int v)
 #define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange)
 #define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
 #define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
+#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1)
 #define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1)
 #define _ReadWriteBarrier() asm volatile("" ::: "memory")
 #define __stdcall
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index c70b4fafedd..e18f9e7a811 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -61,11 +61,8 @@ HANDLE SwrCreateContext(
     pContext->driverType = pCreateInfo->driver;
     pContext->privateStateSize = pCreateInfo->privateStateSize;
 
-    pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
-    memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT);
-
-    pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
-    memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT);
+    pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
+    pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
 
     pContext->numSubContexts = pCreateInfo->maxSubContexts;
     if (pContext->numSubContexts > 1)
@@ -77,7 +74,6 @@ HANDLE SwrCreateContext(
     for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
     {
         pContext->dcRing[dc].pArena = new Arena();
-        pContext->dcRing[dc].inUse = false;
         pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
         pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
 
@@ -108,9 +104,6 @@ HANDLE SwrCreateContext(
         pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
     }
 
-    pContext->nextDrawId = 1;
-    pContext->DrawEnqueued = 1;
-
     // State setup AFTER context is fully initialized
     SetupDefaultState(pContext);
 
@@ -148,8 +141,6 @@ void SwrDestroyContext(HANDLE hContext)
         _aligned_free(pContext->pScratch[i]);
     }
 
-    _aligned_free(pContext->dcRing);
-    _aligned_free(pContext->dsRing);
     _aligned_free(pContext->subCtxSave);
 
     delete(pContext->pHotTileMgr);
@@ -168,49 +159,28 @@ void WakeAllThreads(SWR_CONTEXT *pContext)
     pContext->FifosNotEmpty.notify_all();
 }
 
-bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC)
+template<bool IsDraw>
+void QueueWork(SWR_CONTEXT *pContext)
 {
-    // For single thread nothing should still be drawing.
-    if (KNOB_SINGLE_THREADED) { return false; }
-
-    if (pDC->isCompute)
+    if (IsDraw)
     {
-        if (pDC->doneCompute)
-        {
-            pDC->inUse = false;
-            return false;
-        }
+        // Each worker thread looks at a DC for both FE and BE work at different times and so we
+        // multiply threadDone by 2.  When the threadDone counter has reached 0 then all workers
+        // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
+        // then moved on if all work is done.)
+        pContext->pCurDrawContext->threadsDone =
+            pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
     }
-
-    // Check if backend work is done. First make sure all triangles have been binned.
-    if (pDC->doneFE == true)
+    else
     {
-        // ensure workers have all moved passed this draw
-        if (pDC->threadsDoneFE != pContext->NumWorkerThreads)
-        {
-            return true;
-        }
-
-        if (pDC->threadsDoneBE != pContext->NumWorkerThreads)
-        {
-            return true;
-        }
-
-        pDC->inUse = false;    // all work is done.
+        pContext->pCurDrawContext->threadsDone =
+            pContext->NumWorkerThreads ? pContext->NumWorkerThreads : 1;
     }
 
-    return pDC->inUse;
-}
-
-void QueueDraw(SWR_CONTEXT *pContext)
-{
-    SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
-    pContext->pCurDrawContext->inUse = true;
-
     _ReadWriteBarrier();
     {
         std::unique_lock<std::mutex> lock(pContext->WaitLock);
-        pContext->DrawEnqueued++;
+        pContext->dcRing.Enqueue();
     }
 
     if (KNOB_SINGLE_THREADED)
@@ -219,10 +189,24 @@ void QueueDraw(SWR_CONTEXT *pContext)
         uint32_t mxcsr = _mm_getcsr();
         _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
 
-        std::unordered_set<uint32_t> lockedTiles;
-        uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
-        WorkOnFifoFE(pContext, 0, curDraw[0], 0);
-        WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+        if (IsDraw)
+        {
+            std::unordered_set<uint32_t> lockedTiles;
+            uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
+            WorkOnFifoFE(pContext, 0, curDraw[0], 0);
+            WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+        }
+        else
+        {
+            uint64_t curDispatch = pContext->pCurDrawContext->drawId;
+            WorkOnCompute(pContext, 0, curDispatch);
+        }
+
+        // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
+        if (!pContext->dcRing.IsEmpty())
+        {
+            pContext->dcRing.Dequeue();
+        }
 
         // restore csr
         _mm_setcsr(mxcsr);
@@ -239,40 +223,14 @@ void QueueDraw(SWR_CONTEXT *pContext)
     pContext->pCurDrawContext = nullptr;
 }
 
-///@todo Combine this with QueueDraw
-void QueueDispatch(SWR_CONTEXT *pContext)
+INLINE void QueueDraw(SWR_CONTEXT* pContext)
 {
-    SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
-    pContext->pCurDrawContext->inUse = true;
+    QueueWork<true>(pContext);
+}
 
-    _ReadWriteBarrier();
-    {
-        std::unique_lock<std::mutex> lock(pContext->WaitLock);
-        pContext->DrawEnqueued++;
-    }
-
-    if (KNOB_SINGLE_THREADED)
-    {
-        // flush denormals to 0
-        uint32_t mxcsr = _mm_getcsr();
-        _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
-
-        uint64_t curDispatch = pContext->pCurDrawContext->drawId;
-        WorkOnCompute(pContext, 0, curDispatch);
-
-        // restore csr
-        _mm_setcsr(mxcsr);
-    }
-    else
-    {
-        RDTSC_START(APIDrawWakeAllThreads);
-        WakeAllThreads(pContext);
-        RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
-    }
-
-    // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
-    pContext->pPrevDrawContext = pContext->pCurDrawContext;
-    pContext->pCurDrawContext = nullptr;
+INLINE void QueueDispatch(SWR_CONTEXT* pContext)
+{
+    QueueWork<false>(pContext);
 }
 
 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
@@ -281,17 +239,17 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
     // If current draw context is null then need to obtain a new draw context to use from ring.
     if (pContext->pCurDrawContext == nullptr)
     {
-        uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT;
-
-        DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
-        pContext->pCurDrawContext = pCurDrawContext;
-
-        // Need to wait until this draw context is available to use.
-        while (StillDrawing(pContext, pCurDrawContext))
+        // Need to wait for a free entry.
+        while (pContext->dcRing.IsFull())
         {
             _mm_pause();
         }
 
+        uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+        DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
+        pContext->pCurDrawContext = pCurDrawContext;
+
         // Assign next available entry in DS ring to this DC.
         uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
         pCurDrawContext->pState = &pContext->dsRing[dsIndex];
@@ -332,18 +290,15 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         pCurDrawContext->pArena->Reset();
         pCurDrawContext->pContext = pContext;
         pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
-        pCurDrawContext->inUse = false;
 
-        pCurDrawContext->doneCompute = false;
         pCurDrawContext->doneFE = false;
         pCurDrawContext->FeLock = 0;
-        pCurDrawContext->threadsDoneFE = 0;
-        pCurDrawContext->threadsDoneBE = 0;
+        pCurDrawContext->threadsDone = 0;
 
         pCurDrawContext->pTileMgr->initialize();
 
         // Assign unique drawId for this DC
-        pCurDrawContext->drawId = pContext->nextDrawId++;
+        pCurDrawContext->drawId = pContext->dcRing.GetHead();
     }
     else
     {
@@ -431,16 +386,12 @@ void SwrWaitForIdle(HANDLE hContext)
     SWR_CONTEXT *pContext = GetContext(hContext);
 
     RDTSC_START(APIWaitForIdle);
-    // Wait for all work to complete.
-    for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
-    {
-        DRAW_CONTEXT *pDC = &pContext->dcRing[dc];
 
-        while (StillDrawing(pContext, pDC))
-        {
-            _mm_pause();
-        }
+    while (!pContext->dcRing.IsEmpty())
+    {
+        _mm_pause();
     }
+
     RDTSC_STOP(APIWaitForIdle, 1, 0);
 }
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 4a214aff1c8..d75d9754e57 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -41,6 +41,7 @@
 #include "core/knobs.h"
 #include "common/simdintrin.h"
 #include "core/threads.h"
+#include "ringbuffer.h"
 
 // x.8 fixed point precision values
 #define FIXED_POINT_SHIFT 8
@@ -381,19 +382,14 @@ struct DRAW_CONTEXT
 
     FE_WORK FeWork;
     volatile OSALIGNLINE(uint32_t) FeLock;
-    volatile OSALIGNLINE(bool) inUse;
     volatile OSALIGNLINE(bool) doneFE;    // Is FE work done for this draw?
-
-    // Have all worker threads moved past draw in DC ring?
-    volatile OSALIGNLINE(uint32_t) threadsDoneFE;
-    volatile OSALIGNLINE(uint32_t) threadsDoneBE;
+    volatile OSALIGNLINE(int64_t) threadsDone;
 
     uint64_t dependency;
 
     MacroTileMgr* pTileMgr;
 
     // The following fields are valid if isCompute is true.
-    volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done?   (isCompute)
     DispatchQueue* pDispatch;               // Queue for thread groups. (isCompute)
 
     DRAW_STATE* pState;
@@ -438,7 +434,7 @@ struct SWR_CONTEXT
     //  3. State - When an applications sets state after draw
     //     a. Same as step 1.
     //     b. State is copied from prev draw context to current.
-    DRAW_CONTEXT* dcRing;
+    RingBuffer<DRAW_CONTEXT> dcRing;
 
     DRAW_CONTEXT *pCurDrawContext;    // This points to DC entry in ring for an unsubmitted draw.
     DRAW_CONTEXT *pPrevDrawContext;   // This points to DC entry for the previous context submitted that we can copy state from.
@@ -448,7 +444,7 @@ struct SWR_CONTEXT
     //  These split draws all have identical state. So instead of storing the state directly
     //  in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
     //  to reference a single entry in the DS ring.
-    DRAW_STATE*   dsRing;
+    RingBuffer<DRAW_STATE> dsRing;
 
     uint32_t curStateId;               // Current index to the next available entry in the DS ring.
 
@@ -463,13 +459,6 @@ struct SWR_CONTEXT
     std::condition_variable FifosNotEmpty;
     std::mutex WaitLock;
 
-    // Draw Contexts will get a unique drawId generated from this
-    uint64_t nextDrawId;
-
-    // most recent draw id enqueued by the API thread
-    // written by api thread, read by multiple workers
-    OSALIGNLINE(volatile uint64_t) DrawEnqueued;
-
     DRIVER_TYPE driverType;
 
     uint32_t privateStateSize;
diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
new file mode 100644
index 00000000000..e323136bc41
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
@@ -0,0 +1,102 @@
+/****************************************************************************
+* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file ringbuffer.h
+*
+* @brief RingBuffer
+*        The RingBuffer class manages all aspects of the ring buffer including
+*        the head/tail indices, etc.
+*
+******************************************************************************/
+#pragma once
+
+template<typename T>
+class RingBuffer
+{
+public:
+    RingBuffer()
+        : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0)
+    {
+    }
+
+    ~RingBuffer()
+    {
+        Destroy();
+    }
+
+    void Init(uint32_t numEntries)
+    {
+        SWR_ASSERT(numEntries > 0);
+        mNumEntries = numEntries;
+        mpRingBuffer = (T*)_aligned_malloc(sizeof(T)*numEntries, 64);
+        SWR_ASSERT(mpRingBuffer != nullptr);
+        memset(mpRingBuffer, 0, sizeof(T)*numEntries);
+    }
+
+    void Destroy()
+    {
+        _aligned_free(mpRingBuffer);
+        mpRingBuffer = nullptr;
+    }
+
+    T& operator[](const uint32_t index)
+    {
+        SWR_ASSERT(index < mNumEntries);
+        return mpRingBuffer[index];
+    }
+
+    INLINE void Enqueue()
+    {
+        mRingHead++; // There's only one producer.
+    }
+
+    INLINE void Dequeue()
+    {
+        InterlockedIncrement(&mRingTail); // There are multiple consumers.
+    }
+
+    INLINE bool IsEmpty()
+    {
+        return (GetHead() == GetTail());
+    }
+
+    INLINE bool IsFull()
+    {
+        ///@note We don't handle wrap case due to using 64-bit indices.
+        ///      It would take 11 million years to wrap at 50,000 DCs per sec.
+        ///      If we used 32-bit indices then it's about 23 hours to wrap.
+        uint64_t numEnqueued = GetHead() - GetTail();
+        SWR_ASSERT(numEnqueued <= mNumEntries);
+
+        return (numEnqueued == mNumEntries);
+    }
+
+    INLINE volatile uint64_t GetTail() { return mRingTail; }
+    INLINE volatile uint64_t GetHead() { return mRingHead; }
+
+private:
+    T* mpRingBuffer;
+    uint32_t mNumEntries;
+
+    OSALIGNLINE(volatile uint64_t) mRingHead;  // Producer Counter
+    OSALIGNLINE(volatile uint64_t) mRingTail;  // Consumer Counter
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 24c5588bfec..8f0d9249ae0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -265,9 +265,7 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=
 INLINE
 uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
 {
-    //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0);
-    //return result;
-    return pContext->DrawEnqueued;
+    return pContext->dcRing.GetHead();
 }
 
 INLINE
@@ -449,6 +447,18 @@ void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macro
     }
 }
 
+INLINE void CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
+{
+    int64_t result = InterlockedDecrement64(&pDC->threadsDone);
+
+    if (result == 0)
+    {
+        _ReadWriteBarrier();
+
+        pContext->dcRing.Dequeue();  // Remove from tail
+    }
+}
+
 INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
 {
     // increment our current draw id to the first incomplete draw
@@ -466,7 +476,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
         if (isWorkComplete)
         {
             curDrawBE++;
-            InterlockedIncrement(&pDC->threadsDoneBE);
+            CompleteDrawContext(pContext, pDC);
         }
         else
         {
@@ -579,7 +589,7 @@ void WorkOnFifoBE(
                         {
                             // We can increment the current BE and safely move to next draw since we know this draw is complete.
                             curDrawBE++;
-                            InterlockedIncrement(&pDC->threadsDoneBE);
+                            CompleteDrawContext(pContext, pDC);
 
                             lastRetiredDraw++;
 
@@ -608,8 +618,8 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE,
         DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
         if (pDC->isCompute || pDC->doneFE || pDC->FeLock)
         {
+            CompleteDrawContext(pContext, pDC);
             curDrawFE++;
-            InterlockedIncrement(&pDC->threadsDoneFE);
         }
         else
         {
@@ -673,22 +683,12 @@ void WorkOnCompute(
     // Is there any work remaining?
     if (queue.getNumQueued() > 0)
     {
-        bool lastToComplete = false;
-
         uint32_t threadGroupId = 0;
         while (queue.getWork(threadGroupId))
         {
             ProcessComputeBE(pDC, workerId, threadGroupId);
 
-            lastToComplete = queue.finishedWork();
-        }
-
-        _ReadWriteBarrier();
-
-        if (lastToComplete)
-        {
-            SWR_ASSERT(queue.isWorkComplete() == true);
-            pDC->doneCompute = true;
+            queue.finishedWork();
         }
     }
 }
@@ -732,10 +732,10 @@ DWORD workerThreadMain(LPVOID pData)
     //    the worker can safely increment its oldestDraw counter and move on to the next draw.
     std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
 
-    auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; };
+    auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
 
-    uint64_t curDrawBE = 1;
-    uint64_t curDrawFE = 1;
+    uint64_t curDrawBE = 0;
+    uint64_t curDrawFE = 0;
 
     while (pContext->threadPool.inThreadShutdown == false)
     {

From 136988b42b6c7bd9649fd13d5a6117b02a41e52a Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Fri, 19 Feb 2016 19:05:14 -0600
Subject: [PATCH 011/238] swr: [rasterizer core] fix rasterizing multisampling
 with scissor enabled

We were not evaluating the scissor edge equations at sample positions.
---
 .../swr/rasterizer/core/rasterizer.cpp        | 70 ++++++++-----------
 1 file changed, 28 insertions(+), 42 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
index 587e336d87d..07c9eedb50d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
@@ -690,9 +690,10 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
 
     // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
     // used to for testing if entire raster tile is inside a triangle
-    vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], rastEdges[0].vRasterTileOffsets);
-    vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], rastEdges[1].vRasterTileOffsets);
-    vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], rastEdges[2].vRasterTileOffsets);
+    for (uint32_t e = 0; e < numEdges; ++e)
+    {
+        vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
+    }
 
     // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
     // step sample positions to the raster tile bbox of multisample points
@@ -700,7 +701,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     //                             |      |
     //                             |      |
     // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
-    __m256d vEdge0TileBbox, vEdge1TileBbox, vEdge2TileBbox;
+    __m256d vEdgeTileBbox[3];
     if (sampleCount > SWR_MULTISAMPLE_1X)
     {
         __m128i vTileSampleBBoxXh = MultisampleTraits<sampleCount>::TileSampleOffsetsX();
@@ -711,17 +712,12 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
 
         // step edge equation tests from Tile
         // used to for testing if entire raster tile is inside a triangle
-        __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vTileSampleBBoxXFix8);
-        __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vTileSampleBBoxYFix8);
-        vEdge0TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-
-        vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vTileSampleBBoxXFix8);
-        vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vTileSampleBBoxYFix8);
-        vEdge1TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-
-        vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vTileSampleBBoxXFix8);
-        vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vTileSampleBBoxYFix8);
-        vEdge2TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+        for (uint32_t e = 0; e < 3; ++e)
+        {
+            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
+            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
+            vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+        }
     }
 
     RDTSC_STOP(BEStepSetup, 0, pDC->drawId);
@@ -770,9 +766,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
             {
                 __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
                 // evaluate edge equations at the tile multisample bounding box
-                vSampleBboxTest0 = _mm256_add_pd(vEdge0TileBbox, vEdgeFix16[0]);
-                vSampleBboxTest1 = _mm256_add_pd(vEdge1TileBbox, vEdgeFix16[1]);
-                vSampleBboxTest2 = _mm256_add_pd(vEdge2TileBbox, vEdgeFix16[2]);
+                vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
+                vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
+                vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
                 mask0 = _mm256_movemask_pd(vSampleBboxTest0);
                 mask1 = _mm256_movemask_pd(vSampleBboxTest1);
                 mask2 = _mm256_movemask_pd(vSampleBboxTest2);
@@ -796,13 +792,14 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
                     }
                     else
                     {
-                        __m256d vEdge0AtSample, vEdge1AtSample, vEdge2AtSample; 
+                        __m256d vEdgeAtSample[numEdges];
                         if(sampleCount == SWR_MULTISAMPLE_1X)
                         {
                             // should get optimized out for single sample case (global value numbering or copy propagation)
-                            vEdge0AtSample = vEdgeFix16[0];
-                            vEdge1AtSample = vEdgeFix16[1];
-                            vEdge2AtSample = vEdgeFix16[2];
+                            for (uint32_t e = 0; e < numEdges; ++e)
+                            {
+                                vEdgeAtSample[e] = vEdgeFix16[e];
+                            }
                         }
                         else
                         {
@@ -815,31 +812,20 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
                             // for each edge and broadcasts it before offsetting to individual pixel quads
 
                             // step edge equation tests from UL tile corner to pixel sample position
-                            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vSampleOffsetX);
-                            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vSampleOffsetY);
-                            vEdge0AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-                            vEdge0AtSample = _mm256_add_pd(vEdgeFix16[0], vEdge0AtSample);
-
-                            vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vSampleOffsetX);
-                            vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vSampleOffsetY);
-                            vEdge1AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-                            vEdge1AtSample = _mm256_add_pd(vEdgeFix16[1], vEdge1AtSample);
-
-                            vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vSampleOffsetX);
-                            vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vSampleOffsetY);
-                            vEdge2AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-                            vEdge2AtSample = _mm256_add_pd(vEdgeFix16[2], vEdge2AtSample);
+                            for (uint32_t e = 0; e < numEdges; ++e)
+                            {
+                                __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
+                                __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
+                                vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+                                vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
+                            }
                         }
 
                         double startQuadEdges[numEdges];
                         const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
-                        _mm256_maskstore_pd(&startQuadEdges[0], vLane0Mask, vEdge0AtSample);
-                        _mm256_maskstore_pd(&startQuadEdges[1], vLane0Mask, vEdge1AtSample);
-                        _mm256_maskstore_pd(&startQuadEdges[2], vLane0Mask, vEdge2AtSample);
-
-                        for (uint32_t e = 3; e < numEdges; ++e)
+                        for (uint32_t e = 0; e < numEdges; ++e)
                         {
-                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeFix16[e]);
+                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
                         }
 
                         // not trivial accept or reject, must rasterize full tile

From 7ead4959a5a1b5687458173490b2eec91ef6193b Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Mon, 22 Feb 2016 11:00:07 -0600
Subject: [PATCH 012/238] swr: [rasterizer jitter] Fix type mismatch on select
 args for SCATTERPS

---
 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 5394fc7bf5a..2ff77bc30ec 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -1286,8 +1286,10 @@ void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
 {
     Value* pStack = STACKSAVE();
 
+    Type* pSrcTy = vSrc->getType()->getVectorElementType();
+
     // allocate tmp stack for masked off lanes
-    Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType());
+    Value* vTmpPtr = ALLOCA(pSrcTy);
 
     Value *mask = MASK(vMask);
     for (uint32_t i = 0; i < JM()->mVWidth; ++i)
@@ -1295,7 +1297,7 @@ void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
         Value *offset = VEXTRACT(vOffsets, C(i));
         // byte pointer to component
         Value *storeAddress = GEP(pDst, offset);
-        storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0));
+        storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
         Value *selMask = VEXTRACT(mask, C(i));
         Value *srcElem = VEXTRACT(vSrc, C(i));
         // switch in a safe address to load if we're trying to access a vertex 

From bdd690dc3667f66cbe87974f18e247cf1e6f9c5f Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Tue, 23 Feb 2016 13:47:24 -0600
Subject: [PATCH 013/238] swr: [rasterizer jitter] Cleanup use of types inside
 of Builder.

Also cache the SIMD width so that we don't have to keep querying
the JitManager for it.
---
 .../drivers/swr/rasterizer/jitter/builder.cpp | 16 +++--
 .../drivers/swr/rasterizer/jitter/builder.h   |  6 ++
 .../swr/rasterizer/jitter/builder_misc.cpp    | 58 +++++++++----------
 .../swr/rasterizer/jitter/fetch_jit.cpp       | 58 +++++++++----------
 4 files changed, 75 insertions(+), 63 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index c15bdf1e756..757ea3fe39c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -38,6 +38,8 @@ using namespace llvm;
 Builder::Builder(JitManager *pJitMgr)
     : mpJitMgr(pJitMgr)
 {
+    mVWidth = pJitMgr->mVWidth;
+
     mpIRBuilder = &pJitMgr->mBuilder;
 
     mVoidTy = Type::getVoidTy(pJitMgr->mContext);
@@ -48,14 +50,18 @@ Builder::Builder(JitManager *pJitMgr)
     mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
     mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
     mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
+    mInt8PtrTy = PointerType::get(mInt8Ty, 0);
+    mInt16PtrTy = PointerType::get(mInt16Ty, 0);
+    mInt32PtrTy = PointerType::get(mInt32Ty, 0);
     mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
     mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
     mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
-    mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth);
-    mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth);
-    mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth);
-    mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth);
-    mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth);
+    mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
+    mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
+    mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
+    mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
+    mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
+    mSimdVectorTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mSimdFP32Ty), false);
 
     if (sizeof(uint32_t*) == 4)
     {
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 49216612cc9..239ef2ab49f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -43,6 +43,8 @@ struct Builder
     JitManager* mpJitMgr;
     IRBuilder<>* mpIRBuilder;
 
+    uint32_t             mVWidth;
+
     // Built in types.
     Type*                mVoidTy;
     Type*                mInt1Ty;
@@ -54,12 +56,16 @@ struct Builder
     Type*                mFP16Ty;
     Type*                mFP32Ty;
     Type*                mDoubleTy;
+    Type*                mInt8PtrTy;
+    Type*                mInt16PtrTy;
+    Type*                mInt32PtrTy;
     Type*                mSimdFP16Ty;
     Type*                mSimdFP32Ty;
     Type*                mSimdInt16Ty;
     Type*                mSimdInt32Ty;
     Type*                mSimdInt64Ty;
     Type*                mSimdIntPtrTy;
+    Type*                mSimdVectorTy;
     StructType*          mV4FP32Ty;
     StructType*          mV4Int32Ty;
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 2ff77bc30ec..7ebaca05151 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -189,32 +189,32 @@ Constant *Builder::PRED(bool pred)
 
 Value *Builder::VIMMED1(int i)
 {
-    return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 }
 
 Value *Builder::VIMMED1(uint32_t i)
 {
-    return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 }
 
 Value *Builder::VIMMED1(float i)
 {
-    return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i)));
+    return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
 }
 
 Value *Builder::VIMMED1(bool i)
 {
-    return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 }
 
 Value *Builder::VUNDEF_IPTR()
 {
-    return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth));
+    return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
 }
 
 Value *Builder::VUNDEF_I()
 {
-    return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth));
+    return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
 }
 
 Value *Builder::VUNDEF(Type *ty, uint32_t size)
@@ -224,12 +224,12 @@ Value *Builder::VUNDEF(Type *ty, uint32_t size)
 
 Value *Builder::VUNDEF_F()
 {
-    return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth));
+    return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
 }
 
 Value *Builder::VUNDEF(Type* t)
 {
-    return UndefValue::get(VectorType::get(t, JM()->mVWidth));
+    return UndefValue::get(VectorType::get(t, mVWidth));
 }
 
 #if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
@@ -247,7 +247,7 @@ Value *Builder::VBROADCAST(Value *src)
         return src;
     }
 
-    return VECTOR_SPLAT(JM()->mVWidth, src);
+    return VECTOR_SPLAT(mVWidth, src);
 }
 
 uint32_t Builder::IMMED(Value* v)
@@ -342,8 +342,8 @@ Value *Builder::MASKLOADD(Value* src,Value* mask)
     else
     {
         Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
-        Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth));
-        vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,JM()->mVWidth));
+        Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
+        vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth));
     }
     return vResult;
 }
@@ -575,7 +575,7 @@ Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas
         Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
         Value *vOffsets = MUL(vIndices,vScaleVec);
         Value *mask = MASK(vMask);
-        for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+        for(uint32_t i = 0; i < mVWidth; ++i)
         {
             // single component byte index
             Value *offset = VEXTRACT(vOffsets,C(i));
@@ -625,7 +625,7 @@ Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas
         Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
         Value *vOffsets = MUL(vIndices, vScaleVec);
         Value *mask = MASK(vMask);
-        for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+        for(uint32_t i = 0; i < mVWidth; ++i)
         {
             // single component byte index
             Value *offset = VEXTRACT(vOffsets, C(i));
@@ -800,7 +800,7 @@ Value *Builder::CVTPH2PS(Value* a)
         }
 
         Value* pResult = UndefValue::get(mSimdFP32Ty);
-        for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+        for (uint32_t i = 0; i < mVWidth; ++i)
         {
             Value* pSrc = VEXTRACT(a, C(i));
             Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
@@ -833,7 +833,7 @@ Value *Builder::CVTPS2PH(Value* a, Value* rounding)
         }
 
         Value* pResult = UndefValue::get(mSimdInt16Ty);
-        for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+        for (uint32_t i = 0; i < mVWidth; ++i)
         {
             Value* pSrc = VEXTRACT(a, C(i));
             Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
@@ -1085,8 +1085,8 @@ void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byt
 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
 {
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
 
     // input could either be float or int vector; do shuffle work in int
     vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
@@ -1094,7 +1094,7 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp
 
     if(bPackedOutput) 
     {
-        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
 
         // shuffle mask
         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
@@ -1179,12 +1179,12 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp
 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
 {
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty =  VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+    Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
 
     if(bPackedOutput)
     {
-        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
         // shuffle mask
         Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                      0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
@@ -1292,7 +1292,7 @@ void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
     Value* vTmpPtr = ALLOCA(pSrcTy);
 
     Value *mask = MASK(vMask);
-    for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+    for (uint32_t i = 0; i < mVWidth; ++i)
     {
         Value *offset = VEXTRACT(vOffsets, C(i));
         // byte pointer to component
@@ -1415,8 +1415,8 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
 #else
     bool flag = !imm8->isZeroValue();
     SmallVector<Constant*,8> idx;
-    for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
-        idx.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
+    for (unsigned i = 0; i < mVWidth / 2; i++) {
+        idx.push_back(C(flag ? i + mVWidth / 2 : i));
     }
     return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
 #endif
@@ -1432,17 +1432,17 @@ Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
 #else
     bool flag = !imm8->isZeroValue();
     SmallVector<Constant*,8> idx;
-    for (unsigned i = 0; i < JM()->mVWidth; i++) {
+    for (unsigned i = 0; i < mVWidth; i++) {
         idx.push_back(C(i));
     }
     Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
 
     SmallVector<Constant*,8> idx2;
-    for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
-        idx2.push_back(C(flag ? i : i + JM()->mVWidth));
+    for (unsigned i = 0; i < mVWidth / 2; i++) {
+        idx2.push_back(C(flag ? i : i + mVWidth));
     }
-    for (unsigned i = JM()->mVWidth / 2; i < JM()->mVWidth; i++) {
-        idx2.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
+    for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
+        idx2.push_back(C(flag ? i + mVWidth / 2 : i));
     }
     return VSHUFFLE(a, inter, ConstantVector::get(idx2));
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index c5a180e27cb..2ca01309d05 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -105,7 +105,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
     std::vector<Value*>    vtxInputIndices(2, C(0));
     // GEP
     pVtxOut = GEP(pVtxOut, C(0));
-    pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0));
+    pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
 
     // SWR_FETCH_CONTEXT::pStreams
     Value*    streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
@@ -220,8 +220,8 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
 
     SWRL::UncheckedFixedVector<Value*, 16>    vectors;
 
-    std::vector<Constant*>    pMask(JM()->mVWidth);
-    for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+    std::vector<Constant*>    pMask(mVWidth);
+    for(uint32_t i = 0; i < mVWidth; ++i)
     {
         pMask[i] = (C(i < 4 ? i : 4));
     }
@@ -254,7 +254,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
         Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride);
 
         // Load from the stream.
-        for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane)
+        for(uint32_t lane = 0; lane < mVWidth; ++lane)
         {
             // Get index
             Value* index = VEXTRACT(vIndices, C(lane));
@@ -380,44 +380,44 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
             vectors.push_back(wvec);
         }
 
-        std::vector<Constant*>        v01Mask(JM()->mVWidth);
-        std::vector<Constant*>        v23Mask(JM()->mVWidth);
-        std::vector<Constant*>        v02Mask(JM()->mVWidth);
-        std::vector<Constant*>        v13Mask(JM()->mVWidth);
+        std::vector<Constant*>        v01Mask(mVWidth);
+        std::vector<Constant*>        v23Mask(mVWidth);
+        std::vector<Constant*>        v02Mask(mVWidth);
+        std::vector<Constant*>        v13Mask(mVWidth);
 
         // Concatenate the vectors together.
         elements[0] = VUNDEF_F(); 
         elements[1] = VUNDEF_F(); 
         elements[2] = VUNDEF_F(); 
         elements[3] = VUNDEF_F(); 
-        for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b)
+        for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
         {
             v01Mask[4 * b + 0] = C(0 + 4 * b);
             v01Mask[4 * b + 1] = C(1 + 4 * b);
-            v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth);
-            v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth);
+            v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
+            v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
 
             v23Mask[4 * b + 0] = C(2 + 4 * b);
             v23Mask[4 * b + 1] = C(3 + 4 * b);
-            v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth);
-            v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth);
+            v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
+            v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
 
             v02Mask[4 * b + 0] = C(0 + 4 * b);
             v02Mask[4 * b + 1] = C(2 + 4 * b);
-            v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth);
-            v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth);
+            v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
+            v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
 
             v13Mask[4 * b + 0] = C(1 + 4 * b);
             v13Mask[4 * b + 1] = C(3 + 4 * b);
-            v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth);
-            v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth);
+            v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
+            v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
 
-            std::vector<Constant*>    iMask(JM()->mVWidth);
-            for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+            std::vector<Constant*>    iMask(mVWidth);
+            for(uint32_t i = 0; i < mVWidth; ++i)
             {
                 if(((4 * b) <= i) && (i < (4 * (b + 1))))
                 {
-                    iMask[i] = C(i % 4 + JM()->mVWidth);
+                    iMask[i] = C(i % 4 + mVWidth);
                 }
                 else
                 {
@@ -805,7 +805,7 @@ Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
     STORE(C((uint8_t)0), pZeroIndex);
 
     // Load a SIMD of index pointers
-    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    for(int64_t lane = 0; lane < mVWidth; lane++)
     {
         // Calculate the address of the requested index
         Value *pIndex = GEP(pIndices, C(lane));
@@ -840,7 +840,7 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
     STORE(C((uint16_t)0), pZeroIndex);
 
     // Load a SIMD of index pointers
-    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    for(int64_t lane = 0; lane < mVWidth; lane++)
     {
         // Calculate the address of the requested index
         Value *pIndex = GEP(pIndices, C(lane));
@@ -925,13 +925,13 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
     const uint32_t (&swizzle)[4] = std::get<9>(args);
 
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty =  VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+    Type* vGatherTy = mSimdInt32Ty;
+    Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
 
     // have to do extra work for sign extending
     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
-        Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane
-        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
+        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
 
         // shuffle mask, including any swizzling
         const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
@@ -1138,8 +1138,8 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
     Value* (&vVertexElements)[4] = std::get<8>(args);
 
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
 
     // have to do extra work for sign extending
     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
@@ -1149,7 +1149,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
         bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
 
         Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
-        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
 
         // shuffle mask
         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,

From 3f4fba3772cb2e6c23e7f664c3225ba4e0889ff4 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Tue, 23 Feb 2016 17:29:59 -0600
Subject: [PATCH 014/238] swr: [rasterizer core] Move InitializeHotTiles and
 corresponding clear code out of threads.cpp.

---
 .../drivers/swr/rasterizer/core/threads.cpp   | 167 +-----------
 .../drivers/swr/rasterizer/core/tilemgr.cpp   | 252 +++++++++++++++++-
 .../drivers/swr/rasterizer/core/tilemgr.h     |  93 +------
 3 files changed, 258 insertions(+), 254 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 8f0d9249ae0..351a98be4d8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -44,7 +44,6 @@
 #include "rasterizer.h"
 #include "rdtsc_core.h"
 #include "tilemgr.h"
-#include "core/multisample.h"
 
 
 
@@ -281,171 +280,7 @@ bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastReti
     return (pDC->dependency > lastRetiredDraw);
 }
 
-void ClearColorHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
-{
-    // Load clear color into SIMD register...
-    float *pClearData = (float*)(pHotTile->clearData);
-    simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
-    simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
-    simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
-    simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
 
-    float *pfBuf = (float*)pHotTile->pBuffer;
-    uint32_t numSamples = pHotTile->numSamples;
-
-    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-    {
-        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-        {
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
-            {
-                _simd_store_ps(pfBuf, valR);
-                pfBuf += KNOB_SIMD_WIDTH;
-                _simd_store_ps(pfBuf, valG);
-                pfBuf += KNOB_SIMD_WIDTH;
-                _simd_store_ps(pfBuf, valB);
-                pfBuf += KNOB_SIMD_WIDTH;
-                _simd_store_ps(pfBuf, valA);
-                pfBuf += KNOB_SIMD_WIDTH;
-            }
-        }
-    }
-}
-
-void ClearDepthHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
-{
-    // Load clear color into SIMD register...
-    float *pClearData = (float*)(pHotTile->clearData);
-    simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
-
-    float *pfBuf = (float*)pHotTile->pBuffer;
-    uint32_t numSamples = pHotTile->numSamples;
-
-    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-    {
-        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-        {
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
-            {
-                _simd_store_ps(pfBuf, valZ);
-                pfBuf += KNOB_SIMD_WIDTH;
-            }
-        }
-    }
-}
-
-void ClearStencilHotTile(const HOTTILE* pHotTile)
-{
-    // convert from F32 to U8.
-    uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
-    //broadcast 32x into __m256i...
-    simdscalari valS = _simd_set1_epi8(clearVal);
-
-    simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
-    uint32_t numSamples = pHotTile->numSamples;
-
-    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-    {
-        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-        {
-            // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
-            {
-                _simd_store_si(pBuf, valS);
-                pBuf += 1;
-            }
-        }
-    }
-}
-
-// for draw calls, we initialize the active hot tiles and perform deferred
-// load on them if tile is in invalid state. we do this in the outer thread loop instead of inside
-// the draw routine itself mainly for performance, to avoid unnecessary setup
-// every triangle
-// @todo support deferred clear
-INLINE
-void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork)
-{
-    const API_STATE& state = GetApiState(pDC);
-    HotTileMgr *pHotTileMgr = pContext->pHotTileMgr;
-
-    uint32_t x, y;
-    MacroTileMgr::getTileIndices(macroID, x, y);
-    x *= KNOB_MACROTILE_X_DIM;
-    y *= KNOB_MACROTILE_Y_DIM;
-
-    uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
-
-    // check RT if enabled
-    unsigned long rtSlot = 0;
-    uint32_t colorHottileEnableMask = state.colorHottileEnable;
-    while(_BitScanForward(&rtSlot, colorHottileEnableMask))
-    {
-        HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
-
-        if (pHotTile->state == HOTTILE_INVALID)
-        {
-            RDTSC_START(BELoadTiles);
-            // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
-        }
-        else if (pHotTile->state == HOTTILE_CLEAR)
-        {
-            RDTSC_START(BELoadTiles);
-            // Clear the tile.
-            ClearColorHotTile(pHotTile);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
-        }
-        colorHottileEnableMask &= ~(1 << rtSlot);
-    }
-
-    // check depth if enabled
-    if (state.depthHottileEnable)
-    {
-        HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
-        if (pHotTile->state == HOTTILE_INVALID)
-        {
-            RDTSC_START(BELoadTiles);
-            // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
-        }
-        else if (pHotTile->state == HOTTILE_CLEAR)
-        {
-            RDTSC_START(BELoadTiles);
-            // Clear the tile.
-            ClearDepthHotTile(pHotTile);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
-        }
-    }
-
-    // check stencil if enabled
-    if (state.stencilHottileEnable)
-    {
-        HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
-        if (pHotTile->state == HOTTILE_INVALID)
-        {
-            RDTSC_START(BELoadTiles);
-            // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
-        }
-        else if (pHotTile->state == HOTTILE_CLEAR)
-        {
-            RDTSC_START(BELoadTiles);
-            // Clear the tile.
-            ClearStencilHotTile(pHotTile);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
-        }
-    }
-}
 
 INLINE void CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
 {
@@ -568,7 +403,7 @@ void WorkOnFifoBE(
                             SWR_ASSERT(pWork);
                             if (pWork->type == DRAW)
                             {
-                                InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc);
+                                pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
                             }
                         }
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 860393661e2..54a5078ba90 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -29,7 +29,9 @@
 #include <unordered_map>
 
 #include "fifo.hpp"
-#include "tilemgr.h"
+#include "core/tilemgr.h"
+#include "core/multisample.h"
+#include "rdtsc_core.h"
 
 #define TILE_ID(x,y) ((x << 16 | y))
 
@@ -103,3 +105,251 @@ void MacroTileMgr::markTileComplete(uint32_t id)
     tile.mWorkItemsFE = 0;
     tile.mWorkItemsBE = 0;
 }
+
+HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples,
+    uint32_t renderTargetArrayIndex)
+{   // Look up (and optionally create) the hot tile for this macrotile/attachment; returns NULL only when the tile has no buffer and 'create' is false.
+    uint32_t x, y;
+    MacroTileMgr::getTileIndices(macroID, x, y);   // decode macroID into the tile indices used to index mHotTiles
+
+    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
+    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+
+    HotTileSet &tile = mHotTiles[x][y];
+    HOTTILE& hotTile = tile.Attachment[attachment];
+    if (hotTile.pBuffer == NULL)   // no buffer has been allocated for this attachment yet
+    {
+        if (create)
+        {
+            uint32_t size = numSamples * mHotTileSize[attachment];   // per-attachment tile size scaled by sample count
+            hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+            hotTile.state = HOTTILE_INVALID;
+            hotTile.numSamples = numSamples;
+            hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
+        }
+        else
+        {
+            return NULL;   // caller asked for lookup only and nothing exists
+        }
+    }
+    else
+    {
+        // buffer too small for the requested sample count: free it and allocate a larger one
+        if (numSamples > hotTile.numSamples)
+        {
+            // tile should be either uninitialized or resolved if we're deleting and switching to a
+            // new sample count
+            SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
+                (hotTile.state == HOTTILE_RESOLVED) ||
+                (hotTile.state == HOTTILE_CLEAR));
+            _aligned_free(hotTile.pBuffer);
+
+            uint32_t size = numSamples * mHotTileSize[attachment];
+            hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+            hotTile.state = HOTTILE_INVALID;   // previous contents were discarded with the old buffer
+            hotTile.numSamples = numSamples;
+        }
+
+        // if requested render target array index isn't currently loaded, need to store out the current hottile
+        // and load the requested array slice
+        if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
+        {
+            SWR_FORMAT format;   // hot tile format matching the attachment class (color / depth / stencil)
+            switch (attachment)
+            {
+            case SWR_ATTACHMENT_COLOR0:
+            case SWR_ATTACHMENT_COLOR1:
+            case SWR_ATTACHMENT_COLOR2:
+            case SWR_ATTACHMENT_COLOR3:
+            case SWR_ATTACHMENT_COLOR4:
+            case SWR_ATTACHMENT_COLOR5:
+            case SWR_ATTACHMENT_COLOR6:
+            case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
+            case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
+            case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
+            default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
+            }
+
+            if (hotTile.state == HOTTILE_DIRTY)   // only dirty tiles are written back to the old slice
+            {
+                pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
+                    x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
+            }
+
+            pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
+                x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
+
+            hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
+            hotTile.state = HOTTILE_DIRTY;   // the freshly loaded slice is marked dirty
+        }
+    }
+    return &tile.Attachment[attachment];
+}
+
+void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile)  // fill a color hot tile's buffer with its clear value (pHotTile->clearData read as 4 floats).
+{
+    // Broadcast each clear-color channel across its own SIMD register.
+    float *pClearData = (float*)(pHotTile->clearData);
+    simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
+    simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
+    simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
+    simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
+
+    float *pfBuf = (float*)pHotTile->pBuffer;   // write cursor; advanced by KNOB_SIMD_WIDTH floats after every store
+    uint32_t numSamples = pHotTile->numSamples;
+
+    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)   // row/col only count tiles; pfBuf walks the buffer linearly
+    {
+        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+        {
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) // advance by one SIMD tile's worth of samples per iteration
+            {
+                _simd_store_ps(pfBuf, valR);   // channels are written as consecutive SIMD vectors: R, G, B, A
+                pfBuf += KNOB_SIMD_WIDTH;
+                _simd_store_ps(pfBuf, valG);
+                pfBuf += KNOB_SIMD_WIDTH;
+                _simd_store_ps(pfBuf, valB);
+                pfBuf += KNOB_SIMD_WIDTH;
+                _simd_store_ps(pfBuf, valA);
+                pfBuf += KNOB_SIMD_WIDTH;
+            }
+        }
+    }
+}
+
+void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile)  // fill a depth hot tile's buffer with its clear value (pHotTile->clearData[0] read as a float).
+{
+    // Broadcast the clear depth value across a SIMD register.
+    float *pClearData = (float*)(pHotTile->clearData);
+    simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
+
+    float *pfBuf = (float*)pHotTile->pBuffer;   // write cursor; advanced by KNOB_SIMD_WIDTH floats after every store
+    uint32_t numSamples = pHotTile->numSamples;
+
+    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)   // row/col only count tiles; pfBuf walks the buffer linearly
+    {
+        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+        {
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
+            {
+                _simd_store_ps(pfBuf, valZ);   // one SIMD vector of depth per iteration
+                pfBuf += KNOB_SIMD_WIDTH;
+            }
+        }
+    }
+}
+
+void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)  // fill a stencil hot tile's buffer with its 8-bit clear value.
+{
+    // narrow clearData[0] to U8. NOTE(review): the original comment says the source is F32 - confirm clearData's element type.
+    uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
+    // broadcast the byte into a SIMD integer register (32x for a __m256i)...
+    simdscalari valS = _simd_set1_epi8(clearVal);
+
+    simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;   // write cursor; advanced one SIMD register per store
+    uint32_t numSamples = pHotTile->numSamples;
+
+    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)   // row/col only count tiles; pBuf walks the buffer linearly
+    {
+        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+        {
+            // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
+            {
+                _simd_store_si(pBuf, valS);
+                pBuf += 1;
+            }
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief InitializeHotTiles
+/// for draw calls, we initialize the active hot tiles and perform deferred
+/// load on them if tile is in invalid state. we do this in the outer thread
+/// loop instead of inside the draw routine itself mainly for performance,
+/// to avoid unnecessary setup every triangle
+/// @todo support deferred clear
+/// @param pContext,pDC,macroID - SWR context, draw context, and ID of the macrotile whose enabled hot tiles are initialized.
+void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID)
+{
+    const API_STATE& state = GetApiState(pDC);
+    HotTileMgr *pHotTileMgr = pContext->pHotTileMgr;   // NOTE(review): not referenced below; GetHotTile is called on 'this'
+
+    uint32_t x, y;
+    MacroTileMgr::getTileIndices(macroID, x, y);
+    x *= KNOB_MACROTILE_X_DIM;   // scale tile indices by the macrotile dimensions; x, y are passed to pfnLoadTile
+    y *= KNOB_MACROTILE_Y_DIM;
+
+    uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
+
+    // check RT if enabled
+    unsigned long rtSlot = 0;
+    uint32_t colorHottileEnableMask = state.colorHottileEnable;
+    while (_BitScanForward(&rtSlot, colorHottileEnableMask))   // visit each set bit (enabled color attachment) of the mask
+    {
+        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
+
+        if (pHotTile->state == HOTTILE_INVALID)
+        {
+            RDTSC_START(BELoadTiles);
+            // invalid hottile before draw requires a load from surface before we can draw to it
+            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+        else if (pHotTile->state == HOTTILE_CLEAR)
+        {
+            RDTSC_START(BELoadTiles);   // clears are timed under the same BELoadTiles bucket as loads
+            // Clear the tile.
+            ClearColorHotTile(pHotTile);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+        colorHottileEnableMask &= ~(1 << rtSlot);   // drop the bit just handled so the scan advances
+    }
+
+    // check depth if enabled
+    if (state.depthHottileEnable)
+    {
+        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
+        if (pHotTile->state == HOTTILE_INVALID)
+        {
+            RDTSC_START(BELoadTiles);
+            // invalid hottile before draw requires a load from surface before we can draw to it
+            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+        else if (pHotTile->state == HOTTILE_CLEAR)
+        {
+            RDTSC_START(BELoadTiles);
+            // Clear the tile.
+            ClearDepthHotTile(pHotTile);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+    }
+
+    // check stencil if enabled
+    if (state.stencilHottileEnable)
+    {
+        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
+        if (pHotTile->state == HOTTILE_INVALID)
+        {
+            RDTSC_START(BELoadTiles);
+            // invalid hottile before draw requires a load from surface before we can draw to it
+            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+        else if (pHotTile->state == HOTTILE_CLEAR)
+        {
+            RDTSC_START(BELoadTiles);
+            // Clear the tile.
+            ClearStencilHotTile(pHotTile);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+    }
+}
\ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index b5eaaab63a3..a2dae46e139 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -293,95 +293,14 @@ public:
         }
     }
 
-    HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1, 
-        uint32_t renderTargetArrayIndex = 0)
-    {
-        uint32_t x, y;
-        MacroTileMgr::getTileIndices(macroID, x, y);
+    void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID);
 
-        SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
-        SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+    HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
+        uint32_t renderTargetArrayIndex = 0);
 
-        HotTileSet &tile = mHotTiles[x][y];
-        HOTTILE& hotTile = tile.Attachment[attachment];
-        if (hotTile.pBuffer == NULL)
-        {
-            if (create)
-            {
-                uint32_t size = numSamples * mHotTileSize[attachment];
-                hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
-                hotTile.state = HOTTILE_INVALID;
-                hotTile.numSamples = numSamples;
-                hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
-            }
-            else
-            {
-                return NULL;
-            }
-        }
-        else
-        {
-            // free the old tile and create a new one with enough space to hold all samples
-            if (numSamples > hotTile.numSamples)
-            {
-                // tile should be either uninitialized or resolved if we're deleting and switching to a 
-                // new sample count
-                SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
-                       (hotTile.state == HOTTILE_RESOLVED) || 
-                       (hotTile.state == HOTTILE_CLEAR));
-                _aligned_free(hotTile.pBuffer);
-
-                uint32_t size = numSamples * mHotTileSize[attachment];
-                hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
-                hotTile.state = HOTTILE_INVALID;
-                hotTile.numSamples = numSamples;
-            }
-
-            // if requested render target array index isn't currently loaded, need to store out the current hottile 
-            // and load the requested array slice
-            if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
-            {
-                SWR_FORMAT format;
-                switch (attachment)
-                {
-                case SWR_ATTACHMENT_COLOR0:
-                case SWR_ATTACHMENT_COLOR1:
-                case SWR_ATTACHMENT_COLOR2:
-                case SWR_ATTACHMENT_COLOR3:
-                case SWR_ATTACHMENT_COLOR4:
-                case SWR_ATTACHMENT_COLOR5:
-                case SWR_ATTACHMENT_COLOR6:
-                case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
-                case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
-                case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
-                default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
-                }
-
-                if (hotTile.state == HOTTILE_DIRTY)
-                {
-                    pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
-                        x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
-                }
-
-                pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
-                    x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
-
-                hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
-                hotTile.state = HOTTILE_DIRTY;
-            }
-        }
-        return &tile.Attachment[attachment];
-    }
-
-    HotTileSet &GetHotTile(uint32_t macroID)
-    {
-        uint32_t x, y;
-        MacroTileMgr::getTileIndices(macroID, x, y);
-        SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
-        SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
-
-        return mHotTiles[x][y];
-    }
+    static void ClearColorHotTile(const HOTTILE* pHotTile);
+    static void ClearDepthHotTile(const HOTTILE* pHotTile);
+    static void ClearStencilHotTile(const HOTTILE* pHotTile);
 
 private:
     HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];

From 13303f3320758220560e92450e3e5264ce11e792 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Wed, 24 Feb 2016 19:03:33 -0600
Subject: [PATCH 015/238] swr: [rasterizer core] store blend output in
 temporary instead of PS output.

Fixes additive blend problem with MSAA
---
 .../drivers/swr/rasterizer/core/backend.cpp       | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 4a472bc9e5c..8c1858b9291 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -766,6 +766,8 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
     static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
     uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample);
+    simdvector blendOut;
+
     for(uint32_t rt = 0; rt < NumRT; ++rt)
     {
         uint8_t *pColorSample;
@@ -779,6 +781,9 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
         }
 
         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+        // pfnBlendFunc may not update all channels.  Initialize with PS output.
+        /// TODO: move this into the blend JIT.
+        blendOut = psContext.shaded[rt];
 
         // Blend outputs and update coverage mask for alpha test
         if(pfnBlendFunc[rt] != nullptr)
@@ -789,7 +794,7 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
                 psContext.shaded[1],
                 sample,
                 pColorSample,
-                psContext.shaded[rt],
+                blendOut,
                 &psContext.oMask,
                 (simdscalari*)&coverageMask);
         }
@@ -805,19 +810,19 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
         // store with color mask
         if(!pRTBlend->writeDisableRed)
         {
-            _simd_maskstore_ps((float*)pColorSample, outputMask, psContext.shaded[rt].x);
+            _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
         }
         if(!pRTBlend->writeDisableGreen)
         {
-            _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, psContext.shaded[rt].y);
+            _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
         }
         if(!pRTBlend->writeDisableBlue)
         {
-            _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, psContext.shaded[rt].z);
+            _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
         }
         if(!pRTBlend->writeDisableAlpha)
         {
-            _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, psContext.shaded[rt].w);
+            _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
         }
     }
 }

From abd4aa68cc1a7d8a20547069c617388eedb3673e Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Wed, 24 Feb 2016 13:34:50 -0600
Subject: [PATCH 016/238] swr: [rasterizer core] backend reorganization

---
 .../swr/rasterizer/common/simdintrin.h        |  62 ++++++
 .../drivers/swr/rasterizer/core/api.cpp       |  38 ++--
 .../drivers/swr/rasterizer/core/backend.cpp   | 186 ++----------------
 .../drivers/swr/rasterizer/core/backend.h     | 171 +++++++++++++++-
 .../drivers/swr/rasterizer/core/context.h     |   1 +
 .../swr/rasterizer/core/rasterizer.cpp        |   8 +-
 .../drivers/swr/rasterizer/core/state.h       |   8 +-
 .../swr/rasterizer/memory/tilingtraits.h      |  58 +-----
 8 files changed, 276 insertions(+), 256 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 8fa6d9ef408..90220943fee 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -136,6 +136,8 @@ __m256i func(__m256i a, __m256i b)\
 #define _simd_add_epi8 _simdemu_add_epi8
 #define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
 #define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
+#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
+#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
 #define _simd_movemask_epi8 _simdemu_movemask_epi8
 
 SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
@@ -158,6 +160,8 @@ SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
 SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
 SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
 SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
+SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
+SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
 
 #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
 #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
@@ -295,6 +299,8 @@ int _simdemu_movemask_epi8(__m256i a)
 
 #define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
 #define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
+#define _simd_cmpgt_epi8  _mm256_cmpgt_epi8
+#define _simd_cmpgt_epi16  _mm256_cmpgt_epi16
 #define _simd_movemask_epi8 _mm256_movemask_epi8
 #endif
 
@@ -783,5 +789,61 @@ static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, cons
     return vplaneps(vA, vB, vC, vI, vJ);
 }
 
+// Emulates _pdep_u32 when not building for AVX2: deposits the low-order bits
+// of 'a', lowest first, into the bit positions that are set in 'mask'.
+INLINE
+UINT pdep_u32(UINT a, UINT mask)
+{
+#if KNOB_ARCH==KNOB_ARCH_AVX2
+    return _pdep_u32(a, mask);
+#else
+    UINT result = 0;
+    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
+    // using bsf instead of funky loop
+    DWORD maskIndex;
+    while (_BitScanForward(&maskIndex, mask))
+    {
+        // 1. isolate lowest set bit of mask; shift an unsigned one so that
+        //    maskIndex == 31 never shifts into the sign bit of a signed int
+        const UINT lowest = (UINT)1 << maskIndex;
+
+        // 2. broadcast the LSB of src to every bit (0 or ~0) without relying
+        //    on an arithmetic right shift of a negative signed value
+        const UINT LSB = (UINT)0 - (a & (UINT)1);
+
+        // 3. deposit that source bit at the position of the mask bit
+        result |= LSB & lowest;
+
+        // 4. clear lowest bit, then 5. advance to the next source bit
+        mask &= ~lowest;
+        a >>= 1;
+    }
+    return result;
+#endif
+}
+
+// Emulates _pext_u32 when not building for AVX2: gathers the bits of 'a'
+// selected by 'mask' and packs them into the low-order bits of the result.
+INLINE
+UINT pext_u32(UINT a, UINT mask)
+{
+#if KNOB_ARCH==KNOB_ARCH_AVX2
+    return _pext_u32(a, mask);
+#else
+    UINT result = 0;
+    DWORD maskIndex;
+    uint32_t currentBit = 0;
+    while (_BitScanForward(&maskIndex, mask))
+    {
+        // 1. isolate lowest set bit of mask (unsigned shift, so bit 31 is fine)
+        const UINT lowest = (UINT)1 << maskIndex;
+        // 2. copy the selected bit of 'a' to the next free result bit (as UINT, not int)
+        result |= (UINT)((a & lowest) != 0) << currentBit++;
+        // 3. clear lowest bit
+        mask &= ~lowest;
+    }
+    return result;
+#endif
+}
 
 #endif//__SWR_SIMDINTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index e18f9e7a811..f2061e6d1b2 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -721,16 +721,25 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
         pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
     }
 }
-
+// templated backend function tables
+extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
+extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
+extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
+extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
+extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
+extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
+extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
+extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
 void SetupPipeline(DRAW_CONTEXT *pDC)
 {
     DRAW_STATE* pState = pDC->pState;
     const SWR_RASTSTATE &rastState = pState->state.rastState;
+    const SWR_PS_STATE &psState = pState->state.psState;
     BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
     const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
 
     // setup backend
-    if (pState->state.psState.pfnPixelShader == nullptr)
+    if (psState.pfnPixelShader == nullptr)
     {
         backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
         // always need to generate I & J per sample for Z interpolation
@@ -739,41 +748,40 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
     else
     {
         const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
-        const uint32_t centroid = ((pState->state.psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
+        const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
 
         // currently only support 'normal' input coverage
-        SWR_ASSERT(pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
-                   pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
+        SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
+                   psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
      
-        SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)pState->state.psState.barycentricsMask;
+        SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
         
         // select backend function
-        switch(pState->state.psState.shadingRate)
+        switch(psState.shadingRate)
         {
         case SWR_SHADING_RATE_PIXEL:
             if(bMultisampleEnable)
             {
                 // always need to generate I & J per sample for Z interpolation
                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][pState->state.psState.inputCoverage][centroid][forcedSampleCount];
-                backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount];
+                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
+                backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
             }
             else
             {
                 // always need to generate I & J per pixel for Z interpolation
                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
-                backendFuncs.pfnBackend = gBackendSingleSample[pState->state.psState.inputCoverage][centroid];
-                backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][SWR_MULTISAMPLE_1X];
+                backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
+                backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
             }
             break;
         case SWR_SHADING_RATE_SAMPLE:
             SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
             // always need to generate I & J per sample for Z interpolation
             barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-            backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][pState->state.psState.inputCoverage][centroid];
-            backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount];
+            backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
+            backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
             break;
-        case SWR_SHADING_RATE_COARSE:
         default:
             SWR_ASSERT(0 && "Invalid shading rate");
             break;
@@ -864,7 +872,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
 
     uint32_t numRTs = pState->state.psState.numRenderTargets;
     pState->state.colorHottileEnable = 0;
-    if(pState->state.psState.pfnPixelShader != nullptr)
+    if (psState.pfnPixelShader != nullptr)
     {
         for (uint32_t rt = 0; rt < numRTs; ++rt)
         {
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 8c1858b9291..b8f1e5aad82 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -418,11 +418,10 @@ void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t mac
 }
 
 #if KNOB_SIMD_WIDTH == 8
-const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 };
-const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 };
-const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
-#define MASK 0xff
+const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
+const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
+const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
+const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
 #else
 #error Unsupported vector width
 #endif
@@ -457,155 +456,6 @@ simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscala
     return _simd_movemask_ps(vClipMask);
 }
 
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
-{
-
-    // will need to update for avx512
-    assert(KNOB_SIMD_WIDTH == 8);
-
-    __m256i mask[2];
-    __m256i sampleCoverage[2];
-    if(bIsStandardPattern)
-    {
-        __m256i src = _mm256_set1_epi32(0);
-        __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
-
-        if(MultisampleTraits<sampleCountT>::numSamples == 1)
-        {
-            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
-        {
-            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
-        {
-            mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
-        {
-            mask[0] = _mm256_set1_epi32(-1);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
-        {
-            mask[0] = _mm256_set1_epi32(-1);
-            mask[1] = _mm256_set1_epi32(-1);
-            index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
-        }
-
-        // gather coverage for samples 0-7
-        sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
-        if(MultisampleTraits<sampleCountT>::numSamples > 8)
-        {
-            // gather coverage for samples 8-15
-            sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
-        }
-    }
-    else
-    {
-        // center coverage is the same for all samples; just broadcast to the sample slots
-        uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
-        if(MultisampleTraits<sampleCountT>::numSamples == 1)
-        {
-            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
-        {
-            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
-        {
-            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
-        {
-            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
-        {
-            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
-            sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
-        }
-    }
-
-    mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
-                              -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
-    // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
-    __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
-
-    __m256i packedCoverage1;
-    if(MultisampleTraits<sampleCountT>::numSamples > 8)
-    {
-        // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
-        packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
-    }
-
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
-    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
-    __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
-    __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-    packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
-
-    __m256i packedSampleCoverage;
-    if(MultisampleTraits<sampleCountT>::numSamples > 8)
-    {
-        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-        hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
-        shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-        shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
-        packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
-        packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
-    }
-    else
-    {
-        packedSampleCoverage = packedCoverage0;
-    }
-#else
-    __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
-    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
-    packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
-
-    __m256i packedSampleCoverage;
-    if(MultisampleTraits<sampleCountT>::numSamples > 8)
-    {
-        permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
-        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-        packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
-
-        // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
-        packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
-    }
-    else
-    {
-        packedSampleCoverage = packedCoverage0;
-    }
-#endif
-
-    for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
-    {
-        // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
-        inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
-
-        if(!bForcedSampleCount)
-        {
-            // input coverage has to be anded with sample mask if MSAA isn't forced on
-            inputMask[i] &= sampleMask;
-        }
-
-        // shift to the next pixel in the 4x2
-        packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
-    }
-}
-
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
-{
-    uint32_t inputMask[KNOB_SIMD_WIDTH]; 
-    generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
-    inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
-}
-
 template<bool perspMask>
 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
 {
@@ -889,9 +739,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
         // UL pixel corner
-        psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
         // pixel center
-        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
 
         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
@@ -903,9 +753,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
             if(coverageMask & MASK)
             {
                 RDTSC_START(BEBarycentric);
-                psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+                psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
                 // pixel center
-                psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+                psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
                 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
 
@@ -1082,15 +932,15 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
         // UL pixel corner
-        psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
         // pixel center
-        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
         
         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
-            psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
             // pixel center
-            psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+            psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
             RDTSC_START(BEBarycentric);
             backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
@@ -1318,14 +1168,14 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
     
     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
-        psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
-        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
             simdscalar vZ[MultisampleTraits<sampleCount>::numSamples];
-            psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
             // set pixel center positions
-            psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+            psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
             if (bInputCoverage)
             {
@@ -1585,12 +1435,12 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
         // UL pixel corner
-        simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+        simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
 
         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
             // UL pixel corners
-            simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
 
             // iterate over active samples
             unsigned long sample = 0;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 53089e5047b..91b8cccf3ac 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -29,7 +29,8 @@
 #pragma once
 
 #include "common/os.h"
-#include "core/context.h" 
+#include "core/context.h"
+#include "core/multisample.h"
 
 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
@@ -39,6 +40,9 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
 void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
 void InitClearTilesTable();
+simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
+void InitBackendFuncTables();
+void InitCPSFuncTables();
 
 enum SWR_BACKEND_FUNCS
 {
@@ -47,13 +51,160 @@ enum SWR_BACKEND_FUNCS
     SWR_BACKEND_MSAA_SAMPLE_RATE,
     SWR_BACKEND_FUNCS_MAX,
 };
-void InitBackendFuncTables();
 
-extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
-extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
-extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
-extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
+#if KNOB_SIMD_WIDTH == 8
+extern const __m256 vCenterOffsetsX;
+extern const __m256 vCenterOffsetsY;
+extern const __m256 vULOffsetsX;
+extern const __m256 vULOffsetsY;
+#define MASK 0xff
+#endif
+
+template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+{
+    // Fills inputMask[i] with the combined per-sample coverage bits of pixel i of the 4x2 (see final loop).
+    // will need to update for avx512
+    assert(KNOB_SIMD_WIDTH == 8);
+
+    __m256i mask[2];
+    __m256i sampleCoverage[2];
+    if(bIsStandardPattern)
+    {
+        __m256i src = _mm256_set1_epi32(0);
+        __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
+
+        if(MultisampleTraits<sampleCountT>::numSamples == 1)
+        {
+            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+        {
+            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+        {
+            mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+        {
+            mask[0] = _mm256_set1_epi32(-1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+        {
+            mask[0] = _mm256_set1_epi32(-1);
+            mask[1] = _mm256_set1_epi32(-1);
+            index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+        }
+
+        // gather coverage for samples 0-7 (scale 8: one uint64_t coverageMask entry per sample)
+        sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
+        if(MultisampleTraits<sampleCountT>::numSamples > 8)
+        {
+            // gather coverage for samples 8-15 (index1/mask[1] are only set up in the 16 sample case above)
+            sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+        }
+    }
+    else
+    {
+        // center coverage is the same for all samples; just broadcast to the sample slots
+        uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
+        if(MultisampleTraits<sampleCountT>::numSamples == 1)
+        {
+            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+        {
+            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+        {
+            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+        {
+            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+        {
+            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+            sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
+        }
+    }
+
+    mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
+                              -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
+    // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
+    __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+
+    __m256i packedCoverage1;
+    if(MultisampleTraits<sampleCountT>::numSamples > 8)
+    {
+        // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
+        packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
+    }
+
+#if (KNOB_ARCH == KNOB_ARCH_AVX)
+    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+    __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+    __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+    packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
+
+    __m256i packedSampleCoverage;
+    if(MultisampleTraits<sampleCountT>::numSamples > 8)
+    {
+        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+        hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
+        shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+        shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
+        packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
+        packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
+    }
+    else
+    {
+        packedSampleCoverage = packedCoverage0;
+    }
+#else
+    __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+    packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
+
+    __m256i packedSampleCoverage;
+    if(MultisampleTraits<sampleCountT>::numSamples > 8)
+    {
+        permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
+        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+        packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
+
+        // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
+        packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
+    }
+    else
+    {
+        packedSampleCoverage = packedCoverage0;
+    }
+#endif
+
+    for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
+    {
+        // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
+        inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
+
+        if(!bForcedSampleCount)
+        {
+            // input coverage has to be anded with sample mask if MSAA isn't forced on
+            inputMask[i] &= sampleMask;
+        }
+
+        // shift to the next pixel in the 4x2 (brings the next lower coverage bit to the position movemask reads)
+        packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
+    }
+}
+
+template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
+{
+    uint32_t perPixel[KNOB_SIMD_WIDTH]; // scratch: one mask per pixel, filled by the array overload
+    generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, perPixel, sampleMask);
+    inputCoverage = _simd_castsi_ps(_mm256_set_epi32(perPixel[7], perPixel[6], perPixel[5], perPixel[4], perPixel[3], perPixel[2], perPixel[1], perPixel[0])); // arguments are listed highest element first
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index d75d9754e57..523e7ac87ff 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -83,6 +83,7 @@ struct SWR_TRIANGLE_DESC
     float *pUserClipBuffer;
 
     uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
+    uint64_t anyCoveredSamples;
 
     TRI_FLAGS triFlags;
 };
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
index 07c9eedb50d..52fb7c88cdd 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
@@ -752,7 +752,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
 
         for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
         {
-            uint64_t anyCoveredSamples = 0;
+            triDesc.anyCoveredSamples = 0;
 
             // is the corner of the edge outside of the raster tile? (vEdge < 0)
             int mask0, mask1, mask2;
@@ -785,7 +785,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
                     triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
                     if ((mask0 & mask1 & mask2) == 0xf)
                     {
-                        anyCoveredSamples = triDesc.coverageMask[sampleNum];
+                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
                         // trivial accept, all 4 corners of all 3 edges are negative 
                         // i.e. raster tile completely inside triangle
                         RDTSC_EVENT(BETrivialAccept, 1, 0);
@@ -840,7 +840,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
                         }
                         RDTSC_STOP(BERasterizePartial, 0, 0);
 
-                        anyCoveredSamples |= triDesc.coverageMask[sampleNum]; 
+                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; 
                     }
                 }
                 else
@@ -861,7 +861,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
             }
             else
 #endif
-            if(anyCoveredSamples)
+            if(triDesc.anyCoveredSamples)
             {
                 RDTSC_START(BEPixelBackend);
                 backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index 2758555fd4b..a71eb6d7853 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -307,6 +307,8 @@ struct PixelPositions
     simdscalar centroid;
 };
 
+#define SWR_MAX_NUM_MULTISAMPLES 16
+
 //////////////////////////////////////////////////////////////////////////
 /// SWR_PS_CONTEXT
 /// @brief Input to pixel shader.
@@ -338,6 +340,7 @@ struct SWR_PS_CONTEXT
     uint32_t frontFace;         // IN: front- 1, back- 0
     uint32_t primID;            // IN: primitive ID
     uint32_t sampleIndex;       // IN: sampleIndex
+
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -748,7 +751,6 @@ struct SWR_RENDER_TARGET_BLEND_STATE
 };
 static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size");
 
-#define SWR_MAX_NUM_MULTISAMPLES 16
 enum SWR_MULTISAMPLE_COUNT
 {
     SWR_MULTISAMPLE_1X = 0,
@@ -786,6 +788,7 @@ typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsConte
 typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext);
 typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext);
 typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
+typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
 typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
 
 //////////////////////////////////////////////////////////////////////////
@@ -941,6 +944,7 @@ struct SWR_BACKEND_STATE
     uint8_t numComponents[KNOB_NUM_ATTRIBUTES];
 };
 
+
 union SWR_DEPTH_STENCIL_STATE
 {
     struct
@@ -980,7 +984,6 @@ enum SWR_SHADING_RATE
 {
     SWR_SHADING_RATE_PIXEL,
     SWR_SHADING_RATE_SAMPLE,
-    SWR_SHADING_RATE_COARSE,
     SWR_SHADING_RATE_MAX,
 };
 
@@ -1024,4 +1027,5 @@ struct SWR_PS_STATE
     uint32_t barycentricsMask   : 3;    // which type(s) of barycentric coords does the PS interpolate attributes with
     uint32_t usesUAV            : 1;    // pixel shader accesses UAV 
     uint32_t forceEarlyZ        : 1;    // force execution of early depth/stencil test
+
 };
diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
index 50f8e57c22a..381ac89a7b8 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
@@ -28,6 +28,7 @@
 #pragma once
 
 #include "core/state.h"
+#include "common/simdintrin.h"
 
 template<SWR_TILE_MODE mode, int>
 struct TilingTraits
@@ -130,63 +131,6 @@ template<int X> struct TilingTraits <SWR_TILE_MODE_WMAJOR, X>
     static UINT GetPdepY() { return 0x1ea; }
 };
 
-INLINE
-UINT pdep_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
-    return _pdep_u32(a, mask);
-#else
-    UINT result = 0;
-
-    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html 
-    // using bsf instead of funky loop
-    DWORD maskIndex;
-    while (_BitScanForward(&maskIndex, mask))
-    {
-        // 1. isolate lowest set bit of mask
-        const UINT lowest = 1 << maskIndex;
-
-        // 2. populate LSB from src
-        const UINT LSB = (UINT)((int)(a << 31) >> 31);
-
-        // 3. copy bit from mask
-        result |= LSB & lowest;
-
-        // 4. clear lowest bit
-        mask &= ~lowest;
-
-        // 5. prepare for next iteration
-        a >>= 1;
-    }
-
-    return result;
-#endif
-}
-
-INLINE
-UINT pext_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
-    return _pext_u32(a, mask);
-#else
-    UINT result = 0;
-    DWORD maskIndex;
-    uint32_t currentBit = 0;
-    while (_BitScanForward(&maskIndex, mask))
-    {
-        // 1. isolate lowest set bit of mask
-        const UINT lowest = 1 << maskIndex;
-
-        // 2. copy bit from mask
-        result |= ((a & lowest) > 0) << currentBit++;
-
-        // 3. clear lowest bit
-        mask &= ~lowest;
-    }
-    return result;
-#endif
-}
-
 //////////////////////////////////////////////////////////////////////////
 /// @brief Computes the tileID for 2D tiled surfaces
 /// @param pitch - surface pitch in bytes

From bfb954189e166cee8b748edc29f5751d0c97c608 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Mon, 29 Feb 2016 12:01:48 -0600
Subject: [PATCH 017/238] swr: [rasterizer] Add rdtsc buckets support for
 shaders

Pass pointer to core buckets mgr back to sim layer.

Add support for RDTSC_START/RDTSC_STOP macros in the builder.

Each unique shader now has its own bucket associated with it,
enabling more detailed reporting at the shader level. Currently,
due to an LLVM issue with thread-local storage, 64-bit runs require
single-threaded mode.
---
 .../swr/rasterizer/common/rdtsc_buckets.cpp   | 12 ++++++
 .../swr/rasterizer/common/rdtsc_buckets.h     |  7 ++++
 .../drivers/swr/rasterizer/core/api.cpp       |  7 +++-
 src/gallium/drivers/swr/rasterizer/core/api.h | 10 ++++-
 .../swr/rasterizer/jitter/builder_misc.cpp    | 38 +++++++++++++++++++
 .../swr/rasterizer/jitter/builder_misc.h      |  4 ++
 6 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
index 454641b2751..7b40dc44d5d 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
@@ -64,8 +64,10 @@ void BucketManager::RegisterThread(const std::string& name)
 
 UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc)
 {
+    mThreadMutex.lock();
     size_t id = mBuckets.size();
     mBuckets.push_back(desc);
+    mThreadMutex.unlock();
     return (UINT)id;
 }
 
@@ -186,3 +188,13 @@ void BucketManager::PrintReport(const std::string& filename)
         fclose(f);
     }
 }
+
+void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id)
+{
+    pBucketMgr->StartBucket(id);
+}
+
+void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id)
+{
+    pBucketMgr->StopBucket(id);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
index 99cb10ec6e8..de4dd8e9119 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
@@ -70,7 +70,9 @@ public:
     // removes all registered buckets
     void ClearBuckets()
     {
+        mThreadMutex.lock();
         mBuckets.clear();
+        mThreadMutex.unlock();
     }
 
     /// Registers a new thread with the manager.
@@ -227,3 +229,8 @@ private:
     bool mThreadViz{ false };
     std::string mThreadVizDir;
 };
+
+
+// C helpers for jitter
+void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id);
+void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id);
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index f2061e6d1b2..15dc534da72 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -49,7 +49,7 @@ void SetupDefaultState(SWR_CONTEXT *pContext);
 /// @brief Create SWR Context.
 /// @param pCreateInfo - pointer to creation info.
 HANDLE SwrCreateContext(
-    const SWR_CREATECONTEXT_INFO* pCreateInfo)
+    SWR_CREATECONTEXT_INFO* pCreateInfo)
 {
     RDTSC_RESET();
     RDTSC_INIT(0);
@@ -118,6 +118,11 @@ HANDLE SwrCreateContext(
     pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
     pContext->pfnClearTile = pCreateInfo->pfnClearTile;
 
+    // pass pointer to bucket manager back to caller
+#ifdef KNOB_ENABLE_RDTSC
+    pCreateInfo->pBucketMgr = &gBucketMgr;
+#endif
+
     return (HANDLE)pContext;
 }
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index 72fae8b2c21..9c046776bb8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -77,6 +77,8 @@ typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext,
     SWR_RENDERTARGET_ATTACHMENT rtIndex,
     uint32_t x, uint32_t y, const float* pClearColor);
 
+class BucketManager;
+
 //////////////////////////////////////////////////////////////////////////
 /// SWR_CREATECONTEXT_INFO
 /////////////////////////////////////////////////////////////////////////
@@ -91,10 +93,14 @@ struct SWR_CREATECONTEXT_INFO
     // Each SWR context can have multiple sets of active state
     uint32_t maxSubContexts;
 
-    // tile manipulation functions
+    // Tile manipulation functions
     PFN_LOAD_TILE pfnLoadTile;
     PFN_STORE_TILE pfnStoreTile;
     PFN_CLEAR_TILE pfnClearTile;
+
+    // Pointer to rdtsc buckets mgr returned to the caller.
+    // Only populated when KNOB_ENABLE_RDTSC is set
+    BucketManager* pBucketMgr;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -112,7 +118,7 @@ struct SWR_RECT
 /// @brief Create SWR Context.
 /// @param pCreateInfo - pointer to creation info.
 HANDLE SWR_API SwrCreateContext(
-    const SWR_CREATECONTEXT_INFO* pCreateInfo);
+    SWR_CREATECONTEXT_INFO* pCreateInfo);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Destroys SWR Context.
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 7ebaca05151..f18a9902c8c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -28,6 +28,8 @@
 * 
 ******************************************************************************/
 #include "builder.h"
+#include "common/rdtsc_buckets.h"
+
 #include "llvm/Support/DynamicLibrary.h"
 
 void __cdecl CallPrint(const char* fmt, ...);
@@ -1447,3 +1449,39 @@ Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
     return VSHUFFLE(a, inter, ConstantVector::get(idx2));
 #endif
 }
+
+// rdtsc buckets macros: emit a call to the C helper BucketManager_StartBucket(pBucketMgr, pId), adding its address to DynamicLibrary if the symbol is not found
+void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
+{
+    std::vector<Type*> args{
+        PointerType::get(mInt32Ty, 0),   // pBucketMgr (typed as a pointer to mInt32Ty here, not as BucketManager*)
+        mInt32Ty                        // id of the bucket to start
+    };
+
+    FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
+    Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
+    if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
+    {
+        sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
+    }
+
+    CALL(pFunc, { pBucketMgr, pId });
+}
+
+void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
+{
+    std::vector<Type*> args{
+        PointerType::get(mInt32Ty, 0),   // pBucketMgr (typed as a pointer to mInt32Ty here, not as BucketManager*)
+        mInt32Ty                        // id of the bucket to stop
+    };
+
+    FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
+    Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
+    if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
+    {
+        sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
+    }
+
+    CALL(pFunc, { pBucketMgr, pId });
+}
+
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 48e0558c4dd..172550e28b1 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -147,3 +147,7 @@ Value* INT3() { return INTERRUPT(C((uint8_t)3)); }
 
 Value *VEXTRACTI128(Value* a, Constant* imm8);
 Value *VINSERTI128(Value* a, Value* b, Constant* imm8);
+
+// rdtsc buckets macros
+void RDTSC_START(Value* pBucketMgr, Value* pId);
+void RDTSC_STOP(Value* pBucketMgr, Value* pId);

From aca55131843dec6da27f76308b2b4a145fc9e152 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 3 Mar 2016 18:19:45 -0600
Subject: [PATCH 018/238] swr: [rasterizer jitter] vpermps support

---
 .../swr/rasterizer/common/simdintrin.h        | 32 ++++++++++++
 .../swr/rasterizer/jitter/builder_misc.cpp    | 51 ++++++++++++++++++-
 .../swr/rasterizer/jitter/builder_misc.h      |  1 +
 .../jitter/scripts/gen_llvm_ir_macros.py      |  1 +
 4 files changed, 84 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 90220943fee..9ba28177257 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -115,6 +115,30 @@ __m256i func(__m256i a, __m256i b)\
 }
 
 #if (KNOB_ARCH == KNOB_ARCH_AVX)
+INLINE
+__m256 _simdemu_permute_ps(__m256 a, __m256i b)
+{
+    __m128 aHi = _mm256_extractf128_ps(a, 1);
+    __m128i bHi = _mm256_extractf128_si256(b, 1);
+    __m128 aLo = _mm256_castps256_ps128(a);
+    __m128i bLo = _mm256_castsi256_si128(b);
+
+    __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3));
+    __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
+    __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
+    __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
+
+    indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3));
+    resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
+    resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
+    __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
+
+    __m256 result = _mm256_castps128_ps256(blendLowRes);
+    result = _mm256_insertf128_ps(result, blendHiRes, 1);
+
+    return result;
+}
+
 #define _simd_mul_epi32 _simdemu_mul_epi32
 #define _simd_mullo_epi32 _simdemu_mullo_epi32
 #define _simd_sub_epi32 _simdemu_sub_epi32
@@ -137,8 +161,11 @@ __m256i func(__m256i a, __m256i b)\
 #define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
 #define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
 #define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
+#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
 #define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
+#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
 #define _simd_movemask_epi8 _simdemu_movemask_epi8
+#define _simd_permute_ps _simdemu_permute_ps
 
 SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
 SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
@@ -161,7 +188,9 @@ SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
 SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
 SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
 SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
+SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
 SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
+SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
 
 #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
 #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
@@ -300,8 +329,11 @@ int _simdemu_movemask_epi8(__m256i a)
 #define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
 #define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
 #define _simd_cmpgt_epi8  _mm256_cmpgt_epi8
+#define _simd_cmpeq_epi8  _mm256_cmpeq_epi8
 #define _simd_cmpgt_epi16  _mm256_cmpgt_epi16
+#define _simd_cmpeq_epi16  _mm256_cmpeq_epi16
 #define _simd_movemask_epi8 _mm256_movemask_epi8
+#define _simd_permute_ps _mm256_permutevar8x32_ps
 #endif
 
 #define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm))
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index f18a9902c8c..b55752c1025 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -776,11 +776,60 @@ Value *Builder::PERMD(Value* a, Value* idx)
     }
     else
     {
-        res = VSHUFFLE(a, a, idx);
+        if (isa<Constant>(idx))
+        {
+            res = VSHUFFLE(a, a, idx);
+        }
+        else
+        {
+            res = VUNDEF_I();
+            for (uint32_t l = 0; l < JM()->mVWidth; ++l)
+            {
+                Value* pIndex = VEXTRACT(idx, C(l));
+                Value* pVal = VEXTRACT(a, pIndex);
+                res = VINSERT(res, pVal, C(l));
+            }
+        }
     }
     return res;
 }
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPERMPS operation (shuffle 32 bit float values 
+/// across 128 bit lanes) in LLVM IR.  If not supported on the underlying 
+/// platform, emulate it: a constant idx goes through VSHUFFLE, otherwise each lane is extracted and inserted one at a time.
+/// @param a - 256bit SIMD lane(8x32bit) of float values.
+/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
+Value *Builder::PERMPS(Value* a, Value* idx)
+{
+    Value* res;
+    // use avx2 permute instruction if available
+    if (JM()->mArch.AVX2())
+    {
+        // llvm 3.6.0 swapped the order of the args to vpermd; the VPERMPS wrapper is declared the same way, taking (idx, a)
+        res = VPERMPS(idx, a);
+    }
+    else
+    {
+        if (isa<Constant>(idx))
+        {
+            res = VSHUFFLE(a, a, idx);
+        }
+        else
+        {
+            res = VUNDEF_F();
+            for (uint32_t l = 0; l < JM()->mVWidth; ++l)
+            {
+                Value* pIndex = VEXTRACT(idx, C(l));
+                Value* pVal = VEXTRACT(a, pIndex);
+                res = VINSERT(res, pVal, C(l));
+            }
+        }
+    }
+
+    return res;
+}
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
 /// in LLVM IR.  If not supported on the underlying platform, emulate it
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 172550e28b1..18c30a2891f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -115,6 +115,7 @@ Value *PSHUFB(Value* a, Value* b);
 Value *PMOVSXBD(Value* a);
 Value *PMOVSXWD(Value* a);
 Value *PERMD(Value* a, Value* idx);
+Value *PERMPS(Value* a, Value* idx);
 Value *CVTPH2PS(Value* a);
 Value *CVTPS2PH(Value* a, Value* rounding);
 Value *PMAXSD(Value* a, Value* b);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
index 1814b7c8d5f..c78c9784b3d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
@@ -103,6 +103,7 @@ intrinsics = [
         ["VPMOVSXBD", "x86_avx2_pmovsxbd", ["a"]],  # sign extend packed 8bit components
         ["VPMOVSXWD", "x86_avx2_pmovsxwd", ["a"]],  # sign extend packed 16bit components
         ["VPERMD", "x86_avx2_permd", ["idx", "a"]],
+        ["VPERMPS", "x86_avx2_permps", ["idx", "a"]],
         ["VCVTPH2PS", "x86_vcvtph2ps_256", ["a"]],
         ["VCVTPS2PH", "x86_vcvtps2ph_256", ["a", "round"]],
         ["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]],

From 49678803f79ee097749a5b2423b82bd50c6ea430 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 24 Mar 2016 11:07:15 -0500
Subject: [PATCH 019/238] swr: [rasterizer common] remove old unused win32
 types

---
 src/gallium/drivers/swr/rasterizer/common/os.h | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 265b879e1cb..140d6129322 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -77,18 +77,12 @@
 
 typedef void			VOID;
 typedef void*           LPVOID;
-typedef CARD8			BOOL;
-typedef wchar_t			WCHAR;
-typedef uint16_t		UINT16;
 typedef int				INT;
 typedef unsigned int	UINT;
-typedef uint32_t		UINT32;
 typedef uint64_t		UINT64;
-typedef int64_t		    INT64;
 typedef void*			HANDLE;
 typedef float			FLOAT;
 typedef int			    LONG;
-typedef CARD8		    BYTE;
 typedef unsigned char   UCHAR;
 typedef unsigned int	DWORD;
 

From 1da9c8a970207b5aac96b3161706041e781124f6 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 24 Mar 2016 11:07:32 -0500
Subject: [PATCH 020/238] swr: [rasterizer core] don't assume linux is 64-bit

---
 src/gallium/drivers/swr/rasterizer/core/utils.h | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
index b9dc48c4fd7..45b0e20a91a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -46,8 +46,7 @@ void OpenBitmapFromFile(
     uint32_t *height);
 #endif
 
-/// @todo assume linux is always 64 bit
-#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__)
+#if defined(_WIN64) || defined(__x86_64__)
 #define _MM_INSERT_EPI64 _mm_insert_epi64
 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
 #else

From 45d52673c23197966f9b4d1fc302dba6b24c8d22 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Sat, 5 Mar 2016 00:53:04 -0600
Subject: [PATCH 021/238] swr: [rasterizer] add debug/perf category to knobs

---
 .../swr/rasterizer/scripts/knob_defs.py       | 49 ++++++++++++++-----
 1 file changed, 37 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 8c51e1e8e73..47ded8237cf 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -26,6 +26,7 @@ KNOBS = [
         'default'   : 'true',
         'desc'      : ['Use dialogs when asserts fire.',
                        'Asserts are only enabled in debug builds'],
+        'category'  : 'debug',
     }],
 
     ['SINGLE_THREADED', {
@@ -33,12 +34,14 @@ KNOBS = [
         'default'   : 'false',
         'desc'      : ['If enabled will perform all rendering on the API thread.',
                        'This is useful mainly for debugging purposes.'],
+        'category'  : 'debug',
     }],
 
     ['DUMP_SHADER_IR', {
-       'type'       : 'bool',
-       'default'    : 'false',
-       'desc'       : ['Dumps shader LLVM IR at various stages of jit compilation.'],
+        'type'      : 'bool',
+        'default'   : 'false',
+        'desc'      : ['Dumps shader LLVM IR at various stages of jit compilation.'],
+        'category'  : 'debug',
     }],
 
     ['USE_GENERIC_STORETILE', {
@@ -46,6 +49,7 @@ KNOBS = [
         'default'   : 'false',
         'desc'      : ['Always use generic function for performing StoreTile.',
                        'Will be slightly slower than using optimized (jitted) path'],
+        'category'  : 'debug',
     }],
 
     ['FAST_CLEAR', {
@@ -53,6 +57,7 @@ KNOBS = [
         'default'   : 'true',
         'desc'      : ['Replace 3D primitive execute with a SWRClearRT operation and',
                        'defer clear execution to first backend op on hottile, or hottile store'],
+        'category'  : 'perf',
     }],
 
     ['MAX_NUMA_NODES', {
@@ -61,6 +66,7 @@ KNOBS = [
         'desc'      : ['Maximum # of NUMA-nodes per system used for worker threads',
                        '  0 == ALL NUMA-nodes in the system',
                        '  N == Use at most N NUMA-nodes for rendering'],
+        'category'  : 'perf',
     }],
 
     ['MAX_CORES_PER_NUMA_NODE', {
@@ -69,6 +75,7 @@ KNOBS = [
         'desc'      : ['Maximum # of cores per NUMA-node used for worker threads.',
                        '  0 == ALL non-API thread cores per NUMA-node',
                        '  N == Use at most N cores per NUMA-node'],
+        'category'  : 'perf',
     }],
 
     ['MAX_THREADS_PER_CORE', {
@@ -77,6 +84,7 @@ KNOBS = [
         'desc'      : ['Maximum # of (hyper)threads per physical core used for worker threads.',
                        '  0 == ALL hyper-threads per core',
                        '  N == Use at most N hyper-threads per physical core'],
+        'category'  : 'perf',
     }],
 
     ['MAX_WORKER_THREADS', {
@@ -87,6 +95,7 @@ KNOBS = [
                        'IMPORTANT: If this is non-zero, no worker threads will be bound to',
                        'specific HW threads.  They will all be "floating" SW threads.',
                        'In this case, the above 3 KNOBS will be ignored.'],
+        'category'  : 'perf',
     }],
 
     ['BUCKETS_START_FRAME', {
@@ -96,6 +105,7 @@ KNOBS = [
                        '',
                        'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
                        'for this to have an effect.'],
+        'category'  : 'perf',
     }],
 
     ['BUCKETS_END_FRAME', {
@@ -105,6 +115,7 @@ KNOBS = [
                        '',
                        'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
                        'for this to have an effect.'],
+        'category'  : 'perf',
     }],
 
     ['WORKER_SPIN_LOOP_COUNT', {
@@ -112,46 +123,53 @@ KNOBS = [
         'default'   : '5000',
         'desc'      : ['Number of spin-loop iterations worker threads will perform',
                        'before going to sleep when waiting for work'],
+        'category'  : 'perf',
     }],
 
     ['MAX_DRAWS_IN_FLIGHT', {
         'type'      : 'uint32_t',
         'default'   : '160',
         'desc'      : ['Maximum number of draws outstanding before API thread blocks.'],
+        'category'  : 'perf',
     }],
 
     ['MAX_PRIMS_PER_DRAW', {
-       'type'       : 'uint32_t',
-       'default'    : '2040',
-       'desc'       : ['Maximum primitives in a single Draw().',
+        'type'      : 'uint32_t',
+        'default'   : '2040',
+        'desc'      : ['Maximum primitives in a single Draw().',
                        'Larger primitives are split into smaller Draw calls.',
                        'Should be a multiple of (3 * vectorWidth).'],
+        'category'  : 'perf',
     }],
 
     ['MAX_TESS_PRIMS_PER_DRAW', {
-       'type'       : 'uint32_t',
-       'default'    : '16',
-       'desc'       : ['Maximum primitives in a single Draw() with tessellation enabled.',
+        'type'      : 'uint32_t',
+        'default'   : '16',
+        'desc'      : ['Maximum primitives in a single Draw() with tessellation enabled.',
                        'Larger primitives are split into smaller Draw calls.',
                        'Should be a multiple of (vectorWidth).'],
+        'category'  : 'perf',
     }],
 
     ['MAX_FRAC_ODD_TESS_FACTOR', {
         'type'      : 'float',
         'default'   : '63.0f',
         'desc'      : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'],
+        'category'  : 'perf',
     }],
 
     ['MAX_FRAC_EVEN_TESS_FACTOR', {
         'type'      : 'float',
         'default'   : '64.0f',
         'desc'      : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'],
+        'category'  : 'perf',
     }],
 
     ['MAX_INTEGER_TESS_FACTOR', {
         'type'      : 'uint32_t',
         'default'   : '64',
         'desc'      : ['(DEBUG) Maximum tessellation factor for integer partitioning.'],
+        'category'  : 'perf',
     }],
 
 
@@ -159,12 +177,14 @@ KNOBS = [
         'type'      : 'bool',
         'default'   : 'false',
         'desc'      : ['Enable threadviz output.'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_DRAW', {
         'type'      : 'bool',
         'default'   : 'false',
         'desc'      : ['Disable per-draw/dispatch execution'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_QUEUE_FE', {
@@ -173,6 +193,7 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at worker FE',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_FETCH', {
@@ -181,6 +202,7 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at vertex fetch',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_IA', {
@@ -189,6 +211,7 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at input assembler',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_VS', {
@@ -197,6 +220,7 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at vertex shader',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_SETUP_TRIS', {
@@ -205,6 +229,7 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at primitive setup',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_BIN_TRIS', {
@@ -213,6 +238,7 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at primitive binning',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_RS', {
@@ -221,6 +247,5 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at rasterizer',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
-    }],
-
-]
+        'category'  : 'perf',
+    }],]

From 3252fe3705376063f94a7717c07b9824b5d43f46 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Mon, 7 Mar 2016 01:14:13 -0600
Subject: [PATCH 022/238] swr: [rasterizer] Fix Coverity issues reported by
 Mesa developers.

---
 .../swr/rasterizer/common/containers.hpp      | 246 +++++++++---------
 .../rasterizer/common/rdtsc_buckets_shared.h  |   4 +-
 .../drivers/swr/rasterizer/core/backend.cpp   |   2 +-
 .../drivers/swr/rasterizer/core/clip.h        |   6 +-
 src/gallium/drivers/swr/rasterizer/core/pa.h  |  92 +++----
 .../drivers/swr/rasterizer/core/tilemgr.h     |   7 +-
 .../drivers/swr/rasterizer/core/utils.h       |  10 +-
 .../swr/rasterizer/jitter/JitManager.h        |   1 -
 .../swr/rasterizer/jitter/builder_misc.cpp    |   2 +
 .../drivers/swr/rasterizer/memory/Convert.h   |  10 +-
 10 files changed, 193 insertions(+), 187 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp
index bc96c5f62fd..95af4387fcb 100644
--- a/src/gallium/drivers/swr/rasterizer/common/containers.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp
@@ -33,137 +33,137 @@ namespace SWRL
 template <typename T, int NUM_ELEMENTS>
 struct UncheckedFixedVector
 {
-	UncheckedFixedVector() : mSize(0)
-	{
-	}
+    UncheckedFixedVector() : mSize(0)
+    {
+    }
 
-	UncheckedFixedVector(std::size_t size, T const& exemplar)
-	{
-		this->mSize = 0;
-		for (std::size_t i = 0; i < size; ++i)
-			this->push_back(exemplar);
-	}
+    UncheckedFixedVector(std::size_t size, T const& exemplar)
+    {
+        this->mSize = 0;
+        for (std::size_t i = 0; i < size; ++i)
+            this->push_back(exemplar);
+    }
 
-	template <typename Iter>
-	UncheckedFixedVector(Iter fst, Iter lst)
-	{
-		this->mSize = 0;
-		for ( ; fst != lst; ++fst)
-			this->push_back(*fst);
-	}
+    template <typename Iter>
+    UncheckedFixedVector(Iter fst, Iter lst)
+    {
+        this->mSize = 0;
+        for ( ; fst != lst; ++fst)
+            this->push_back(*fst);
+    }
 
-	UncheckedFixedVector(UncheckedFixedVector const& UFV)
-	{
-		this->mSize = 0;
-		for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
-			(*this)[i] = UFV[i];
-		this->mSize = UFV.size();
-	}
+    UncheckedFixedVector(UncheckedFixedVector const& UFV)
+    {
+        this->mSize = 0;
+        for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
+            (*this)[i] = UFV[i];
+        this->mSize = UFV.size();
+    }
 
-	UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV)
-	{
-		for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
-			(*this)[i] = UFV[i];
-		this->mSize = UFV.size();
-		return *this;
-	}
+    UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV)
+    {
+        for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
+            (*this)[i] = UFV[i];
+        this->mSize = UFV.size();
+        return *this;
+    }
 
-	T* begin()	{ return &this->mElements[0]; }
-	T* end()	{ return &this->mElements[0] + this->mSize; }
-	T const* begin() const	{ return &this->mElements[0]; }
-	T const* end() const	{ return &this->mElements[0] + this->mSize; }
+    T* begin()	{ return &this->mElements[0]; }
+    T* end()	{ return &this->mElements[0] + this->mSize; }
+    T const* begin() const	{ return &this->mElements[0]; }
+    T const* end() const	{ return &this->mElements[0] + this->mSize; }
 
-	friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
-	{
-		if (L.size() != R.size()) return false;
-		for (std::size_t i = 0, N = L.size(); i < N; ++i)
-		{
-			if (L[i] != R[i]) return false;
-		}
-		return true;
-	}
+    friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
+    {
+        if (L.size() != R.size()) return false;
+        for (std::size_t i = 0, N = L.size(); i < N; ++i)
+        {
+            if (L[i] != R[i]) return false;
+        }
+        return true;
+    }
 
-	friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
-	{
-		if (L.size() != R.size()) return true;
-		for (std::size_t i = 0, N = L.size(); i < N; ++i)
-		{
-			if (L[i] != R[i]) return true;
-		}
-		return false;
-	}
+    friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
+    {
+        if (L.size() != R.size()) return true;
+        for (std::size_t i = 0, N = L.size(); i < N; ++i)
+        {
+            if (L[i] != R[i]) return true;
+        }
+        return false;
+    }
 
-	T& operator[](std::size_t idx)
-	{
-		return this->mElements[idx];
-	}
-	T const& operator[](std::size_t idx) const
-	{
-		return this->mElements[idx];
-	}
-	void push_back(T const& t)
-	{
-		this->mElements[this->mSize]	= t;
-		++this->mSize;
-	}
-	void pop_back()
-	{
-		SWR_ASSERT(this->mSize > 0);
-		--this->mSize;
-	}
-	T& back()
-	{
-		return this->mElements[this->mSize-1];
-	}
-	T const& back() const
-	{
-		return this->mElements[this->mSize-1];
-	}
-	bool empty() const
-	{
-		return this->mSize == 0;
-	}
-	std::size_t size() const
-	{
-		return this->mSize;
-	}
-	void resize(std::size_t sz)
-	{
-		this->mSize = sz;
-	}
-	void clear()
-	{
-		this->resize(0);
-	}
+    T& operator[](std::size_t idx)
+    {
+        return this->mElements[idx];
+    }
+    T const& operator[](std::size_t idx) const
+    {
+        return this->mElements[idx];
+    }
+    void push_back(T const& t)
+    {
+        this->mElements[this->mSize]	= t;
+        ++this->mSize;
+    }
+    void pop_back()
+    {
+        SWR_ASSERT(this->mSize > 0);
+        --this->mSize;
+    }
+    T& back()
+    {
+        return this->mElements[this->mSize-1];
+    }
+    T const& back() const
+    {
+        return this->mElements[this->mSize-1];
+    }
+    bool empty() const
+    {
+        return this->mSize == 0;
+    }
+    std::size_t size() const
+    {
+        return this->mSize;
+    }
+    void resize(std::size_t sz)
+    {
+        this->mSize = sz;
+    }
+    void clear()
+    {
+        this->resize(0);
+    }
 private:
-	std::size_t	mSize;
-	T			mElements[NUM_ELEMENTS];
+    std::size_t	mSize{ 0 };
+    T mElements[NUM_ELEMENTS];
 };
 
 template <typename T, int NUM_ELEMENTS>
 struct FixedStack : UncheckedFixedVector<T, NUM_ELEMENTS>
 {
-	FixedStack() {}
+    FixedStack() {}
 
-	void push(T const& t)
-	{
-		this->push_back(t);
-	}
+    void push(T const& t)
+    {
+        this->push_back(t);
+    }
 
-	void pop()
-	{
-		this->pop_back();
-	}
+    void pop()
+    {
+        this->pop_back();
+    }
 
-	T& top()
-	{
-		return this->back();
-	}
+    T& top()
+    {
+        return this->back();
+    }
 
-	T const& top() const
-	{
-		return this->back();
-	}
+    T const& top() const
+    {
+        return this->back();
+    }
 };
 
 template <typename T>
@@ -190,16 +190,16 @@ namespace std
 template <typename T, int N>
 struct hash<SWRL::UncheckedFixedVector<T, N>>
 {
-	size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const
-	{
-		if (v.size() == 0) return 0;
-		std::hash<T> H;
-		size_t x = H(v[0]);
-		if (v.size() == 1) return x;
-		for (size_t i = 1; i < v.size(); ++i)
-			x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2);
-		return x;
-	}
+    size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const
+    {
+        if (v.size() == 0) return 0;
+        std::hash<T> H;
+        size_t x = H(v[0]);
+        if (v.size() == 1) return x;
+        for (size_t i = 1; i < v.size(); ++i)
+            x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2);
+        return x;
+    }
 };
 
 
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
index 41c6d5dec79..34c322e5a85 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
@@ -64,13 +64,13 @@ struct BUCKET_THREAD
     std::string name;
 
     // id for this thread, assigned by the thread manager
-    uint32_t id;
+    uint32_t id{ 0 };
 
     // root of the bucket hierarchy for this thread
     BUCKET root;
 
     // currently executing bucket somewhere in the hierarchy
-    BUCKET* pCurrent;
+    BUCKET* pCurrent{ nullptr };
 
     // currently executing hierarchy level
     uint32_t level{ 0 };
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index b8f1e5aad82..aae1eac45a7 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -1172,7 +1172,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
         psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
-            simdscalar vZ[MultisampleTraits<sampleCount>::numSamples];
+            simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]{ 0 };
             psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
             // set pixel center positions
             psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 49494a4e374..b0b95d64f39 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -854,9 +854,9 @@ private:
         return vNumOutPts;
     }
 
-    const uint32_t workerId;
-    const DRIVER_TYPE driverType;
-    DRAW_CONTEXT* pDC;
+    const uint32_t workerId{ 0 };
+    const DRIVER_TYPE driverType{ DX };
+    DRAW_CONTEXT* pDC{ nullptr };
     const API_STATE& state;
     simdscalar clipCodes[NumVertsPerPrim];
 };
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
index 2028d9fbcfe..f8f1a33b7e3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -34,12 +34,12 @@
 
 struct PA_STATE
 {
-    DRAW_CONTEXT *pDC;              // draw context
-    uint8_t* pStreamBase;           // vertex stream
-    uint32_t streamSizeInVerts;     // total size of the input stream in verts
+    DRAW_CONTEXT *pDC{ nullptr };              // draw context
+    uint8_t* pStreamBase{ nullptr };           // vertex stream
+    uint32_t streamSizeInVerts{ 0 };     // total size of the input stream in verts
 
     // The topology the binner will use. In some cases the FE changes the topology from the api state.
-    PRIMITIVE_TOPOLOGY binTopology;
+    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
 
     PA_STATE() {}
     PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
@@ -76,37 +76,37 @@ struct PA_STATE
 // cuts
 struct PA_STATE_OPT : public PA_STATE
 {
-    simdvertex leadingVertex;           // For tri-fan
-    uint32_t numPrims;              // Total number of primitives for draw.
-    uint32_t numPrimsComplete;      // Total number of complete primitives.
+    simdvertex leadingVertex;            // For tri-fan
+    uint32_t numPrims{ 0 };              // Total number of primitives for draw.
+    uint32_t numPrimsComplete{ 0 };      // Total number of complete primitives.
 
-    uint32_t numSimdPrims;          // Number of prims in current simd.
+    uint32_t numSimdPrims{ 0 };          // Number of prims in current simd.
 
-    uint32_t cur;                   // index to current VS output.
-    uint32_t prev;                  // index to prev VS output. Not really needed in the state.
-    uint32_t first;                 // index to first VS output. Used for trifan.
+    uint32_t cur{ 0 };                   // index to current VS output.
+    uint32_t prev{ 0 };                  // index to prev VS output. Not really needed in the state.
+    uint32_t first{ 0 };                 // index to first VS output. Used for trifan.
 
-    uint32_t counter;               // state counter
-    bool reset;                     // reset state
+    uint32_t counter{ 0 };               // state counter
+    bool reset{ false };                 // reset state
 
-    uint32_t primIDIncr;            // how much to increment for each vector (typically vector / {1, 2})
+    uint32_t primIDIncr{ 0 };            // how much to increment for each vector (typically vector / {1, 2})
     simdscalari primID;
 
     typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
     typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
 
-    PFN_PA_FUNC        pfnPaFunc;        // PA state machine function for assembling 4 triangles.
-    PFN_PA_SINGLE_FUNC pfnPaSingleFunc;  // PA state machine function for assembling single triangle.
-    PFN_PA_FUNC        pfnPaFuncReset;   // initial state to set on reset
+    PFN_PA_FUNC        pfnPaFunc{ nullptr };        // PA state machine function for assembling 4 triangles.
+    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
+    PFN_PA_FUNC        pfnPaFuncReset{ nullptr };   // initial state to set on reset
 
     // state used to advance the PA when Next is called
-    PFN_PA_FUNC        pfnPaNextFunc;
-    uint32_t           nextNumSimdPrims;
-    uint32_t           nextNumPrimsIncrement;
-    bool               nextReset;
-    bool               isStreaming;
+    PFN_PA_FUNC        pfnPaNextFunc{ nullptr };
+    uint32_t           nextNumSimdPrims{ 0 };
+    uint32_t           nextNumPrimsIncrement{ 0 };
+    bool               nextReset{ false };
+    bool               isStreaming{ false };
 
-    simdmask tmpIndices;             // temporary index store for unused virtual function
+    simdmask tmpIndices{ 0 };            // temporary index store for unused virtual function
     
     PA_STATE_OPT() {}
     PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
@@ -333,33 +333,33 @@ INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
 // Cut-aware primitive assembler.
 struct PA_STATE_CUT : public PA_STATE
 {
-    simdmask* pCutIndices;          // cut indices buffer, 1 bit per vertex
-    uint32_t numVerts;              // number of vertices available in buffer store
-    uint32_t numAttribs;            // number of attributes
-    int32_t numRemainingVerts;      // number of verts remaining to be assembled
-    uint32_t numVertsToAssemble;    // total number of verts to assemble for the draw
+    simdmask* pCutIndices{ nullptr };    // cut indices buffer, 1 bit per vertex
+    uint32_t numVerts{ 0 };              // number of vertices available in buffer store
+    uint32_t numAttribs{ 0 };            // number of attributes
+    int32_t numRemainingVerts{ 0 };      // number of verts remaining to be assembled
+    uint32_t numVertsToAssemble{ 0 };    // total number of verts to assemble for the draw
     OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH];    // current index buffer for gather
     simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM];           // byte offsets for currently assembling simd
-    uint32_t numPrimsAssembled;     // number of primitives that are fully assembled
-    uint32_t headVertex;            // current unused vertex slot in vertex buffer store
-    uint32_t tailVertex;            // beginning vertex currently assembling
-    uint32_t curVertex;             // current unprocessed vertex
-    uint32_t startPrimId;           // starting prim id
-    simdscalari vPrimId;            // vector of prim ID
-    bool needOffsets;               // need to compute gather offsets for current SIMD
-    uint32_t vertsPerPrim;
-    simdvertex tmpVertex;               // temporary simdvertex for unimplemented API
-    bool processCutVerts;           // vertex indices with cuts should be processed as normal, otherwise they
-                                    // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
-                                    // while the GS sends valid verts for every index 
+    uint32_t numPrimsAssembled{ 0 };     // number of primitives that are fully assembled
+    uint32_t headVertex{ 0 };            // current unused vertex slot in vertex buffer store
+    uint32_t tailVertex{ 0 };            // beginning vertex currently assembling
+    uint32_t curVertex{ 0 };             // current unprocessed vertex
+    uint32_t startPrimId{ 0 };           // starting prim id
+    simdscalari vPrimId;                 // vector of prim ID
+    bool needOffsets{ false };           // need to compute gather offsets for current SIMD
+    uint32_t vertsPerPrim{ 0 };
+    simdvertex tmpVertex;                // temporary simdvertex for unimplemented API
+    bool processCutVerts{ false };       // vertex indices with cuts should be processed as normal, otherwise they
+                                         // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
+                                         // while the GS sends valid verts for every index 
     // Topology state tracking
     uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
-    uint32_t curIndex;
-    bool reverseWinding;            // indicates reverse winding for strips
-    int32_t adjExtraVert;           // extra vert uses for tristrip w/ adj
+    uint32_t curIndex{ 0 };
+    bool reverseWinding{ false };        // indicates reverse winding for strips
+    int32_t adjExtraVert{ 0 };           // extra vert uses for tristrip w/ adj
 
     typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
-    PFN_PA_FUNC pfnPa;              // per-topology function that processes a single vert
+    PFN_PA_FUNC pfnPa{ nullptr };        // per-topology function that processes a single vert
 
     PA_STATE_CUT() {}
     PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts, 
@@ -1199,9 +1199,9 @@ struct PA_FACTORY
 
     PA_STATE_OPT paOpt;
     PA_STATE_CUT paCut;
-    bool cutPA;
+    bool cutPA{ false };
 
-    PRIMITIVE_TOPOLOGY topo;
+    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
 
     simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM];
     simdmask indexStore[MAX_NUM_VERTS_PER_PRIM];
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index a2dae46e139..553e384c9bc 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -136,14 +136,13 @@ public:
 
 private:
     Arena& mArena;
-    SWR_FORMAT mFormat;
     std::unordered_map<uint32_t, MacroTileQueue> mTiles;
 
     // Any tile that has work queued to it is a dirty tile.
     std::vector<uint32_t> mDirtyTiles;
 
-    OSALIGNLINE(LONG) mWorkItemsProduced;
-    OSALIGNLINE(volatile LONG) mWorkItemsConsumed;
+    OSALIGNLINE(LONG) mWorkItemsProduced { 0 };
+    OSALIGNLINE(volatile LONG) mWorkItemsConsumed { 0 };
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -224,7 +223,7 @@ public:
     void *operator new(size_t size);
     void operator delete (void *p);
 
-    void* mpTaskData;        // The API thread will set this up and the callback task function will interpet this.
+    void* mpTaskData{ nullptr };        // The API thread will set this up and the callback task function will interpet this.
 
     OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };
     OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 };
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
index 45b0e20a91a..e2937021cdc 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -88,7 +88,10 @@ INLINE __m128i  _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
 
 OSALIGNLINE(struct) BBOX
 {
-    int top, bottom, left, right;
+    int top{ 0 };
+    int bottom{ 0 };
+    int left{ 0 };
+    int right{ 0 };
 
     BBOX() {}
     BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
@@ -109,7 +112,10 @@ OSALIGNLINE(struct) BBOX
 
 struct simdBBox
 {
-    simdscalari top, bottom, left, right;
+    simdscalari top;
+    simdscalari bottom;
+    simdscalari left;
+    simdscalari right;
 };
 
 INLINE
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index c974a611224..c1bccab95ae 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -166,7 +166,6 @@ struct JitManager
     FunctionType* mTrinaryFPTy;
     FunctionType* mUnaryIntTy;
     FunctionType* mBinaryIntTy;
-    FunctionType* mTrinaryIntTy;
 
     Type* mSimtFP32Ty;
     Type* mSimtInt32Ty;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index b55752c1025..178f4c27c65 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -1454,6 +1454,8 @@ void __cdecl CallPrint(const char* fmt, ...)
     vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
     OutputDebugString(strBuf);
 #endif
+
+    va_end(args);
 }
 
 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
index 0f9e0ad4bd8..b323cc31982 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/Convert.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
@@ -230,7 +230,7 @@ static void ConvertPixelFromFloat(
     BYTE* pDstPixel,
     const float srcPixel[4])
 {
-    UINT outColor[4];  // typeless bits
+    uint32_t outColor[4] = { 0 };  // typeless bits
 
     // Store component
     for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
@@ -392,7 +392,7 @@ INLINE static void ConvertPixelToFloat(
     float dstPixel[4],
     const BYTE* pSrc)
 {
-    UINT srcColor[4];  // typeless bits
+    uint32_t srcColor[4];  // typeless bits
 
     // unpack src pixel
     typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc;
@@ -421,11 +421,11 @@ INLINE static void ConvertPixelToFloat(
     }
 
     // Convert components
-    for (UINT comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
+    for (uint32_t comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
     {
         SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp);
 
-        UINT src = srcColor[comp];
+        uint32_t src = srcColor[comp];
 
         switch (type)
         {
@@ -486,7 +486,7 @@ INLINE static void ConvertPixelToFloat(
         }
         case SWR_TYPE_UINT:
         {
-            UINT dst = (UINT)src;
+            uint32_t dst = (uint32_t)src;
             dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
             break;
         }

From 643857f596529f4887eaa13e5d336efd6da371ad Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Mon, 7 Mar 2016 10:51:56 -0600
Subject: [PATCH 023/238] swr: [rasterizer] remove use of FLOAT type

---
 src/gallium/drivers/swr/rasterizer/common/os.h          | 1 -
 src/gallium/drivers/swr/rasterizer/core/api.h           | 2 +-
 src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp | 6 +++---
 3 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 140d6129322..6c44d7b625c 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -81,7 +81,6 @@ typedef int				INT;
 typedef unsigned int	UINT;
 typedef uint64_t		UINT64;
 typedef void*			HANDLE;
-typedef float			FLOAT;
 typedef int			    LONG;
 typedef unsigned char   UCHAR;
 typedef unsigned int	DWORD;
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index 9c046776bb8..14ec0f98e7d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -425,7 +425,7 @@ void SWR_API SwrStoreTiles(
 void SWR_API SwrClearRenderTarget(
     HANDLE hContext,
     uint32_t clearMask,
-    const FLOAT clearColor[4],
+    const float clearColor[4],
     float z,
     BYTE stencil);
 
diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
index ad73cd840a7..0306f796e57 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
+++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
@@ -33,7 +33,7 @@
 #include "memory/tilingtraits.h"
 #include "memory/Convert.h"
 
-typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT);
+typedef void(*PFN_STORE_TILES_CLEAR)(const float*, SWR_SURFACE_STATE*, UINT, UINT);
 
 //////////////////////////////////////////////////////////////////////////
 /// Clear Raster Tile Function Tables.
@@ -104,7 +104,7 @@ struct StoreMacroTileClear
     /// @param pDstSurface - Destination surface state
     /// @param x, y - Coordinates to macro tile
     static void StoreClear(
-        const FLOAT *pColor,
+        const float *pColor,
         SWR_SURFACE_STATE* pDstSurface,
         UINT x, UINT y)
     {
@@ -112,7 +112,7 @@ struct StoreMacroTileClear
 
         BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
 
-        FLOAT srcColor[4];
+        float srcColor[4];
 
         for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
         {

From 3132f731f8c4e6300ee31805be59920543b22557 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Mon, 7 Mar 2016 14:45:17 -0600
Subject: [PATCH 024/238] swr: [rasterizer] remove use of UCHAR and UINT64
 types

---
 src/gallium/drivers/swr/rasterizer/common/os.h              | 2 --
 src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp | 6 +++---
 src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h   | 2 +-
 src/gallium/drivers/swr/rasterizer/core/threads.cpp         | 2 +-
 src/gallium/drivers/swr/rasterizer/core/threads.h           | 2 +-
 5 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 6c44d7b625c..ea942105d91 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -79,10 +79,8 @@ typedef void			VOID;
 typedef void*           LPVOID;
 typedef int				INT;
 typedef unsigned int	UINT;
-typedef uint64_t		UINT64;
 typedef void*			HANDLE;
 typedef int			    LONG;
-typedef unsigned char   UCHAR;
 typedef unsigned int	DWORD;
 
 #undef FALSE
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
index 7b40dc44d5d..c6768b4c566 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
@@ -71,7 +71,7 @@ UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc)
     return (UINT)id;
 }
 
-void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket)
+void BucketManager::PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket)
 {
     const char *arrows[] = {
         "",
@@ -90,7 +90,7 @@ void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64
     float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0);
 
     // compute average cycle count per invocation
-    UINT64 CPE = bucket.elapsed / bucket.count;
+    uint64_t CPE = bucket.elapsed / bucket.count;
 
     BUCKET_DESC &desc = mBuckets[bucket.id];
 
@@ -129,7 +129,7 @@ void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread)
 
     // compute thread level total cycle counts across all buckets from root
     const BUCKET& root = thread.root;
-    UINT64 totalCycles = 0;
+    uint64_t totalCycles = 0;
     for (const BUCKET& child : root.children)
     {
         totalCycles += child.elapsed;
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
index de4dd8e9119..9dfa7f694d0 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
@@ -211,7 +211,7 @@ public:
     }
 
 private:
-    void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket);
+    void PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket);
     void PrintThread(FILE* f, const BUCKET_THREAD& thread);
 
     // list of active threads that have registered with this manager
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 351a98be4d8..f17de8ba268 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -443,7 +443,7 @@ void WorkOnFifoBE(
     }
 }
 
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode)
 {
     // Try to grab the next DC from the ring
     uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index 0fa7196f5ac..ec0b735a4ec 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -58,6 +58,6 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 
 // Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode);
 void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, std::unordered_set<uint32_t> &usedTiles);
 void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);

From bef222db22365c2518110d30cd1227625a86195b Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Tue, 8 Mar 2016 11:56:06 -0600
Subject: [PATCH 025/238] swr: [rasterizer core] Alleviate potential stack
 overflow for 32-bit builds

Move large stack allocations in the GS and clipper into thread-local storage.
---
 .../drivers/swr/rasterizer/core/clip.cpp      |  3 +++
 .../drivers/swr/rasterizer/core/clip.h        |  6 +++--
 .../drivers/swr/rasterizer/core/frontend.cpp  | 25 ++++++++++---------
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index ce27bf71d3c..3a2a8b35be8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -31,6 +31,9 @@
 #include "common/os.h"
 #include "core/clip.h"
 
+// Temp storage used by the clipper
+THREAD simdvertex tlsTempVertices[7];
+
 float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
 {
     return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1));
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index b0b95d64f39..4f51388d9cf 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -32,6 +32,9 @@
 #include "core/pa.h"
 #include "rdtsc_core.h"
 
+// Temp storage used by the clipper
+extern THREAD simdvertex tlsTempVertices[7];
+
 enum SWR_CLIPCODES
 {
     // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
@@ -818,8 +821,7 @@ private:
     simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
     {
         // temp storage
-        simdvertex tempVertices[7];
-        float* pTempVerts = (float*)&tempVertices[0];
+        float* pTempVerts = (float*)&tlsTempVertices[0];
 
         // zero out num input verts for non-active lanes
         simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index f43a672bd82..d092a8644c6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -630,6 +630,8 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num
     }
 }
 
+THREAD SWR_GS_CONTEXT tlsGsContext;
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief Implements GS stage.
 /// @param pDC - pointer to draw context.
@@ -651,7 +653,6 @@ static void GeometryShaderStage(
 {
     RDTSC_START(FEGeometryShader);
 
-    SWR_GS_CONTEXT gsContext;
     SWR_CONTEXT* pContext = pDC->pContext;
 
     const API_STATE& state = GetApiState(pDC);
@@ -660,9 +661,9 @@ static void GeometryShaderStage(
     SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
     SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
 
-    gsContext.pStream = (uint8_t*)pGsOut;
-    gsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
-    gsContext.PrimitiveID = primID;
+    tlsGsContext.pStream = (uint8_t*)pGsOut;
+    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
+    tlsGsContext.PrimitiveID = primID;
 
     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
     simdvector attrib[MAX_ATTRIBUTES];
@@ -675,7 +676,7 @@ static void GeometryShaderStage(
 
         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
         {
-            gsContext.vert[i].attrib[attribSlot] = attrib[i];
+            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
         }
     }
     
@@ -683,7 +684,7 @@ static void GeometryShaderStage(
     pa.Assemble(VERTEX_POSITION_SLOT, attrib);
     for (uint32_t i = 0; i < numVertsPerPrim; ++i)
     {
-        gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
+        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
     }
 
     const uint32_t vertexStride = sizeof(simdvertex);
@@ -710,14 +711,14 @@ static void GeometryShaderStage(
 
     for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
     {
-        gsContext.InstanceID = instance;
-        gsContext.mask = GenerateMask(numInputPrims);
+        tlsGsContext.InstanceID = instance;
+        tlsGsContext.mask = GenerateMask(numInputPrims);
 
         // execute the geometry shader
-        state.pfnGsFunc(GetPrivateState(pDC), &gsContext);
+        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
 
-        gsContext.pStream += instanceStride;
-        gsContext.pCutOrStreamIdBuffer += cutInstanceStride;
+        tlsGsContext.pStream += instanceStride;
+        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
     }
 
     // set up new binner and state for the GS output topology
@@ -736,7 +737,7 @@ static void GeometryShaderStage(
     // foreach input prim:
     // - setup a new PA based on the emitted verts for that prim
     // - loop over the new verts, calling PA to assemble each prim
-    uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount;
+    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
     uint32_t* pPrimitiveId = (uint32_t*)&primID;
 
     uint32_t totalPrimsGenerated = 0;

From 0c18900cfb65379dea11f699bafccdd50e5c87c0 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Tue, 8 Mar 2016 18:58:54 -0600
Subject: [PATCH 026/238] swr: [rasterizer common] add _simd_s[rl]lv_epi32

---
 .../swr/rasterizer/common/simdintrin.h        | 115 ++++++++++++++++++
 1 file changed, 115 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 9ba28177257..96b7fbf8052 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -139,6 +139,117 @@ __m256 _simdemu_permute_ps(__m256 a, __m256i b)
     return result;
 }
 
+INLINE // AVX fallback for _mm256_srlv_epi32: per-lane logical right shift, counts > 31 yield 0
+__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
+{
+    uint32_t aHi, aLow, countHi, countLow; // unsigned so that >> zero-fills instead of sign-extending
+    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+    // Shift lane by lane; a C/C++ shift by >= 32 is undefined, so the count is checked explicitly.
+    aHi = _mm_extract_epi32(vAHi, 0);
+    countHi = _mm_extract_epi32(vCountHi, 0);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
+
+    aLow = _mm_extract_epi32(vALow, 0);
+    countLow = _mm_extract_epi32(vCountLow, 0);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 0);
+
+    aHi = _mm_extract_epi32(vAHi, 1);
+    countHi = _mm_extract_epi32(vCountHi, 1);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
+
+    aLow = _mm_extract_epi32(vALow, 1);
+    countLow = _mm_extract_epi32(vCountLow, 1);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 1);
+
+    aHi = _mm_extract_epi32(vAHi, 2);
+    countHi = _mm_extract_epi32(vCountHi, 2);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
+
+    aLow = _mm_extract_epi32(vALow, 2);
+    countLow = _mm_extract_epi32(vCountLow, 2);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 2);
+
+    aHi = _mm_extract_epi32(vAHi, 3);
+    countHi = _mm_extract_epi32(vCountHi, 3);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
+
+    aLow = _mm_extract_epi32(vALow, 3);
+    countLow = _mm_extract_epi32(vCountLow, 3);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 3);
+
+    __m256i ret = _mm256_set1_epi32(0);
+    ret = _mm256_insertf128_si256(ret, vAHi, 1);
+    ret = _mm256_insertf128_si256(ret, vALow, 0);
+    return ret;
+}
+
+
+INLINE // AVX fallback for _mm256_sllv_epi32: per-lane logical left shift, counts > 31 yield 0
+__m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
+{
+    uint32_t aHi, aLow, countHi, countLow; // unsigned: left-shifting a negative signed value is undefined
+    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+    // Shift lane by lane; a C/C++ shift by >= 32 is undefined, so the count is checked explicitly.
+    aHi = _mm_extract_epi32(vAHi, 0);
+    countHi = _mm_extract_epi32(vCountHi, 0);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
+
+    aLow = _mm_extract_epi32(vALow, 0);
+    countLow = _mm_extract_epi32(vCountLow, 0);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 0);
+
+    aHi = _mm_extract_epi32(vAHi, 1);
+    countHi = _mm_extract_epi32(vCountHi, 1);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
+
+    aLow = _mm_extract_epi32(vALow, 1);
+    countLow = _mm_extract_epi32(vCountLow, 1);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 1);
+
+    aHi = _mm_extract_epi32(vAHi, 2);
+    countHi = _mm_extract_epi32(vCountHi, 2);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
+
+    aLow = _mm_extract_epi32(vALow, 2);
+    countLow = _mm_extract_epi32(vCountLow, 2);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 2);
+
+    aHi = _mm_extract_epi32(vAHi, 3);
+    countHi = _mm_extract_epi32(vCountHi, 3);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
+
+    aLow = _mm_extract_epi32(vALow, 3);
+    countLow = _mm_extract_epi32(vCountLow, 3);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0;
+    vALow = _mm_insert_epi32(vALow, aLow, 3);
+
+    __m256i ret = _mm256_set1_epi32(0);
+    ret = _mm256_insertf128_si256(ret, vAHi, 1);
+    ret = _mm256_insertf128_si256(ret, vALow, 0);
+    return ret;
+}
+
 #define _simd_mul_epi32 _simdemu_mul_epi32
 #define _simd_mullo_epi32 _simdemu_mullo_epi32
 #define _simd_sub_epi32 _simdemu_sub_epi32
@@ -166,6 +277,8 @@ __m256 _simdemu_permute_ps(__m256 a, __m256i b)
 #define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
 #define _simd_movemask_epi8 _simdemu_movemask_epi8
 #define _simd_permute_ps _simdemu_permute_ps
+#define _simd_srlv_epi32 _simdemu_srlv_epi32
+#define _simd_sllv_epi32 _simdemu_sllv_epi32
 
 SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
 SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
@@ -334,6 +447,8 @@ int _simdemu_movemask_epi8(__m256i a)
 #define _simd_cmpeq_epi16  _mm256_cmpeq_epi16
 #define _simd_movemask_epi8 _mm256_movemask_epi8
 #define _simd_permute_ps _mm256_permutevar8x32_ps
+#define _simd_srlv_epi32 _mm256_srlv_epi32
+#define _simd_sllv_epi32 _mm256_sllv_epi32
 #endif
 
 #define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm))

From 61beaa22795d45f3416ecb27de54a9ee8ae1b283 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Wed, 9 Mar 2016 16:15:37 -0600
Subject: [PATCH 027/238] swr: [rasterizer core] subcontext rework

---
 .../drivers/swr/rasterizer/core/api.cpp       | 61 ++++++++-----------
 src/gallium/drivers/swr/rasterizer/core/api.h | 28 ++++++---
 .../drivers/swr/rasterizer/core/context.h     |  4 --
 .../drivers/swr/rasterizer/core/ringbuffer.h  |  2 +-
 .../drivers/swr/rasterizer/core/threads.cpp   | 20 ++++--
 .../swr/rasterizer/scripts/knob_defs.py       |  1 +
 src/gallium/drivers/swr/swr_context.cpp       |  1 -
 7 files changed, 63 insertions(+), 54 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 15dc534da72..398347654f0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -64,13 +64,6 @@ HANDLE SwrCreateContext(
     pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
     pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
 
-    pContext->numSubContexts = pCreateInfo->maxSubContexts;
-    if (pContext->numSubContexts > 1)
-    {
-        pContext->subCtxSave = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE) * pContext->numSubContexts, 64);
-        memset(pContext->subCtxSave, 0, sizeof(DRAW_STATE) * pContext->numSubContexts);
-    }
-
     for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
     {
         pContext->dcRing[dc].pArena = new Arena();
@@ -123,6 +116,8 @@ HANDLE SwrCreateContext(
     pCreateInfo->pBucketMgr = &gBucketMgr;
 #endif
 
+    pCreateInfo->contextSaveSize = sizeof(API_STATE);
+
     return (HANDLE)pContext;
 }
 
@@ -146,8 +141,6 @@ void SwrDestroyContext(HANDLE hContext)
         _aligned_free(pContext->pScratch[i]);
     }
 
-    _aligned_free(pContext->subCtxSave);
-
     delete(pContext->pHotTileMgr);
 
     pContext->~SWR_CONTEXT();
@@ -314,32 +307,6 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
     return pContext->pCurDrawContext;
 }
 
-void SWR_API SwrSetActiveSubContext(
-    HANDLE hContext,
-    uint32_t subContextIndex)
-{
-    SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
-    if (subContextIndex >= pContext->numSubContexts)
-    {
-        return;
-    }
-
-    if (subContextIndex != pContext->curSubCtxId)
-    {
-        // Save and restore draw state
-        DRAW_CONTEXT* pDC = GetDrawContext(pContext);
-        CopyState(
-            pContext->subCtxSave[pContext->curSubCtxId],
-            *(pDC->pState));
-
-        CopyState(
-            *(pDC->pState),
-            pContext->subCtxSave[subContextIndex]);
-
-        pContext->curSubCtxId = subContextIndex;
-    }
-}
-
 API_STATE* GetDrawState(SWR_CONTEXT *pContext)
 {
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
@@ -348,6 +315,30 @@ API_STATE* GetDrawState(SWR_CONTEXT *pContext)
     return &pDC->pState->state;
 }
 
+void SWR_API SwrSaveState(
+    HANDLE hContext,
+    void* pOutputStateBlock,
+    size_t memSize)
+{
+    SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+    auto pSrc = GetDrawState(pContext);
+    SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
+
+    memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
+}
+
+void SWR_API SwrRestoreState(
+    HANDLE hContext,
+    const void* pStateBlock,
+    size_t memSize)
+{
+    SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+    auto pDst = GetDrawState(pContext);
+    SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
+
+    memcpy(pDst, pStateBlock, sizeof(*pDst));
+}
+
 void SetupDefaultState(SWR_CONTEXT *pContext)
 {
     API_STATE* pState = GetDrawState(pContext);
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index 14ec0f98e7d..c7106b3b1ba 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -90,9 +90,6 @@ struct SWR_CREATECONTEXT_INFO
     // Use SwrGetPrivateContextState() to access private state.
     uint32_t privateStateSize;
 
-    // Each SWR context can have multiple sets of active state
-    uint32_t maxSubContexts;
-
     // Tile manipulation functions
     PFN_LOAD_TILE pfnLoadTile;
     PFN_STORE_TILE pfnStoreTile;
@@ -101,6 +98,9 @@ struct SWR_CREATECONTEXT_INFO
     // Pointer to rdtsc buckets mgr returned to the caller.
     // Only populated when KNOB_ENABLE_RDTSC is set
     BucketManager* pBucketMgr;
+
+    // Output: size of the memory block required by SwrSaveState / SwrRestoreState
+    size_t  contextSaveSize;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -127,12 +127,24 @@ void SWR_API SwrDestroyContext(
     HANDLE hContext);
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief Set currently active state context
-/// @param subContextIndex - value from 0 to
-///     SWR_CREATECONTEXT_INFO.maxSubContexts.  Defaults to 0.
-void SWR_API SwrSetActiveSubContext(
+/// @brief Saves API state associated with hContext
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pOutputStateBlock - Memory block to receive API state data
+/// @param memSize - Size of memory pointed to by pOutputStateBlock
+void SWR_API SwrSaveState(
     HANDLE hContext,
-    uint32_t subContextIndex);
+    void* pOutputStateBlock,
+    size_t memSize);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Restores API state to hContext previously saved with SwrSaveState
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pStateBlock - Memory block to read API state data from
+/// @param memSize - Size of memory pointed to by pStateBlock
+void SWR_API SwrRestoreState(
+    HANDLE hContext,
+    const void* pStateBlock,
+    size_t memSize);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Sync cmd. Executes the callback func when all rendering up to this sync
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 523e7ac87ff..a17276d1366 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -449,10 +449,6 @@ struct SWR_CONTEXT
 
     uint32_t curStateId;               // Current index to the next available entry in the DS ring.
 
-    DRAW_STATE*   subCtxSave;          // Save area for inactive contexts.
-    uint32_t      curSubCtxId;         // Current index for active state subcontext.
-    uint32_t      numSubContexts;      // Number of available subcontexts
-
     uint32_t NumWorkerThreads;
 
     THREAD_POOL threadPool; // Thread pool associated with this context
diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
index e323136bc41..7ff109d4fe8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
+++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
@@ -93,7 +93,7 @@ public:
     INLINE volatile uint64_t GetTail() { return mRingTail; }
     INLINE volatile uint64_t GetHead() { return mRingHead; }
 
-private:
+protected:
     T* mpRingBuffer;
     uint32_t mNumEntries;
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index f17de8ba268..c4567eaee87 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -688,9 +688,12 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
             numThreads, KNOB_MAX_NUM_THREADS);
     }
 
+    uint32_t numAPIReservedThreads = 1;
+
+
     if (numThreads == 1)
     {
-        // If only 1 worker thread, try to move it to an available
+        // If only 1 worker thread, try to move it to an available
         // HW thread.  If that fails, use the API thread.
         if (numCoresPerNode < numHWCoresPerNode)
         {
@@ -713,8 +716,15 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     }
     else
     {
-        // Save a HW thread for the API thread.
-        numThreads--;
+        // Save HW threads for the API if we can
+        if (numThreads > numAPIReservedThreads)
+        {
+            numThreads -= numAPIReservedThreads;
+        }
+        else
+        {
+            numAPIReservedThreads = 0;
+        }
     }
 
     pPool->numThreads = numThreads;
@@ -753,9 +763,9 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
                 auto& core = node.cores[c];
                 for (uint32_t t = 0; t < numHyperThreads; ++t)
                 {
-                    if (c == 0 && n == 0 && t == 0)
+                    if (numAPIReservedThreads)
                     {
-                        // Skip core 0, thread0  on node 0 to reserve for API thread
+                        --numAPIReservedThreads;
                         continue;
                     }
 
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 47ded8237cf..a137f7518bc 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -21,6 +21,7 @@
 
 # Python source
 KNOBS = [
+
     ['ENABLE_ASSERT_DIALOGS', {
         'type'      : 'bool',
         'default'   : 'true',
diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp
index 78b8fdf619b..46c79a14b2f 100644
--- a/src/gallium/drivers/swr/swr_context.cpp
+++ b/src/gallium/drivers/swr/swr_context.cpp
@@ -338,7 +338,6 @@ swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags)
    SWR_CREATECONTEXT_INFO createInfo;
    createInfo.driver = GL;
    createInfo.privateStateSize = sizeof(swr_draw_context);
-   createInfo.maxSubContexts = 0;
    createInfo.pfnLoadTile = swr_LoadHotTile;
    createInfo.pfnStoreTile = swr_StoreHotTile;
    createInfo.pfnClearTile = swr_StoreHotTileClear;

From 51a11658d9aa5b77cef502d128b7889b6952d4f6 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Wed, 9 Mar 2016 16:33:33 -0600
Subject: [PATCH 028/238] swr: [rasterizer] remove unused knob

---
 src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index a137f7518bc..cf4af71811d 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -22,14 +22,6 @@
 # Python source
 KNOBS = [
 
-    ['ENABLE_ASSERT_DIALOGS', {
-        'type'      : 'bool',
-        'default'   : 'true',
-        'desc'      : ['Use dialogs when asserts fire.',
-                       'Asserts are only enabled in debug builds'],
-        'category'  : 'debug',
-    }],
-
     ['SINGLE_THREADED', {
         'type'      : 'bool',
         'default'   : 'false',

From be4c558d0167dd9d593a9adb44dad53b020817f7 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Wed, 9 Mar 2016 17:18:55 -0600
Subject: [PATCH 029/238] swr: [rasterizer core] Fix crash that can occur when
 switching contexts

---
 .../drivers/swr/rasterizer/core/backend.cpp    |  3 ++-
 .../drivers/swr/rasterizer/core/tilemgr.cpp    | 18 ++++++++++++++++++
 .../drivers/swr/rasterizer/core/tilemgr.h      |  2 ++
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index aae1eac45a7..195ac9420e4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -408,9 +408,10 @@ void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t mac
     {
         if (pDesc->attachmentMask & (1 << i))
         {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false);
+            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i);
             if (pHotTile)
             {
+                SWR_ASSERT(pHotTile->state == HOTTILE_INVALID || pHotTile->state == HOTTILE_RESOLVED);
                 pHotTile->state = HOTTILE_INVALID;
             }
         }
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 54a5078ba90..ced904e7119 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -186,6 +186,24 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
     return &tile.Attachment[attachment];
 }
 
+HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment)
+{
+    uint32_t x, y;
+    MacroTileMgr::getTileIndices(macroID, x, y);
+
+    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
+    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+
+    HotTileSet &tile = mHotTiles[x][y];
+    HOTTILE& hotTile = tile.Attachment[attachment];
+    if (hotTile.pBuffer == NULL)
+    {
+        return NULL;
+    }
+
+    return &hotTile;
+}
+
 void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
 {
     // Load clear color into SIMD register...
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index 553e384c9bc..90337a21721 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -297,6 +297,8 @@ public:
     HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
         uint32_t renderTargetArrayIndex = 0);
 
+    HOTTILE *GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment);
+
     static void ClearColorHotTile(const HOTTILE* pHotTile);
     static void ClearDepthHotTile(const HOTTILE* pHotTile);
     static void ClearStencilHotTile(const HOTTILE* pHotTile);

From 542d7dec7b8748b164150bd0818e880ed31918e3 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 10 Mar 2016 15:15:40 -0600
Subject: [PATCH 030/238] swr: [rasterizer] remove use of BYTE type

---
 .../drivers/swr/rasterizer/common/os.h        | 11 +-----
 .../drivers/swr/rasterizer/core/api.cpp       |  2 +-
 src/gallium/drivers/swr/rasterizer/core/api.h |  6 +--
 .../drivers/swr/rasterizer/core/backend.cpp   | 12 +++---
 .../drivers/swr/rasterizer/core/context.h     |  2 +-
 .../swr/rasterizer/core/depthstencil.h        |  6 +--
 .../swr/rasterizer/core/format_conversion.h   |  4 +-
 .../swr/rasterizer/core/format_types.h        | 32 ++++++++--------
 .../drivers/swr/rasterizer/core/frontend.cpp  |  4 +-
 .../drivers/swr/rasterizer/core/state.h       |  2 +-
 .../drivers/swr/rasterizer/core/tilemgr.cpp   |  6 +--
 .../drivers/swr/rasterizer/core/tilemgr.h     |  2 +-
 .../drivers/swr/rasterizer/core/utils.h       | 38 +++++++++----------
 .../swr/rasterizer/memory/ClearTile.cpp       |  8 ++--
 .../drivers/swr/rasterizer/memory/Convert.h   |  4 +-
 15 files changed, 65 insertions(+), 74 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index ea942105d91..d84c0719eec 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -68,7 +68,6 @@
 
 #include <stdlib.h>
 #include <string.h>
-#include <X11/Xmd.h>
 #include <x86intrin.h>
 #include <stdint.h>
 #include <sys/types.h>
@@ -171,14 +170,6 @@ unsigned char _bittest(const LONG *a, LONG b)
 
 #define CreateDirectory(name, pSecurity) mkdir(name, 0777)
 
-#if defined(_WIN32)
-static inline
-unsigned int _mm_popcnt_u32(unsigned int v)
-{
-    return __builtin_popcount(v);
-}
-#endif
-
 #define _aligned_free free
 #define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange)
 #define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
@@ -198,7 +189,7 @@ unsigned int _mm_popcnt_u32(unsigned int v)
 #endif
 
 // Universal types
-typedef BYTE        KILOBYTE[1024];
+typedef uint8_t     KILOBYTE[1024];
 typedef KILOBYTE    MEGABYTE[1024];
 typedef MEGABYTE    GIGABYTE[1024];
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 398347654f0..e2ea5d934d2 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -1351,7 +1351,7 @@ void SwrClearRenderTarget(
     uint32_t clearMask,
     const float clearColor[4],
     float z,
-    BYTE stencil)
+    uint8_t stencil)
 {
     RDTSC_START(APIClearRenderTarget);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index c7106b3b1ba..30bafd70c2f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -53,7 +53,7 @@ typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t
 /// @param pDstHotTile - pointer to the hot tile surface
 typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile);
+    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pDstHotTile);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Function signature for store hot tiles
@@ -65,7 +65,7 @@ typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstForma
 /// @param pSrcHotTile - pointer to the hot tile surface
 typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile);
+    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pSrcHotTile);
 
 /// @brief Function signature for clearing from the hot tiles clear value
 /// @param hPrivateContext - handle to private data
@@ -439,7 +439,7 @@ void SWR_API SwrClearRenderTarget(
     uint32_t clearMask,
     const float clearColor[4],
     float z,
-    BYTE stencil);
+    uint8_t stencil);
 
 void SWR_API SwrSetRastState(
     HANDLE hContext,
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 195ac9420e4..2ca549a2a81 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -156,7 +156,7 @@ void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTil
 }
 
 template<SWR_FORMAT format>
-void ClearRasterTile(BYTE *pTileBuffer, simdvector &value)
+void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
 {
     auto lambda = [&](int comp)
     {
@@ -299,10 +299,10 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
             /// @todo clear data should come in as RGBA32_FLOAT
             DWORD clearData[4];
             float clearFloat[4];
-            clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f;
-            clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f;
-            clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f;
-            clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f;
+            clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
+            clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
+            clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
+            clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
             clearData[0] = *(DWORD*)&clearFloat[0];
             clearData[1] = *(DWORD*)&clearFloat[1];
             clearData[2] = *(DWORD*)&clearFloat[2];
@@ -1428,7 +1428,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
 
     coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
 
-    BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
+    uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
 
     RDTSC_STOP(BESetup, 0, 0);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index a17276d1366..18c869f176b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -111,7 +111,7 @@ struct CLEAR_DESC
     CLEAR_FLAGS flags;
     float clearRTColor[4];  // RGBA_32F
     float clearDepth;   // [0..1]
-    BYTE clearStencil;
+    uint8_t clearStencil;
 };
 
 struct INVALIDATE_TILES_DESC
diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
index 4f245c8c53e..2cc9d4054ac 100644
--- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
+++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
@@ -82,7 +82,7 @@ void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simds
 
 INLINE
 simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
-                 bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar coverageMask, BYTE *pStencilBase,
+                 bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase,
                  simdscalar* pStencilMask)
 {
     static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
@@ -177,8 +177,8 @@ simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENC
 
 INLINE
 void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
-        bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, 
-        BYTE *pStencilBase, const simdscalar& stencilMask)
+        bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, 
+        uint8_t *pStencilBase, const simdscalar& stencilMask)
 {
     if (pDSState->depthWriteEnable)
     {
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
index 83d85fc86d8..344758eefe5 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
@@ -34,7 +34,7 @@
 /// @param pSrc - source data in SOA form
 /// @param dst - output data in SOA form
 template<SWR_FORMAT SrcFormat>
-INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst)
+INLINE void LoadSOA(const uint8_t *pSrc, simdvector &dst)
 {
     // fast path for float32
     if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
@@ -141,7 +141,7 @@ INLINE simdscalar Normalize(simdscalar vComp, uint32_t Component)
 /// @param src - source data in SOA form
 /// @param dst - output data in SOA form
 template<SWR_FORMAT DstFormat>
-INLINE void StoreSOA(const simdvector &src, BYTE *pDst)
+INLINE void StoreSOA(const simdvector &src, uint8_t *pDst)
 {
     // fast path for float32
     if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
index aa350259a15..9acf846a7f0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_types.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -34,8 +34,8 @@ template <uint32_t NumBits, bool Signed = false>
 struct PackTraits
 {
     static const uint32_t MyNumBits = NumBits;
-    static simdscalar loadSOA(const BYTE *pSrc) = delete;
-    static void storeSOA(BYTE *pDst, simdscalar src) = delete;
+    static simdscalar loadSOA(const uint8_t *pSrc) = delete;
+    static void storeSOA(uint8_t *pDst, simdscalar src) = delete;
     static simdscalar unpack(simdscalar &in) = delete;
     static simdscalar pack(simdscalar &in) = delete;
 };
@@ -48,8 +48,8 @@ struct PackTraits<0, false>
 {
     static const uint32_t MyNumBits = 0;
 
-    static simdscalar loadSOA(const BYTE *pSrc) { return _simd_setzero_ps(); }
-    static void storeSOA(BYTE *pDst, simdscalar src) { return; }
+    static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
+    static void storeSOA(uint8_t *pDst, simdscalar src) { return; }
     static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
     static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
 };
@@ -63,7 +63,7 @@ struct PackTraits<8, false>
 {
     static const uint32_t MyNumBits = 8;
 
-    static simdscalar loadSOA(const BYTE *pSrc)
+    static simdscalar loadSOA(const uint8_t *pSrc)
     {
 #if KNOB_SIMD_WIDTH == 8
         __m256 result = _mm256_setzero_ps();
@@ -74,7 +74,7 @@ struct PackTraits<8, false>
 #endif
     }
 
-    static void storeSOA(BYTE *pDst, simdscalar src)
+    static void storeSOA(uint8_t *pDst, simdscalar src)
     {
         // store simd bytes
 #if KNOB_SIMD_WIDTH == 8
@@ -125,7 +125,7 @@ struct PackTraits<8, true>
 {
     static const uint32_t MyNumBits = 8;
 
-    static simdscalar loadSOA(const BYTE *pSrc)
+    static simdscalar loadSOA(const uint8_t *pSrc)
     {
 #if KNOB_SIMD_WIDTH == 8
         __m256 result = _mm256_setzero_ps();
@@ -136,7 +136,7 @@ struct PackTraits<8, true>
 #endif
     }
 
-    static void storeSOA(BYTE *pDst, simdscalar src)
+    static void storeSOA(uint8_t *pDst, simdscalar src)
     {
         // store simd bytes
 #if KNOB_SIMD_WIDTH == 8
@@ -188,7 +188,7 @@ struct PackTraits<16, false>
 {
     static const uint32_t MyNumBits = 16;
 
-    static simdscalar loadSOA(const BYTE *pSrc)
+    static simdscalar loadSOA(const uint8_t *pSrc)
     {
 #if KNOB_SIMD_WIDTH == 8
         __m256 result = _mm256_setzero_ps();
@@ -199,7 +199,7 @@ struct PackTraits<16, false>
 #endif
     }
 
-    static void storeSOA(BYTE *pDst, simdscalar src)
+    static void storeSOA(uint8_t *pDst, simdscalar src)
     {
 #if KNOB_SIMD_WIDTH == 8
         // store 16B (2B * 8)
@@ -249,7 +249,7 @@ struct PackTraits<16, true>
 {
     static const uint32_t MyNumBits = 16;
 
-    static simdscalar loadSOA(const BYTE *pSrc)
+    static simdscalar loadSOA(const uint8_t *pSrc)
     {
 #if KNOB_SIMD_WIDTH == 8
         __m256 result = _mm256_setzero_ps();
@@ -260,7 +260,7 @@ struct PackTraits<16, true>
 #endif
     }
 
-    static void storeSOA(BYTE *pDst, simdscalar src)
+    static void storeSOA(uint8_t *pDst, simdscalar src)
     {
 #if KNOB_SIMD_WIDTH == 8
         // store 16B (2B * 8)
@@ -311,8 +311,8 @@ struct PackTraits<32, false>
 {
     static const uint32_t MyNumBits = 32;
 
-    static simdscalar loadSOA(const BYTE *pSrc) { return _simd_load_ps((const float*)pSrc); }
-    static void storeSOA(BYTE *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
+    static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
+    static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
     static simdscalar unpack(simdscalar &in) { return in; }
     static simdscalar pack(simdscalar &in) { return in; }
 };
@@ -984,7 +984,7 @@ struct ComponentTraits
         return TypeTraits<X, NumBitsX>::fromFloat();
     }
 
-    INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc)
+    INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
     {
         switch (comp)
         {
@@ -1001,7 +1001,7 @@ struct ComponentTraits
         return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
     }
 
-    INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src)
+    INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src)
     {
         switch (comp)
         {
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index d092a8644c6..44966a9e9a0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -1187,7 +1187,7 @@ void ProcessDraw(
 
         // if the entire index buffer isn't being consumed, set the last index
         // so that fetches < a SIMD wide will be masked off
-        fetchInfo.pLastIndex = (const int32_t*)(((BYTE*)state.indexBuffer.pIndices) + state.indexBuffer.size);
+        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
         if (pLastRequestedIndex < fetchInfo.pLastIndex)
         {
             fetchInfo.pLastIndex = pLastRequestedIndex;
@@ -1363,7 +1363,7 @@ void ProcessDraw(
             i += KNOB_SIMD_WIDTH;
             if (IsIndexedT)
             {
-                fetchInfo.pIndices = (int*)((BYTE*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
+                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
             }
             else
             {
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index a71eb6d7853..5752094ca10 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -789,7 +789,7 @@ typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsConte
 typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext);
 typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
 typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
-typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
+typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*);
 
 //////////////////////////////////////////////////////////////////////////
 /// FRONTEND_STATE
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index ced904e7119..f26903e2608 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -122,7 +122,7 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
         if (create)
         {
             uint32_t size = numSamples * mHotTileSize[attachment];
-            hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+            hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
             hotTile.state = HOTTILE_INVALID;
             hotTile.numSamples = numSamples;
             hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
@@ -145,7 +145,7 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
             _aligned_free(hotTile.pBuffer);
 
             uint32_t size = numSamples * mHotTileSize[attachment];
-            hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+            hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
             hotTile.state = HOTTILE_INVALID;
             hotTile.numSamples = numSamples;
         }
@@ -370,4 +370,4 @@ void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, ui
             RDTSC_STOP(BELoadTiles, 0, 0);
         }
     }
-}
\ No newline at end of file
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index 90337a21721..22cce0381bc 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -240,7 +240,7 @@ enum HOTTILE_STATE
 
 struct HOTTILE
 {
-    BYTE *pBuffer;
+    uint8_t *pBuffer;
     HOTTILE_STATE state;
     DWORD clearData[4];                 // May need to change based on pfnClearTile implementation.  Reorder for alignment?
     uint32_t numSamples;
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
index e2937021cdc..60a3a6af19e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -276,7 +276,7 @@ struct TransposeSingleComponent
     /// @brief Pass-thru for single component.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
         memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
     }
@@ -291,7 +291,7 @@ struct Transpose8_8_8_8
     /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
 #if KNOB_SIMD_WIDTH == 8
@@ -330,7 +330,7 @@ struct Transpose8_8_8
     /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -342,7 +342,7 @@ struct Transpose8_8
     /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
 
@@ -366,7 +366,7 @@ struct Transpose32_32_32_32
     /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
 #if KNOB_SIMD_WIDTH == 8
         simdscalar src0 = _simd_load_ps((const float*)pSrc);
@@ -399,7 +399,7 @@ struct Transpose32_32_32
     /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
 #if KNOB_SIMD_WIDTH == 8
         simdscalar src0 = _simd_load_ps((const float*)pSrc);
@@ -431,7 +431,7 @@ struct Transpose32_32
     /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
         const float* pfSrc = (const float*)pSrc;
         __m128 src_r0 = _mm_load_ps(pfSrc + 0);
@@ -461,7 +461,7 @@ struct Transpose16_16_16_16
     /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
 #if KNOB_SIMD_WIDTH == 8
         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
@@ -501,7 +501,7 @@ struct Transpose16_16_16
     /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
 #if KNOB_SIMD_WIDTH == 8
         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
@@ -540,7 +540,7 @@ struct Transpose16_16
     /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
         simdscalar src = _simd_load_ps((const float*)pSrc);
 
@@ -571,7 +571,7 @@ struct Transpose24_8
     /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -583,7 +583,7 @@ struct Transpose32_8_24
     /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 
@@ -597,7 +597,7 @@ struct Transpose4_4_4_4
     /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -609,7 +609,7 @@ struct Transpose5_6_5
     /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -621,7 +621,7 @@ struct Transpose9_9_9_5
     /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -633,7 +633,7 @@ struct Transpose5_5_5_1
     /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -645,7 +645,7 @@ struct Transpose10_10_10_2
     /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -657,7 +657,7 @@ struct Transpose11_11_10
     /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 // helper function to unroll loops
@@ -699,7 +699,7 @@ uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
     }
 #endif
 
-    BYTE* pRemainderBytes = (BYTE*)pDataWords;
+    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
     for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
     {
         crc = _mm_crc32_u8(crc, *pRemainderBytes++);
diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
index 0306f796e57..d001cb6b5cb 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
+++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
@@ -54,17 +54,17 @@ struct StoreRasterTileClear
     /// @param pDstSurface - Destination surface state
     /// @param x, y - Coordinates to raster tile.
     INLINE static void StoreClear(
-        const BYTE* dstFormattedColor,
+        const uint8_t* dstFormattedColor,
         UINT dstBytesPerPixel,
         SWR_SURFACE_STATE* pDstSurface,
         UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile.
     {
         // Compute destination address for raster tile.
-        BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress +
+        uint8_t* pDstTile = (uint8_t*)pDstSurface->pBaseAddress +
             (y * pDstSurface->pitch) + (x * dstBytesPerPixel);
 
         // start of first row
-        BYTE* pDst = pDstTile;
+        uint8_t* pDst = pDstTile;
         UINT dstBytesPerRow = 0;
 
         // For each raster tile pixel in row 0 (rx, 0)
@@ -110,7 +110,7 @@ struct StoreMacroTileClear
     {
         UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8);
 
-        BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
+        uint8_t dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
 
         float srcColor[4];
 
diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
index b323cc31982..7c185e5e454 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/Convert.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
@@ -227,7 +227,7 @@ static uint16_t Convert32To16Float(float val)
 /// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest).
 template<SWR_FORMAT DstFormat>
 static void ConvertPixelFromFloat(
-    BYTE* pDstPixel,
+    uint8_t* pDstPixel,
     const float srcPixel[4])
 {
     uint32_t outColor[4] = { 0 };  // typeless bits
@@ -390,7 +390,7 @@ static void ConvertPixelFromFloat(
 template<SWR_FORMAT SrcFormat>
 INLINE static void ConvertPixelToFloat(
     float dstPixel[4],
-    const BYTE* pSrc)
+    const uint8_t* pSrc)
 {
     uint32_t srcColor[4];  // typeless bits
 

From e374d2d24b0d755c9380da0eb33e4151b1ad145f Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 10 Mar 2016 18:30:40 -0600
Subject: [PATCH 031/238] swr: [rasterizer] Discard work + misc fixes

---
 .../drivers/swr/rasterizer/core/api.cpp       | 40 +++++++++++++--
 src/gallium/drivers/swr/rasterizer/core/api.h | 10 ++++
 .../drivers/swr/rasterizer/core/backend.cpp   | 12 +++--
 .../drivers/swr/rasterizer/core/backend.h     |  2 +-
 .../drivers/swr/rasterizer/core/context.h     | 12 +++--
 .../drivers/swr/rasterizer/core/frontend.cpp  | 50 +++++++++++++++----
 .../drivers/swr/rasterizer/core/frontend.h    |  2 +-
 .../drivers/swr/rasterizer/core/tilemgr.cpp   | 17 ++++++-
 .../drivers/swr/rasterizer/core/tilemgr.h     |  2 +-
 9 files changed, 119 insertions(+), 28 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index e2ea5d934d2..c3c603d294c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -1265,7 +1265,10 @@ void SwrDrawIndexedInstanced(
     DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
 }
 
-// Attach surfaces to pipeline
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrInvalidateTiles
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
 void SwrInvalidateTiles(
     HANDLE hContext,
     uint32_t attachmentMask)
@@ -1273,10 +1276,39 @@ void SwrInvalidateTiles(
     SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
+    pDC->FeWork.type = DISCARDINVALIDATETILES;
+    pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
+    pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
+    memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
+    pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
+    pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
+    pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
+
+    //enqueue
+    QueueDraw(pContext);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDiscardRect
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
+/// @param rect - if rect is all zeros, the entire attachment surface will be discarded
+void SwrDiscardRect(
+    HANDLE hContext,
+    uint32_t attachmentMask,
+    SWR_RECT rect)
+{
+    SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
     // Queue a load to the hottile
-    pDC->FeWork.type = INVALIDATETILES;
-    pDC->FeWork.pfnWork = ProcessInvalidateTiles;
-    pDC->FeWork.desc.invalidateTiles.attachmentMask = attachmentMask;
+    pDC->FeWork.type = DISCARDINVALIDATETILES;
+    pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
+    pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
+    pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
+    pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
+    pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
+    pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
 
     //enqueue
     QueueDraw(pContext);
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index 30bafd70c2f..90c2f038c46 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -408,6 +408,16 @@ void SWR_API SwrInvalidateTiles(
     HANDLE hContext,
     uint32_t attachmentMask);
 
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDiscardRect
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
+/// @param rect - if rect is all zeros, the entire attachment surface will be discarded
+void SWR_API SwrDiscardRect(
+    HANDLE hContext,
+    uint32_t attachmentMask,
+    SWR_RECT rect);
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief SwrDispatch
 /// @param hContext - Handle passed back from SwrCreateContext
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 2ca549a2a81..7afbb70a383 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -399,20 +399,22 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
 }
 
 
-void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
+void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
 {
-    INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData;
+    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
     SWR_CONTEXT *pContext = pDC->pContext;
 
+    const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
+
     for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
     {
         if (pDesc->attachmentMask & (1 << i))
         {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i);
+            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
+                pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
             if (pHotTile)
             {
-                SWR_ASSERT(pHotTile->state == HOTTILE_INVALID || pHotTile->state == HOTTILE_RESOLVED);
-                pHotTile->state = HOTTILE_INVALID;
+                pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
             }
         }
     }
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 91b8cccf3ac..2fa18953cad 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -37,7 +37,7 @@ void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
-void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
+void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
 void InitClearTilesTable();
 simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 18c869f176b..ed972fa5478 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -114,9 +114,13 @@ struct CLEAR_DESC
     uint8_t clearStencil;
 };
 
-struct INVALIDATE_TILES_DESC
+struct DISCARD_INVALIDATE_TILES_DESC
 {
     uint32_t attachmentMask;
+    SWR_RECT rect;
+    SWR_TILE_STATE newTileState;
+    bool createNewTiles;
+    bool fullTilesOnly;
 };
 
 struct SYNC_DESC
@@ -152,7 +156,7 @@ enum WORK_TYPE
     SYNC,
     DRAW,
     CLEAR,
-    INVALIDATETILES,
+    DISCARDINVALIDATETILES,
     STORETILES,
     QUERYSTATS,
 };
@@ -166,7 +170,7 @@ struct BE_WORK
         SYNC_DESC sync;
         TRIANGLE_WORK_DESC tri;
         CLEAR_DESC clear;
-        INVALIDATE_TILES_DESC invalidateTiles;
+        DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
         STORE_TILES_DESC storeTiles;
         QUERY_DESC queryStats;
     } desc;
@@ -203,7 +207,7 @@ struct FE_WORK
         SYNC_DESC sync;
         DRAW_WORK draw;
         CLEAR_DESC clear;
-        INVALIDATE_TILES_DESC invalidateTiles;
+        DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
         STORE_TILES_DESC storeTiles;
         QUERY_DESC queryStats;
     } desc;
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 44966a9e9a0..6db36395c86 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -193,35 +193,65 @@ void ProcessStoreTiles(
 /// @param workerId - thread's worker id. Even thread has a unique id.
 /// @param pUserData - Pointer to user data passed back to callback.
 /// @todo This should go away when we switch this to use compute threading.
-void ProcessInvalidateTiles(
+void ProcessDiscardInvalidateTiles(
     SWR_CONTEXT *pContext,
     DRAW_CONTEXT *pDC,
     uint32_t workerId,
     void *pUserData)
 {
     RDTSC_START(FEProcessInvalidateTiles);
-    INVALIDATE_TILES_DESC *pInv = (INVALIDATE_TILES_DESC*)pUserData;
+    DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 
-    const API_STATE& state = GetApiState(pDC);
+    SWR_RECT rect;
+
+    if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
+    {
+        // Valid rect
+        rect = pInv->rect;
+    }
+    else
+    {
+        // Use viewport dimensions
+        const API_STATE& state = GetApiState(pDC);
+
+        rect.left   = (uint32_t)state.vp[0].x;
+        rect.right  = (uint32_t)(state.vp[0].x + state.vp[0].width);
+        rect.top    = (uint32_t)state.vp[0].y;
+        rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
+    }
 
     // queue a store to each macro tile
     // compute macro tile bounds for the current render target
     uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
     uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
 
-    uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
-    uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
+    // Setup region assuming full tiles
+    uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
+    uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;
+
+    uint32_t macroTileEndX = rect.right / macroWidth;
+    uint32_t macroTileEndY = rect.bottom / macroHeight;
+
+    if (pInv->fullTilesOnly == false)
+    {
+        // include partial tiles
+        macroTileStartX = rect.left / macroWidth;
+        macroTileStartY = rect.top / macroHeight;
+
+        macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
+        macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
+    }
 
     // load tiles
     BE_WORK work;
-    work.type = INVALIDATETILES;
-    work.pfnWork = ProcessInvalidateTilesBE;
-    work.desc.invalidateTiles = *pInv;
+    work.type = DISCARDINVALIDATETILES;
+    work.pfnWork = ProcessDiscardInvalidateTilesBE;
+    work.desc.discardInvalidateTiles = *pInv;
 
-    for (uint32_t x = 0; x < numMacroTilesX; ++x)
+    for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
     {
-        for (uint32_t y = 0; y < numMacroTilesY; ++y)
+        for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
         {
             pTileMgr->enqueue(x, y, &work);
         }
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
index acb935fc251..9a2f0434db5 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -316,7 +316,7 @@ void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, vo
 
 void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
 void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
-void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
 void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
 void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index f26903e2608..09cc23e5db7 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -186,7 +186,9 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
     return &tile.Attachment[attachment];
 }
 
-HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment)
+HOTTILE* HotTileMgr::GetHotTileNoLoad(
+    SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID,
+    SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples)
 {
     uint32_t x, y;
     MacroTileMgr::getTileIndices(macroID, x, y);
@@ -198,7 +200,18 @@ HOTTILE* HotTileMgr::GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC,
     HOTTILE& hotTile = tile.Attachment[attachment];
     if (hotTile.pBuffer == NULL)
     {
-        return NULL;
+        if (create)
+        {
+            uint32_t size = numSamples * mHotTileSize[attachment];
+            hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+            hotTile.state = HOTTILE_INVALID;
+            hotTile.numSamples = numSamples;
+            hotTile.renderTargetArrayIndex = 0;
+        }
+        else
+        {
+            return NULL;
+        }
     }
 
     return &hotTile;
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index 22cce0381bc..30f80ce4247 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -297,7 +297,7 @@ public:
     HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
         uint32_t renderTargetArrayIndex = 0);
 
-    HOTTILE *GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment);
+    HOTTILE *GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1);
 
     static void ClearColorHotTile(const HOTTILE* pHotTile);
     static void ClearDepthHotTile(const HOTTILE* pHotTile);

From fee56fda6fd78f7fb10b0e8fced0a604ca43f0c0 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 10 Mar 2016 19:19:30 -0600
Subject: [PATCH 032/238] swr: [rasterizer] Stop setting viewport size to
 larger than hottile array

Guard against enqueuing work to invalid tiles
---
 src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 6db36395c86..27afc9640c7 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -243,6 +243,12 @@ void ProcessDiscardInvalidateTiles(
         macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
     }
 
+    SWR_ASSERT(macroTileEndX < KNOB_NUM_HOT_TILES_X);
+    SWR_ASSERT(macroTileEndY < KNOB_NUM_HOT_TILES_Y);
+
+    macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
+    macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);
+
     // load tiles
     BE_WORK work;
     work.type = DISCARDINVALIDATETILES;

From c75314ec67f011599d8e84e6eaef897911d9e892 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 10 Mar 2016 19:20:07 -0600
Subject: [PATCH 033/238] swr: [rasterizer core] Guard against enqueuing work to
 invalid hot tiles

---
 src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 09cc23e5db7..ac2117bf4a4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -74,6 +74,11 @@ void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
 
+    if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1)))
+    {
+        return;
+    }
+
     uint32_t id = TILE_ID(x, y);
 
     MacroTileQueue &tile = mTiles[id];

From e1222ade0039289993fbec261408eea5e0d7d9ae Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Mon, 14 Mar 2016 15:54:29 -0600
Subject: [PATCH 034/238] swr: [rasterizer] code styling and update copyrights

---
 .../swr/rasterizer/common/containers.hpp      |  12 +-
 .../drivers/swr/rasterizer/common/os.h        |  16 +-
 .../swr/rasterizer/common/simdintrin.h        | 556 +++++++++---------
 .../drivers/swr/rasterizer/core/backend.cpp   |   2 +-
 .../drivers/swr/rasterizer/core/frontend.h    |   2 +-
 .../jitter/scripts/gen_llvm_ir_macros.py      |  20 +-
 .../jitter/scripts/gen_llvm_types.py          |   2 +-
 .../swr/rasterizer/scripts/gen_knobs.py       |   2 +-
 .../swr/rasterizer/scripts/knob_defs.py       |   2 +-
 .../scripts/templates/knobs.template          |   2 +-
 10 files changed, 308 insertions(+), 308 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp
index 95af4387fcb..f3c05979144 100644
--- a/src/gallium/drivers/swr/rasterizer/common/containers.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp
@@ -68,10 +68,10 @@ struct UncheckedFixedVector
         return *this;
     }
 
-    T* begin()	{ return &this->mElements[0]; }
-    T* end()	{ return &this->mElements[0] + this->mSize; }
-    T const* begin() const	{ return &this->mElements[0]; }
-    T const* end() const	{ return &this->mElements[0] + this->mSize; }
+    T* begin()  { return &this->mElements[0]; }
+    T* end()    { return &this->mElements[0] + this->mSize; }
+    T const* begin() const  { return &this->mElements[0]; }
+    T const* end() const    { return &this->mElements[0] + this->mSize; }
 
     friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
     {
@@ -103,7 +103,7 @@ struct UncheckedFixedVector
     }
     void push_back(T const& t)
     {
-        this->mElements[this->mSize]	= t;
+        this->mElements[this->mSize]    = t;
         ++this->mSize;
     }
     void pop_back()
@@ -136,7 +136,7 @@ struct UncheckedFixedVector
         this->resize(0);
     }
 private:
-    std::size_t	mSize{ 0 };
+    std::size_t    mSize{ 0 };
     T mElements[NUM_ELEMENTS];
 };
 
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index d84c0719eec..a1698644eb0 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -47,8 +47,8 @@
 #define DEBUGBREAK __debugbreak()
 
 #define PRAGMA_WARNING_PUSH_DISABLE(...) \
-	__pragma(warning(push));\
-	__pragma(warning(disable:__VA_ARGS__));
+    __pragma(warning(push));\
+    __pragma(warning(disable:__VA_ARGS__));
 
 #define PRAGMA_WARNING_POP() __pragma(warning(pop))
 
@@ -74,13 +74,13 @@
 #include <unistd.h>
 #include <sys/stat.h>
 
-typedef void			VOID;
+typedef void            VOID;
 typedef void*           LPVOID;
-typedef int				INT;
-typedef unsigned int	UINT;
-typedef void*			HANDLE;
-typedef int			    LONG;
-typedef unsigned int	DWORD;
+typedef int             INT;
+typedef unsigned int    UINT;
+typedef void*           HANDLE;
+typedef int             LONG;
+typedef unsigned int    DWORD;
 
 #undef FALSE
 #define FALSE 0
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 96b7fbf8052..fa792b42e1a 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -43,14 +43,14 @@ typedef uint8_t simdmask;
 // simd vector
 OSALIGNSIMD(union) simdvector
 {
-	simdscalar	v[4];
-	struct
-	{
-		simdscalar x, y, z, w;
-	};
+    simdscalar  v[4];
+    struct
+    {
+        simdscalar x, y, z, w;
+    };
 
-	simdscalar& operator[] (const int i) { return v[i]; }
-	const simdscalar& operator[] (const int i) const { return v[i]; }
+    simdscalar& operator[] (const int i) { return v[i]; }
+    const simdscalar& operator[] (const int i) const { return v[i]; }
 };
 
 #if KNOB_SIMD_WIDTH == 8
@@ -59,8 +59,8 @@ OSALIGNSIMD(union) simdvector
 #define _simd_load1_ps _mm256_broadcast_ss
 #define _simd_loadu_ps _mm256_loadu_ps
 #define _simd_setzero_ps _mm256_setzero_ps
-#define _simd_set1_ps	_mm256_set1_ps
-#define _simd_blend_ps	_mm256_blend_ps
+#define _simd_set1_ps   _mm256_set1_ps
+#define _simd_blend_ps  _mm256_blend_ps
 #define _simd_blendv_ps _mm256_blendv_ps
 #define _simd_store_ps _mm256_store_ps
 #define _simd_mul_ps _mm256_mul_ps
@@ -100,18 +100,18 @@ OSALIGNSIMD(union) simdvector
 INLINE \
 __m256i func(__m256i a, __m256i b)\
 {\
-	__m128i aHi = _mm256_extractf128_si256(a, 1);\
-	__m128i bHi = _mm256_extractf128_si256(b, 1);\
-	__m128i aLo = _mm256_castsi256_si128(a);\
-	__m128i bLo = _mm256_castsi256_si128(b);\
+    __m128i aHi = _mm256_extractf128_si256(a, 1);\
+    __m128i bHi = _mm256_extractf128_si256(b, 1);\
+    __m128i aLo = _mm256_castsi256_si128(a);\
+    __m128i bLo = _mm256_castsi256_si128(b);\
 \
-	__m128i subLo = intrin(aLo, bLo);\
-	__m128i subHi = intrin(aHi, bHi);\
+    __m128i subLo = intrin(aLo, bLo);\
+    __m128i subHi = intrin(aHi, bHi);\
 \
-	__m256i result = _mm256_castsi128_si256(subLo);\
-	        result = _mm256_insertf128_si256(result, subHi, 1);\
+    __m256i result = _mm256_castsi128_si256(subLo);\
+            result = _mm256_insertf128_si256(result, subHi, 1);\
 \
-	return result;\
+    return result;\
 }
 
 #if (KNOB_ARCH == KNOB_ARCH_AVX)
@@ -322,25 +322,25 @@ SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
 INLINE
 __m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
 {
-	__m128 res = _mm_mul_ps(a, b);
-	res = _mm_add_ps(res, c);
-	return res;
+    __m128 res = _mm_mul_ps(a, b);
+    res = _mm_add_ps(res, c);
+    return res;
 }
 
 INLINE
 __m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
 {
-	__m256 res = _mm256_mul_ps(a, b);
-	res = _mm256_add_ps(res, c);
-	return res;
+    __m256 res = _mm256_mul_ps(a, b);
+    res = _mm256_add_ps(res, c);
+    return res;
 }
 
 INLINE
 __m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
 {
-	__m256 res = _mm256_mul_ps(a, b);
-	res = _mm256_sub_ps(res, c);
-	return res;
+    __m256 res = _mm256_mul_ps(a, b);
+    res = _mm256_sub_ps(res, c);
+    return res;
 }
 
 INLINE
@@ -496,30 +496,30 @@ void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int sl
 
 INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
 {
-	__m128i aHi = _mm256_extractf128_si256(a, 1);
-	__m128i aLo = _mm256_castsi256_si128(a);
+    __m128i aHi = _mm256_extractf128_si256(a, 1);
+    __m128i aLo = _mm256_castsi256_si128(a);
 
-	__m128i resHi = _mm_slli_epi32(aHi, i);
-	__m128i resLo = _mm_slli_epi32(aLo, i);
+    __m128i resHi = _mm_slli_epi32(aHi, i);
+    __m128i resLo = _mm_slli_epi32(aLo, i);
 
-	__m256i result = _mm256_castsi128_si256(resLo);
-		    result = _mm256_insertf128_si256(result, resHi, 1);
+    __m256i result = _mm256_castsi128_si256(resLo);
+            result = _mm256_insertf128_si256(result, resHi, 1);
 
-	return result;
+    return result;
 }
 
 INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
 {
-	__m128i aHi = _mm256_extractf128_si256(a, 1);
-	__m128i aLo = _mm256_castsi256_si128(a);
+    __m128i aHi = _mm256_extractf128_si256(a, 1);
+    __m128i aLo = _mm256_castsi256_si128(a);
 
-	__m128i resHi = _mm_srai_epi32(aHi, i);
-	__m128i resLo = _mm_srai_epi32(aLo, i);
+    __m128i resHi = _mm_srai_epi32(aHi, i);
+    __m128i resLo = _mm_srai_epi32(aLo, i);
 
-	__m256i result = _mm256_castsi128_si256(resLo);
-		    result = _mm256_insertf128_si256(result, resHi, 1);
+    __m256i result = _mm256_castsi128_si256(resLo);
+            result = _mm256_insertf128_si256(result, resHi, 1);
 
-	return result;
+    return result;
 }
 
 INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
@@ -539,7 +539,7 @@ INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
 INLINE
 void _simdvec_transpose(simdvector &v)
 {
-	SWR_ASSERT(false, "Need to implement 8 wide version");
+    SWR_ASSERT(false, "Need to implement 8 wide version");
 }
 
 #else
@@ -550,132 +550,132 @@ void _simdvec_transpose(simdvector &v)
 INLINE
 void _simdvec_load_ps(simdvector& r, const float *p)
 {
-	r[0] = _simd_set1_ps(p[0]);
-	r[1] = _simd_set1_ps(p[1]);
-	r[2] = _simd_set1_ps(p[2]);
-	r[3] = _simd_set1_ps(p[3]);
+    r[0] = _simd_set1_ps(p[0]);
+    r[1] = _simd_set1_ps(p[1]);
+    r[2] = _simd_set1_ps(p[2]);
+    r[3] = _simd_set1_ps(p[3]);
 }
 
 INLINE
 void _simdvec_mov(simdvector& r, const simdscalar& s)
 {
-	r[0] = s;
-	r[1] = s;
-	r[2] = s;
-	r[3] = s;
+    r[0] = s;
+    r[1] = s;
+    r[2] = s;
+    r[3] = s;
 }
 
 INLINE
 void _simdvec_mov(simdvector& r, const simdvector& v)
 {
-	r[0] = v[0];
-	r[1] = v[1];
-	r[2] = v[2];
-	r[3] = v[3];
+    r[0] = v[0];
+    r[1] = v[1];
+    r[2] = v[2];
+    r[3] = v[3];
 }
 
 // just move a lane from the source simdvector to dest simdvector
 INLINE
 void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
 {
-	_simd_mov(r[0], rlane, s[0], slane);
-	_simd_mov(r[1], rlane, s[1], slane);
-	_simd_mov(r[2], rlane, s[2], slane);
-	_simd_mov(r[3], rlane, s[3], slane);
+    _simd_mov(r[0], rlane, s[0], slane);
+    _simd_mov(r[1], rlane, s[1], slane);
+    _simd_mov(r[2], rlane, s[2], slane);
+    _simd_mov(r[3], rlane, s[3], slane);
 }
 
 INLINE
 void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
 {
-	simdscalar tmp;
-	r	= _simd_mul_ps(v0[0], v1[0]);	// (v0.x*v1.x)
+    simdscalar tmp;
+    r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
 
-	tmp	= _simd_mul_ps(v0[1], v1[1]);		// (v0.y*v1.y)
-	r	= _simd_add_ps(r, tmp);			// (v0.x*v1.x) + (v0.y*v1.y)
+    tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
+    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
 
-	tmp	= _simd_mul_ps(v0[2], v1[2]);	// (v0.z*v1.z)
-	r	= _simd_add_ps(r, tmp);			// (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+    tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
+    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
 }
 
 INLINE
 void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
 {
-	simdscalar tmp;
-	r	= _simd_mul_ps(v0[0], v1[0]);	// (v0.x*v1.x)
+    simdscalar tmp;
+    r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
 
-	tmp	= _simd_mul_ps(v0[1], v1[1]);		// (v0.y*v1.y)
-	r	= _simd_add_ps(r, tmp);			// (v0.x*v1.x) + (v0.y*v1.y)
+    tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
+    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
 
-	tmp	= _simd_mul_ps(v0[2], v1[2]);	// (v0.z*v1.z)
-	r	= _simd_add_ps(r, tmp);			// (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+    tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
+    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
 
-	tmp	= _simd_mul_ps(v0[3], v1[3]);	// (v0.w*v1.w)
-	r	= _simd_add_ps(r, tmp);			// (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+    tmp = _simd_mul_ps(v0[3], v1[3]);   // (v0.w*v1.w)
+    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
 }
 
 INLINE
 simdscalar _simdvec_rcp_length_ps(const simdvector& v)
 {
-	simdscalar length;
-	_simdvec_dp4_ps(length, v, v);
-	return _simd_rsqrt_ps(length);
+    simdscalar length;
+    _simdvec_dp4_ps(length, v, v);
+    return _simd_rsqrt_ps(length);
 }
 
 INLINE
 void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
 {
-	simdscalar vecLength;
-	vecLength = _simdvec_rcp_length_ps(v);
+    simdscalar vecLength;
+    vecLength = _simdvec_rcp_length_ps(v);
 
-	r[0] = _simd_mul_ps(v[0], vecLength);
-	r[1] = _simd_mul_ps(v[1], vecLength);
-	r[2] = _simd_mul_ps(v[2], vecLength);
-	r[3] = _simd_mul_ps(v[3], vecLength);
+    r[0] = _simd_mul_ps(v[0], vecLength);
+    r[1] = _simd_mul_ps(v[1], vecLength);
+    r[2] = _simd_mul_ps(v[2], vecLength);
+    r[3] = _simd_mul_ps(v[3], vecLength);
 }
 
 INLINE
 void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
 {
-	r[0] = _simd_mul_ps(v[0], s);
-	r[1] = _simd_mul_ps(v[1], s);
-	r[2] = _simd_mul_ps(v[2], s);
-	r[3] = _simd_mul_ps(v[3], s);
+    r[0] = _simd_mul_ps(v[0], s);
+    r[1] = _simd_mul_ps(v[1], s);
+    r[2] = _simd_mul_ps(v[2], s);
+    r[3] = _simd_mul_ps(v[3], s);
 }
 
 INLINE
 void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
 {
-	r[0] = _simd_mul_ps(v0[0], v1[0]);
-	r[1] = _simd_mul_ps(v0[1], v1[1]);
-	r[2] = _simd_mul_ps(v0[2], v1[2]);
-	r[3] = _simd_mul_ps(v0[3], v1[3]);
+    r[0] = _simd_mul_ps(v0[0], v1[0]);
+    r[1] = _simd_mul_ps(v0[1], v1[1]);
+    r[2] = _simd_mul_ps(v0[2], v1[2]);
+    r[3] = _simd_mul_ps(v0[3], v1[3]);
 }
 
 INLINE
 void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
 {
-	r[0] = _simd_add_ps(v0[0], v1[0]);
-	r[1] = _simd_add_ps(v0[1], v1[1]);
-	r[2] = _simd_add_ps(v0[2], v1[2]);
-	r[3] = _simd_add_ps(v0[3], v1[3]);
+    r[0] = _simd_add_ps(v0[0], v1[0]);
+    r[1] = _simd_add_ps(v0[1], v1[1]);
+    r[2] = _simd_add_ps(v0[2], v1[2]);
+    r[3] = _simd_add_ps(v0[3], v1[3]);
 }
 
 INLINE
 void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
 {
-	r[0] = _simd_min_ps(v0[0], s);
-	r[1] = _simd_min_ps(v0[1], s);
-	r[2] = _simd_min_ps(v0[2], s);
-	r[3] = _simd_min_ps(v0[3], s);
+    r[0] = _simd_min_ps(v0[0], s);
+    r[1] = _simd_min_ps(v0[1], s);
+    r[2] = _simd_min_ps(v0[2], s);
+    r[3] = _simd_min_ps(v0[3], s);
 }
 
 INLINE
 void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
 {
-	r[0] = _simd_max_ps(v0[0], s);
-	r[1] = _simd_max_ps(v0[1], s);
-	r[2] = _simd_max_ps(v0[2], s);
-	r[3] = _simd_max_ps(v0[3], s);
+    r[0] = _simd_max_ps(v0[0], s);
+    r[1] = _simd_max_ps(v0[1], s);
+    r[2] = _simd_max_ps(v0[2], s);
+    r[3] = _simd_max_ps(v0[3], s);
 }
 
 // Matrix4x4 * Vector4
@@ -685,65 +685,65 @@ void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
 //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
 INLINE
 void _simd_mat4x4_vec4_multiply(
-	simdvector& result,
-	const float *pMatrix,
-	const simdvector& v)
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
 {
-	simdscalar m;
-	simdscalar r0;
-	simdscalar r1;
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
 
-	m	= _simd_load1_ps(pMatrix + 0*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 3);	// m[row][3]
-	r1	= _simd_mul_ps(m, v[3]);				// (m3 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-	result[0] = r0;
+    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
+    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+    result[0] = r0;
 
-	m	= _simd_load1_ps(pMatrix + 1*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 3);	// m[row][3]
-	r1	= _simd_mul_ps(m, v[3]);				// (m3 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-	result[1] = r0;
+    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
+    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+    result[1] = r0;
 
-	m	= _simd_load1_ps(pMatrix + 2*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 3);	// m[row][3]
-	r1	= _simd_mul_ps(m, v[3]);				// (m3 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-	result[2] = r0;
+    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
+    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+    result[2] = r0;
 
-	m	= _simd_load1_ps(pMatrix + 3*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 3*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 3*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 3*4 + 3);	// m[row][3]
-	r1	= _simd_mul_ps(m, v[3]);				// (m3 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-	result[3] = r0;
+    m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
+    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+    result[3] = r0;
 }
 
 // Matrix4x4 * Vector3 - Direction Vector where w = 0.
@@ -753,45 +753,45 @@ void _simd_mat4x4_vec4_multiply(
 //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
 INLINE
 void _simd_mat3x3_vec3_w0_multiply(
-	simdvector& result,
-	const float *pMatrix,
-	const simdvector& v)
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
 {
-	simdscalar m;
-	simdscalar r0;
-	simdscalar r1;
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
 
-	m	= _simd_load1_ps(pMatrix + 0*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	result[0] = r0;
+    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    result[0] = r0;
 
-	m	= _simd_load1_ps(pMatrix + 1*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	result[1] = r0;
+    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    result[1] = r0;
 
-	m	= _simd_load1_ps(pMatrix + 2*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	result[2] = r0;
+    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    result[2] = r0;
 
-	result[3] = _simd_setzero_ps();
+    result[3] = _simd_setzero_ps();
 }
 
 // Matrix4x4 * Vector3 - Position vector where w = 1.
@@ -801,108 +801,108 @@ void _simd_mat3x3_vec3_w0_multiply(
 //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
 INLINE
 void _simd_mat4x4_vec3_w1_multiply(
-	simdvector& result,
-	const float *pMatrix,
-	const simdvector& v)
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
 {
-	simdscalar m;
-	simdscalar r0;
-	simdscalar r1;
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
 
-	m	= _simd_load1_ps(pMatrix + 0*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 3);	// m[row][3]
-	r0	= _simd_add_ps(r0, m);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-	result[0] = r0;
+    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
+    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    result[0] = r0;
 
-	m	= _simd_load1_ps(pMatrix + 1*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 3);	// m[row][3]
-	r0	= _simd_add_ps(r0, m);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-	result[1] = r0;
+    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
+    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    result[1] = r0;
 
-	m	= _simd_load1_ps(pMatrix + 2*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 3);	// m[row][3]
-	r0	= _simd_add_ps(r0, m);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-	result[2] = r0;
+    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
+    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    result[2] = r0;
 
-	m	= _simd_load1_ps(pMatrix + 3*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 3*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 3*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 3*4 + 3);	// m[row][3]
-	result[3]	= _simd_add_ps(r0, m);			// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
+    result[3]   = _simd_add_ps(r0, m);          // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
 }
 
 INLINE
 void _simd_mat4x3_vec3_w1_multiply(
-	simdvector& result,
-	const float *pMatrix,
-	const simdvector& v)
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
 {
-	simdscalar m;
-	simdscalar r0;
-	simdscalar r1;
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
 
-	m	= _simd_load1_ps(pMatrix + 0*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 3);	// m[row][3]
-	r0	= _simd_add_ps(r0, m);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-	result[0] = r0;
+    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
+    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    result[0] = r0;
 
-	m	= _simd_load1_ps(pMatrix + 1*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 3);	// m[row][3]
-	r0	= _simd_add_ps(r0, m);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-	result[1] = r0;
+    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
+    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    result[1] = r0;
 
-	m	= _simd_load1_ps(pMatrix + 2*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 3);	// m[row][3]
-	r0	= _simd_add_ps(r0, m);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-	result[2] = r0;
-	result[3] = _simd_set1_ps(1.0f);
+    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
+    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    result[2] = r0;
+    result[3] = _simd_set1_ps(1.0f);
 }
 
 //////////////////////////////////////////////////////////////////////////
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 7afbb70a383..c9a5fd0f23f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -1211,7 +1211,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             }
             else
             {
-				psContext.activeMask = _simd_set1_epi32(-1);
+                psContext.activeMask = _simd_set1_epi32(-1);
             }
 
             // need to declare enough space for all samples
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
index 9a2f0434db5..d11de79b01f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -146,7 +146,7 @@ float calcDeterminantInt(const __m128i vA, const __m128i vB)
     //vMul = [A1*B2 - B1*A2]
     vMul = _mm_sub_epi64(vMul, vMul2);
 
-	// According to emmintrin.h __mm_store1_pd(), address must be 16-byte aligned
+    // According to emmintrin.h __mm_store1_pd(), address must be 16-byte aligned
     OSALIGN(int64_t, 16) result;
     _mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul));
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
index c78c9784b3d..e73b232757b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
@@ -27,7 +27,7 @@ import json as JSON
 import operator
 
 header = r"""/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -84,16 +84,16 @@ inst_aliases = {
 }
 
 intrinsics = [
-	    ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]],
+        ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]],
         ["VGATHERDD", "x86_avx2_gather_d_d_256", ["src", "pBase", "indices", "mask", "scale"]],
-	    ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]],
-	    ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]],
-	    ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]],
-	    ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]],
-	    ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]],
-	    ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]],
-	    ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]],
-	    ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]],
+        ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]],
+        ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]],
+        ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]],
+        ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]],
+        ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]],
+        ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]],
+        ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]],
+        ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]],
         ["VCMPPS", "x86_avx_cmp_ps_256", ["a", "b", "cmpop"]],
         ["VBLENDVPS", "x86_avx_blendv_ps_256", ["a", "b", "mask"]],
         ["BEXTR_32", "x86_bmi_bextr_32", ["src", "control"]],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
index 7bba435467b..0b53a929e6c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
@@ -28,7 +28,7 @@ import operator
 
 header = r"""
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
index 44ab69815b1..3d003fb4a33 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+# Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index cf4af71811d..9aa43376f35 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -1,4 +1,4 @@
-﻿# Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+﻿# Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
index 66c8e84b827..521346ca833 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
+++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
@@ -10,7 +10,7 @@
         return ' '*(max_len - knob_len)
 %>/******************************************************************************
 *
-* Copyright 2015
+* Copyright 2015-2016
 * Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");

From b958aea78abca7f7dc4e3724950fa02a11c25e99 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Wed, 16 Mar 2016 11:56:50 -0600
Subject: [PATCH 035/238] swr: [rasterizer common] changes for cygwin

---
 src/gallium/drivers/swr/rasterizer/common/os.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index a1698644eb0..d4bec908bb4 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -73,6 +73,7 @@
 #include <sys/types.h>
 #include <unistd.h>
 #include <sys/stat.h>
+#include <stdio.h>
 
 typedef void            VOID;
 typedef void*           LPVOID;
@@ -94,8 +95,11 @@ typedef unsigned int    DWORD;
 #define INLINE __inline
 #endif
 #define DEBUGBREAK asm ("int $3")
+#if !defined(__CYGWIN__)
 #define __cdecl
+#define __stdcall
 #define __declspec(X)
+#endif
 
 #define GCC_VERSION (__GNUC__ * 10000 \
                      + __GNUC_MINOR__ * 100 \
@@ -177,7 +181,6 @@ unsigned char _bittest(const LONG *a, LONG b)
 #define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1)
 #define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1)
 #define _ReadWriteBarrier() asm volatile("" ::: "memory")
-#define __stdcall
 
 #define PRAGMA_WARNING_PUSH_DISABLE(...)
 #define PRAGMA_WARNING_POP()

From 257db3610a91a9355d8301b8fb6123346f9c1b07 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Wed, 16 Mar 2016 17:54:04 -0600
Subject: [PATCH 036/238] swr: [rasterizer jitter] signed immediate builder

---
 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp | 7 +++++++
 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h   | 1 +
 2 files changed, 8 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 178f4c27c65..876fe83511e 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -259,6 +259,13 @@ uint32_t Builder::IMMED(Value* v)
     return pValConst->getZExtValue();
 }
 
+int32_t Builder::S_IMMED(Value* v)
+{
+    SWR_ASSERT(isa<ConstantInt>(v));
+    ConstantInt *pValConst = cast<ConstantInt>(v);
+    return pValConst->getSExtValue();
+}
+
 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
 {
     std::vector<Value*> indices;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 18c30a2891f..4c9c431179f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -67,6 +67,7 @@ Value *VRCP(Value *va);
 Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY);
 
 uint32_t IMMED(Value* i);
+int32_t S_IMMED(Value* i);
 
 Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
 Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);

From 9111d63228afffed301bb888eb71af287a0887d3 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 17 Mar 2016 12:22:43 -0600
Subject: [PATCH 037/238] swr: [rasterizer] Fix run-time check asserts

One innocuous (uninitialized variable), and one not so innocuous
(stack corruption).
---
 src/gallium/drivers/swr/rasterizer/core/frontend.cpp |  4 ++--
 src/gallium/drivers/swr/rasterizer/core/frontend.h   | 11 +++++------
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 27afc9640c7..e780ffbf175 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -243,8 +243,8 @@ void ProcessDiscardInvalidateTiles(
         macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
     }
 
-    SWR_ASSERT(macroTileEndX < KNOB_NUM_HOT_TILES_X);
-    SWR_ASSERT(macroTileEndY < KNOB_NUM_HOT_TILES_Y);
+    SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
+    SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);
 
     macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
     macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
index d11de79b01f..f92f88c3226 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -146,14 +146,13 @@ float calcDeterminantInt(const __m128i vA, const __m128i vB)
     //vMul = [A1*B2 - B1*A2]
     vMul = _mm_sub_epi64(vMul, vMul2);
 
-    // According to emmintrin.h __mm_store1_pd(), address must be 16-byte aligned
-    OSALIGN(int64_t, 16) result;
-    _mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul));
+    int64_t result;
+    _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));
 
-    double fResult = (double)result;
-    fResult = fResult * (1.0 / FIXED_POINT16_SCALE);
+    double dResult = (double)result;
+    dResult = dResult * (1.0 / FIXED_POINT16_SCALE);
 
-    return (float)fResult;
+    return (float)dResult;
 }
 
 INLINE

From 4b4547a7216ec6309da54f508211c0aba02ad5e3 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 17 Mar 2016 15:39:13 -0600
Subject: [PATCH 038/238] swr: [rasterizer] Reduce max in-flight draws to 96
 (by default)

---
 src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 9aa43376f35..5843abf2aeb 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -121,7 +121,7 @@ KNOBS = [
 
     ['MAX_DRAWS_IN_FLIGHT', {
         'type'      : 'uint32_t',
-        'default'   : '160',
+        'default'   : '96',
         'desc'      : ['Maximum number of draws outstanding before API thread blocks.'],
         'category'  : 'perf',
     }],

From 73904184412fa5e9c2f1bab8580664c449f12aa2 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 17 Mar 2016 16:12:17 -0600
Subject: [PATCH 039/238] swr: [rasterizer core] Add clipping of user clip
 planes in clipper.

---
 .../drivers/swr/rasterizer/core/clip.h        | 86 +++++++++++++++++++
 1 file changed, 86 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 4f51388d9cf..ba5870a92bb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -357,6 +357,25 @@ public:
             }
         }
 
+        // assemble user clip distances if enabled
+        if (this->state.rastState.clipDistanceMask & 0xf)
+        {
+            pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
+            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+            {
+                vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
+            }
+        }
+
+        if (this->state.rastState.clipDistanceMask & 0xf0)
+        {
+            pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
+            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+            {
+                vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
+            }
+        }
+
         uint32_t numAttribs = maxSlot + 1;
 
         simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
@@ -439,6 +458,27 @@ public:
                 }
             }
 
+            // transpose user clip distances if enabled
+            if (this->state.rastState.clipDistanceMask & 0xf)
+            {
+                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
+                for (uint32_t c = 0; c < 4; ++c)
+                {
+                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
+                    pBase += sizeof(simdscalar);
+                }
+            }
+
+            if (this->state.rastState.clipDistanceMask & 0xf0)
+            {
+                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
+                for (uint32_t c = 0; c < 4; ++c)
+                {
+                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
+                    pBase += sizeof(simdscalar);
+                }
+            }
+
             PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
 
             while (clipPa.GetNextStreamOutput())
@@ -633,6 +673,31 @@ private:
                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
             }
         }
+
+        // interpolate clip distance if enabled
+        if (this->state.rastState.clipDistanceMask & 0xf)
+        {
+            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
+            for (uint32_t c = 0; c < 4; ++c)
+            {
+                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
+            }
+        }
+
+        if (this->state.rastState.clipDistanceMask & 0xf0)
+        {
+            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
+            for (uint32_t c = 0; c < 4; ++c)
+            {
+                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
+            }
+        }
     }
 
     template<SWR_CLIPCODES ClippingPlane>
@@ -703,6 +768,27 @@ private:
                     }
                 }
 
+                // store clip distance if enabled
+                if (this->state.rastState.clipDistanceMask & 0xf)
+                {
+                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
+                    for (uint32_t c = 0; c < 4; ++c)
+                    {
+                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
+                    }
+                }
+
+                if (this->state.rastState.clipDistanceMask & 0xf0)
+                {
+                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
+                    for (uint32_t c = 0; c < 4; ++c)
+                    {
+                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
+                    }
+                }
+
                 // increment outIndex
                 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
             }

From 5899076b6b24a7275fb6b4ad6a42686225ef0156 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 17 Mar 2016 16:50:46 -0600
Subject: [PATCH 040/238] swr: [rasterizer core] Reset DrawContext arena at end
 of draw rather than upon reclaim of DC

Keeps overall memory consumption lower.
Also, remove unused knobs.
---
 .../drivers/swr/rasterizer/core/threads.cpp   |  4 ++++
 .../swr/rasterizer/scripts/knob_defs.py       | 21 -------------------
 2 files changed, 4 insertions(+), 21 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index c4567eaee87..57408049d03 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -290,6 +290,10 @@ INLINE void CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
     {
         _ReadWriteBarrier();
 
+        // Cleanup memory allocations
+        pDC->pArena->Reset();
+        pDC->pTileMgr->initialize();
+
         pContext->dcRing.Dequeue();  // Remove from tail
     }
 }
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 5843abf2aeb..0f3ded68544 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -144,27 +144,6 @@ KNOBS = [
         'category'  : 'perf',
     }],
 
-    ['MAX_FRAC_ODD_TESS_FACTOR', {
-        'type'      : 'float',
-        'default'   : '63.0f',
-        'desc'      : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'],
-        'category'  : 'perf',
-    }],
-
-    ['MAX_FRAC_EVEN_TESS_FACTOR', {
-        'type'      : 'float',
-        'default'   : '64.0f',
-        'desc'      : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'],
-        'category'  : 'perf',
-    }],
-
-    ['MAX_INTEGER_TESS_FACTOR', {
-        'type'      : 'uint32_t',
-        'default'   : '64',
-        'desc'      : ['(DEBUG) Maximum tessellation factor for integer partitioning.'],
-        'category'  : 'perf',
-    }],
-
 
     ['BUCKETS_ENABLE_THREADVIZ', {
         'type'      : 'bool',

From 700a5b06e036d7515c6d5f2f9e2d40e5a65eb964 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 17 Mar 2016 18:10:25 -0600
Subject: [PATCH 041/238] swr: [rasterizer core] Arena optimizations -
 preparing for global allocator.

---
 src/gallium/drivers/swr/Makefile.sources-arch |   1 -
 .../drivers/swr/rasterizer/core/arena.cpp     | 166 ------------------
 .../drivers/swr/rasterizer/core/arena.h       | 134 ++++++++++++--
 .../drivers/swr/rasterizer/core/tilemgr.cpp   |   8 -
 .../drivers/swr/rasterizer/core/tilemgr.h     |   9 +-
 5 files changed, 131 insertions(+), 187 deletions(-)
 delete mode 100644 src/gallium/drivers/swr/rasterizer/core/arena.cpp

diff --git a/src/gallium/drivers/swr/Makefile.sources-arch b/src/gallium/drivers/swr/Makefile.sources-arch
index 7544f8efccc..a04b1203c7c 100644
--- a/src/gallium/drivers/swr/Makefile.sources-arch
+++ b/src/gallium/drivers/swr/Makefile.sources-arch
@@ -59,7 +59,6 @@ COMMON_CXX_SOURCES := \
 CORE_CXX_SOURCES := \
 	rasterizer/core/api.cpp \
 	rasterizer/core/api.h \
-	rasterizer/core/arena.cpp \
 	rasterizer/core/arena.h \
 	rasterizer/core/backend.cpp \
 	rasterizer/core/backend.h \
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.cpp b/src/gallium/drivers/swr/rasterizer/core/arena.cpp
deleted file mode 100644
index 8184c8d3f4c..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/arena.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file arena.cpp
-*
-* @brief Arena memory manager
-*        The arena is convenient and fast for managing allocations for any of
-*        our allocations that are associated with operations and can all be freed
-*        once when their operation has completed. Allocations are cheap since
-*        most of the time its simply an increment of an offset. Also, no need to
-*        free individual allocations. All of the arena memory can be freed at once.
-*
-******************************************************************************/
-
-#include "context.h"
-#include "arena.h"
-
-#include <cmath>
-
-Arena::Arena()
-    : m_pCurBlock(nullptr), m_size(0)
-{
-    m_pMutex = new std::mutex();
-}
-
-Arena::~Arena()
-{
-    Reset();        // Reset just in case to avoid leaking memory.
-
-    if (m_pCurBlock)
-    {
-        _aligned_free(m_pCurBlock->pMem);
-        delete m_pCurBlock;
-    }
-
-    delete m_pMutex;
-}
-
-///@todo Remove this when all users have stopped using this.
-void Arena::Init()
-{
-    m_size = 0;
-    m_pCurBlock = nullptr;
-
-    m_pMutex = new std::mutex();
-}
-
-void* Arena::AllocAligned(size_t size, size_t align)
-{
-    if (m_pCurBlock)
-    {
-        ArenaBlock* pCurBlock = m_pCurBlock;
-        pCurBlock->offset = AlignUp(pCurBlock->offset, align);
-
-        if ((pCurBlock->offset + size) <= pCurBlock->blockSize)
-        {
-            void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset);
-            pCurBlock->offset += size;
-            m_size += size;
-            return pMem;
-        }
-
-        // Not enough memory in this block, fall through to allocate
-        // a new block
-    }
-
-    static const size_t ArenaBlockSize = 1024*1024;
-    size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
-    blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4);
-
-    void *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4);    // Arena blocks are always simd byte aligned.
-    SWR_ASSERT(pMem != nullptr);
-
-    ArenaBlock* pNewBlock = new (std::nothrow) ArenaBlock();
-    SWR_ASSERT(pNewBlock != nullptr);
-
-    if (pNewBlock != nullptr)
-    {
-        pNewBlock->pNext        = m_pCurBlock;
-
-        m_pCurBlock             = pNewBlock;
-        m_pCurBlock->pMem       = pMem;
-        m_pCurBlock->blockSize  = blockSize;
-
-    }
-
-    return AllocAligned(size, align);
-}
-
-void* Arena::Alloc(size_t size)
-{
-    return AllocAligned(size, 1);
-}
-
-void* Arena::AllocAlignedSync(size_t size, size_t align)
-{
-    void* pAlloc = nullptr;
-
-    SWR_ASSERT(m_pMutex != nullptr);
-
-    m_pMutex->lock();
-    pAlloc = AllocAligned(size, align);
-    m_pMutex->unlock();
-
-    return pAlloc;
-}
-
-void* Arena::AllocSync(size_t size)
-{
-    void* pAlloc = nullptr;
-
-    SWR_ASSERT(m_pMutex != nullptr);
-
-    m_pMutex->lock();
-    pAlloc = Alloc(size);
-    m_pMutex->unlock();
-
-    return pAlloc;
-}
-
-void Arena::Reset(bool removeAll)
-{
-    if (m_pCurBlock)
-    {
-        m_pCurBlock->offset = 0;
-
-        ArenaBlock *pUsedBlocks = m_pCurBlock->pNext;
-        m_pCurBlock->pNext = nullptr;
-        while(pUsedBlocks)
-        {
-            ArenaBlock* pBlock = pUsedBlocks;
-            pUsedBlocks = pBlock->pNext;
-
-            _aligned_free(pBlock->pMem);
-            delete pBlock;
-        }
-
-        if (removeAll)
-        {
-            _aligned_free(m_pCurBlock->pMem);
-            delete m_pCurBlock;
-            m_pCurBlock = nullptr;
-        }
-    }
-
-    m_size = 0;
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 76eee11fb08..b6b4d829576 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -34,25 +34,134 @@
 
 #include <mutex>
 
-class Arena
+class DefaultAllocator
 {
 public:
-    Arena();
-   ~Arena();
+    void* AllocateAligned(size_t size, size_t align)
+    {
+        void* p = _aligned_malloc(size, align);
+        return p;
+    }
+    void  Free(void* pMem)
+    {
+        _aligned_free(pMem);
+    }
+};
 
-    void        Init();
+template<typename T = DefaultAllocator>
+class TArena
+{
+public:
+    TArena(T& in_allocator)  : m_allocator(in_allocator) {}
+    TArena()                 : m_allocator(m_defAllocator) {}
+    ~TArena()
+    {
+        Reset(true);
+    }
 
-    void*       AllocAligned(size_t size, size_t  align);
-    void*       Alloc(size_t  size);
+    void* AllocAligned(size_t size, size_t  align)
+    {
+        if (m_pCurBlock)
+        {
+            ArenaBlock* pCurBlock = m_pCurBlock;
+            pCurBlock->offset = AlignUp(pCurBlock->offset, align);
 
-    void*       AllocAlignedSync(size_t size, size_t align);
-    void*       AllocSync(size_t size);
+            if ((pCurBlock->offset + size) <= pCurBlock->blockSize)
+            {
+                void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset);
+                pCurBlock->offset += size;
+                m_size += size;
+                return pMem;
+            }
 
-    void        Reset(bool removeAll = false);
-    size_t      Size() { return m_size; }
+            // Not enough memory in this block, fall through to allocate
+            // a new block
+        }
+
+        static const size_t ArenaBlockSize = 1024 * 1024;
+        size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
+
+        // Add in one BLOCK_ALIGN unit to store ArenaBlock in.
+        blockSize = AlignUp(blockSize + BLOCK_ALIGN, BLOCK_ALIGN);
+
+        void *pMem = m_allocator.AllocateAligned(blockSize, BLOCK_ALIGN);    // Arena blocks are always simd byte aligned.
+        SWR_ASSERT(pMem != nullptr);
+
+        ArenaBlock* pNewBlock = new (pMem) ArenaBlock();
+
+        if (pNewBlock != nullptr)
+        {
+            pNewBlock->pNext = m_pCurBlock;
+
+            m_pCurBlock = pNewBlock;
+            m_pCurBlock->pMem = PtrAdd(pMem, BLOCK_ALIGN);
+            m_pCurBlock->blockSize = blockSize - BLOCK_ALIGN;
+
+        }
+
+        return AllocAligned(size, align);
+    }
+
+    void* Alloc(size_t  size)
+    {
+        return AllocAligned(size, 1);
+    }
+
+    void* AllocAlignedSync(size_t size, size_t align)
+    {
+        void* pAlloc = nullptr;
+
+        std::unique_lock<std::mutex> l(m_mutex);
+        pAlloc = AllocAligned(size, align);
+
+        return pAlloc;
+    }
+
+    void* AllocSync(size_t size)
+    {
+        void* pAlloc = nullptr;
+
+        std::unique_lock<std::mutex> l(m_mutex);
+        pAlloc = Alloc(size);
+
+        return pAlloc;
+    }
+
+    void Reset(bool removeAll = false)
+    {
+        if (m_pCurBlock)
+        {
+            m_pCurBlock->offset = 0;
+
+            ArenaBlock *pUsedBlocks = m_pCurBlock->pNext;
+            m_pCurBlock->pNext = nullptr;
+            while (pUsedBlocks)
+            {
+                ArenaBlock* pBlock = pUsedBlocks;
+                pUsedBlocks = pBlock->pNext;
+
+                m_allocator.Free(pBlock);
+            }
+
+            if (removeAll)
+            {
+                m_allocator.Free(m_pCurBlock);
+                m_pCurBlock = nullptr;
+            }
+        }
+
+        m_size = 0;
+    }
+
+    size_t Size() const { return m_size; }
 
 private:
 
+    static const size_t BLOCK_ALIGN = KNOB_SIMD_WIDTH * 4;
+
+    DefaultAllocator    m_defAllocator;
+    T&                  m_allocator;
+
     struct ArenaBlock
     {
         void*       pMem        = nullptr;
@@ -60,10 +169,13 @@ private:
         size_t      offset      = 0;
         ArenaBlock* pNext       = nullptr;
     };
+    static_assert(sizeof(ArenaBlock) <= BLOCK_ALIGN, "Increase BLOCK_ALIGN size");
 
     ArenaBlock*     m_pCurBlock = nullptr;
     size_t          m_size      = 0;
 
     /// @note Mutex is only used by sync allocation functions.
-    std::mutex*     m_pMutex;
+    std::mutex      m_mutex;
 };
+
+typedef TArena<> Arena;
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index ac2117bf4a4..f3c24dacb48 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -60,14 +60,6 @@ MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena)
 {
 }
 
-void MacroTileMgr::initialize()
-{
-    mWorkItemsProduced = 0;
-    mWorkItemsConsumed = 0;
-
-    mDirtyTiles.clear();
-}
-
 void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
 {
     // Should not enqueue more then what we have backing for in the hot tile manager.
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index 30f80ce4247..f3e1373b00f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -113,7 +113,14 @@ public:
         }
     }
 
-    void initialize();
+    INLINE void initialize()
+    {
+        mWorkItemsProduced = 0;
+        mWorkItemsConsumed = 0;
+
+        mDirtyTiles.clear();
+    }
+
     INLINE std::vector<uint32_t>& getDirtyTiles() { return mDirtyTiles; }
     INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; }
     void markTileComplete(uint32_t id);

From 4893224e2851683341d848926d267e5b5a4f39dc Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Fri, 18 Mar 2016 11:48:47 -0600
Subject: [PATCH 042/238] swr: [rasterizer core] Add clipping against user clip
 distances in the NullPS backend.

---
 .../drivers/swr/rasterizer/core/backend.cpp        | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index c9a5fd0f23f..7fb83edf169 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -1410,9 +1410,11 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
     RDTSC_START(BESetup);
 
     static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
+
     SWR_CONTEXT *pContext = pDC->pContext;
     const API_STATE& state = GetApiState(pDC);
     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+    const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
 
     // broadcast scalars
     BarycentricCoeffs coeffs;
@@ -1451,7 +1453,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
             while (_BitScanForward(&sample, sampleMask))
             {
                 sampleMask &= ~(1 << sample);
-                if (work.coverageMask[sample] & MASK)
+                simdmask coverageMask = work.coverageMask[sample] & MASK;
+                if (coverageMask)
                 {
                     RDTSC_START(BEBarycentric);
                     // calculate per sample positions
@@ -1465,7 +1468,14 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
 
                     RDTSC_STOP(BEBarycentric, 0, 0);
 
-                    simdscalar vCoverageMask = vMask(work.coverageMask[sample] & MASK);
+                    // interpolate user clip distance if available
+                    if (rastState.clipDistanceMask)
+                    {
+                        coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
+                            psContext.vI.sample, psContext.vJ.sample);
+                    }
+
+                    simdscalar vCoverageMask = vMask(coverageMask);
                     simdscalar stencilPassMask = vCoverageMask;
 
                     // offset depth/stencil buffers current sample

From 12ce9d9aa1819c0d7fb969b459a070c3cc9a617f Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Fri, 18 Mar 2016 12:11:20 -0600
Subject: [PATCH 043/238] swr: [rasterizer] more arena work

---
 .../drivers/swr/rasterizer/core/api.cpp       |   2 +-
 .../drivers/swr/rasterizer/core/arena.h       | 104 +++++++++++++++++-
 .../drivers/swr/rasterizer/core/threads.cpp   |   5 +-
 .../drivers/swr/rasterizer/core/threads.h     |   4 +-
 .../drivers/swr/rasterizer/core/utils.cpp     |   5 +
 5 files changed, 110 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index c3c603d294c..453d0295b54 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -189,7 +189,7 @@ void QueueWork(SWR_CONTEXT *pContext)
 
         if (IsDraw)
         {
-            std::unordered_set<uint32_t> lockedTiles;
+            static TileSet lockedTiles;
             uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
             WorkOnFifoFE(pContext, 0, curDraw[0], 0);
             WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index b6b4d829576..4cdb728e1ef 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -33,6 +33,9 @@
 #pragma once
 
 #include <mutex>
+#include <algorithm>
+#include <atomic>
+#include "core/utils.h"
 
 class DefaultAllocator
 {
@@ -48,7 +51,7 @@ public:
     }
 };
 
-template<typename T = DefaultAllocator>
+template<typename MutexT = std::mutex, typename T = DefaultAllocator>
 class TArena
 {
 public:
@@ -79,7 +82,7 @@ public:
         }
 
         static const size_t ArenaBlockSize = 1024 * 1024;
-        size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
+        size_t blockSize = std::max<size_t>(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
 
         // Add in one BLOCK_ALIGN unit to store ArenaBlock in.
         blockSize = AlignUp(blockSize + BLOCK_ALIGN, BLOCK_ALIGN);
@@ -111,8 +114,9 @@ public:
     {
         void* pAlloc = nullptr;
 
-        std::unique_lock<std::mutex> l(m_mutex);
+        m_mutex.lock();
         pAlloc = AllocAligned(size, align);
+        m_mutex.unlock();
 
         return pAlloc;
     }
@@ -121,8 +125,9 @@ public:
     {
         void* pAlloc = nullptr;
 
-        std::unique_lock<std::mutex> l(m_mutex);
+        m_mutex.lock();
         pAlloc = Alloc(size);
+        m_mutex.unlock();
 
         return pAlloc;
     }
@@ -175,7 +180,96 @@ private:
     size_t          m_size      = 0;
 
     /// @note Mutex is only used by sync allocation functions.
-    std::mutex      m_mutex;
+    MutexT          m_mutex;
 };
 
 typedef TArena<> Arena;
+
+struct NullMutex
+{
+    void lock() {}
+    void unlock() {}
+};
+
+// Ref counted Arena for ArenaAllocator
+// NOT THREAD SAFE!!
+struct RefArena : TArena<NullMutex>
+{
+    uint32_t AddRef() { return ++m_refCount; }
+    uint32_t Release() { if (--m_refCount) { return m_refCount; } delete this; return 0; }
+
+    void* allocate(std::size_t n)
+    {
+        ++m_numAllocations;
+        return Alloc(n);
+    }
+
+    void deallocate(void* p) { --m_numAllocations; }
+    void clear() { SWR_ASSERT(0 == m_numAllocations); Reset(); }
+
+private:
+    uint32_t m_refCount = 0;
+    uint32_t m_numAllocations = 0;
+};
+
+#if 0 // THIS DOESN'T WORK!!!
+// Arena based replacement for std::allocator
+template <typename T>
+struct ArenaAllocator
+{
+    typedef T value_type;
+    ArenaAllocator()
+    {
+        m_pArena = new RefArena();
+        m_pArena->AddRef();
+    }
+    ~ArenaAllocator()
+    {
+        m_pArena->Release(); m_pArena = nullptr;
+    }
+    ArenaAllocator(const ArenaAllocator& copy)
+    {
+        m_pArena = const_cast<RefArena*>(copy.m_pArena); m_pArena->AddRef();
+    }
+
+
+    template <class U> ArenaAllocator(const ArenaAllocator<U>& copy)
+    {
+        m_pArena = const_cast<RefArena*>(copy.m_pArena); m_pArena->AddRef();
+    }
+    T* allocate(std::size_t n)
+    {
+#if defined(_DEBUG)
+        char buf[32];
+        sprintf_s(buf, "Alloc: %lld\n", n);
+        OutputDebugStringA(buf);
+#endif
+        void* p = m_pArena->allocate(n * sizeof(T));
+        return static_cast<T*>(p);
+    }
+    void deallocate(T* p, std::size_t n)
+    {
+#if defined(_DEBUG)
+        char buf[32];
+        sprintf_s(buf, "Dealloc: %lld\n", n);
+        OutputDebugStringA(buf);
+#endif
+        m_pArena->deallocate(p);
+    }
+    void clear() { m_pArena->clear(); }
+
+    RefArena* m_pArena = nullptr;
+};
+
+template <class T, class U>
+bool operator== (const ArenaAllocator<T>&, const ArenaAllocator<U>&)
+{
+    return true;
+}
+
+template <class T, class U>
+bool operator!= (const ArenaAllocator<T>&, const ArenaAllocator<U>&)
+{
+    return false;
+}
+#endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 57408049d03..ff25e82f0fe 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -24,7 +24,6 @@
 #include <stdio.h>
 #include <thread>
 #include <algorithm>
-#include <unordered_set>
 #include <float.h>
 #include <vector>
 #include <utility>
@@ -345,7 +344,7 @@ void WorkOnFifoBE(
     SWR_CONTEXT *pContext,
     uint32_t workerId,
     uint64_t &curDrawBE,
-    std::unordered_set<uint32_t>& lockedTiles)
+    TileSet& lockedTiles)
 {
     // Find the first incomplete draw that has pending work. If no such draw is found then
     // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
@@ -550,7 +549,7 @@ DWORD workerThreadMain(LPVOID pData)
 
     // Track tiles locked by other threads. If we try to lock a macrotile and find its already
     // locked then we'll add it to this list so that we don't try and lock it again.
-    std::unordered_set<uint32_t> lockedTiles;
+    TileSet lockedTiles;
 
     // each worker has the ability to work on any of the queued draws as long as certain
     // conditions are met. the data associated
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index ec0b735a4ec..6b37e3ac179 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -54,10 +54,12 @@ struct THREAD_POOL
     THREAD_DATA *pThreadData;
 };
 
+typedef std::unordered_set<uint32_t> TileSet;
+
 void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 
 // Expose FE and BE worker functions to the API thread if single threaded
 void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode);
-void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, std::unordered_set<uint32_t> &usedTiles);
+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles);
 void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
index f36452f2cec..a1d665e77cc 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
@@ -27,6 +27,11 @@
 ******************************************************************************/
 #if defined(_WIN32)
 
+#if defined(NOMINMAX)
+// GDI Plus requires non-std min / max macros be defined :(
+#undef NOMINMAX
+#endif
+
 #include<Windows.h>
 #include <Gdiplus.h>
 #include <Gdiplusheaders.h>

From ec9d4c4b372df773e4453c228b938e7c6c526c4c Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Mon, 21 Mar 2016 11:15:32 -0600
Subject: [PATCH 044/238] swr: [rasterizer core] Globally cache allocated arena
 blocks for fast re-allocation.

---
 .../drivers/swr/rasterizer/core/api.cpp       |   6 +-
 .../drivers/swr/rasterizer/core/arena.h       |  58 +++++----
 .../drivers/swr/rasterizer/core/context.h     | 120 +++++++++++++++++-
 .../drivers/swr/rasterizer/core/fifo.hpp      |   6 +-
 .../drivers/swr/rasterizer/core/frontend.cpp  |  10 +-
 .../drivers/swr/rasterizer/core/threads.cpp   |   2 +-
 .../drivers/swr/rasterizer/core/tilemgr.cpp   |   2 +-
 .../drivers/swr/rasterizer/core/tilemgr.h     |  10 +-
 8 files changed, 168 insertions(+), 46 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 453d0295b54..6ebb3f87f7a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -66,11 +66,11 @@ HANDLE SwrCreateContext(
 
     for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
     {
-        pContext->dcRing[dc].pArena = new Arena();
+        pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
         pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
         pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
 
-        pContext->dsRing[dc].pArena = new Arena();
+        pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
     }
 
     if (!KNOB_SINGLE_THREADED)
@@ -252,7 +252,7 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
         pCurDrawContext->pState = &pContext->dsRing[dsIndex];
 
-        Arena& stateArena = *(pCurDrawContext->pState->pArena);
+        auto& stateArena = *(pCurDrawContext->pState->pArena);
 
         // Copy previous state to current state.
         if (pContext->pPrevDrawContext)
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 4cdb728e1ef..71fb258f4d4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -51,6 +51,16 @@ public:
     }
 };
 
+static const size_t ARENA_BLOCK_ALIGN = KNOB_SIMD_WIDTH * 4;
+
+struct ArenaBlock
+{
+    void*       pMem = nullptr;
+    size_t      blockSize = 0;
+    ArenaBlock* pNext = nullptr;
+};
+static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, "Increase BLOCK_ALIGN size");
+
 template<typename MutexT = std::mutex, typename T = DefaultAllocator>
 class TArena
 {
@@ -67,12 +77,12 @@ public:
         if (m_pCurBlock)
         {
             ArenaBlock* pCurBlock = m_pCurBlock;
-            pCurBlock->offset = AlignUp(pCurBlock->offset, align);
+            m_offset = AlignUp(m_offset, align);
 
-            if ((pCurBlock->offset + size) <= pCurBlock->blockSize)
+            if ((m_offset + size) <= pCurBlock->blockSize)
             {
-                void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset);
-                pCurBlock->offset += size;
+                void* pMem = PtrAdd(pCurBlock->pMem, m_offset);
+                m_offset += size;
                 m_size += size;
                 return pMem;
             }
@@ -85,21 +95,21 @@ public:
         size_t blockSize = std::max<size_t>(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
 
         // Add in one BLOCK_ALIGN unit to store ArenaBlock in.
-        blockSize = AlignUp(blockSize + BLOCK_ALIGN, BLOCK_ALIGN);
+        blockSize = AlignUp(blockSize + ARENA_BLOCK_ALIGN, ARENA_BLOCK_ALIGN);
 
-        void *pMem = m_allocator.AllocateAligned(blockSize, BLOCK_ALIGN);    // Arena blocks are always simd byte aligned.
+        void *pMem = m_allocator.AllocateAligned(blockSize, ARENA_BLOCK_ALIGN);    // Arena blocks are always simd byte aligned.
         SWR_ASSERT(pMem != nullptr);
 
         ArenaBlock* pNewBlock = new (pMem) ArenaBlock();
 
         if (pNewBlock != nullptr)
         {
+            m_offset = 0;
             pNewBlock->pNext = m_pCurBlock;
 
             m_pCurBlock = pNewBlock;
-            m_pCurBlock->pMem = PtrAdd(pMem, BLOCK_ALIGN);
-            m_pCurBlock->blockSize = blockSize - BLOCK_ALIGN;
-
+            m_pCurBlock->pMem = PtrAdd(pMem, ARENA_BLOCK_ALIGN);
+            m_pCurBlock->blockSize = blockSize - ARENA_BLOCK_ALIGN;
         }
 
         return AllocAligned(size, align);
@@ -134,10 +144,10 @@ public:
 
     void Reset(bool removeAll = false)
     {
+        m_offset = 0;
+
         if (m_pCurBlock)
         {
-            m_pCurBlock->offset = 0;
-
             ArenaBlock *pUsedBlocks = m_pCurBlock->pNext;
             m_pCurBlock->pNext = nullptr;
             while (pUsedBlocks)
@@ -162,28 +172,20 @@ public:
 
 private:
 
-    static const size_t BLOCK_ALIGN = KNOB_SIMD_WIDTH * 4;
+    ArenaBlock*         m_pCurBlock = nullptr;
+    size_t              m_offset    = 0;
+    size_t              m_size      = 0;
+
+    /// @note Mutex is only used by sync allocation functions.
+    MutexT              m_mutex;
 
     DefaultAllocator    m_defAllocator;
     T&                  m_allocator;
-
-    struct ArenaBlock
-    {
-        void*       pMem        = nullptr;
-        size_t      blockSize   = 0;
-        size_t      offset      = 0;
-        ArenaBlock* pNext       = nullptr;
-    };
-    static_assert(sizeof(ArenaBlock) <= BLOCK_ALIGN, "Increase BLOCK_ALIGN size");
-
-    ArenaBlock*     m_pCurBlock = nullptr;
-    size_t          m_size      = 0;
-
-    /// @note Mutex is only used by sync allocation functions.
-    MutexT          m_mutex;
 };
 
-typedef TArena<> Arena;
+template<typename T>
+using Arena     = TArena<std::mutex, T>;
+using StdArena  = Arena<DefaultAllocator>;
 
 struct NullMutex
 {
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index ed972fa5478..6240b2e08d3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -360,6 +360,139 @@ struct BACKEND_FUNCS
     PFN_OUTPUT_MERGER pfnOutputMerger;
 };
 
+// Caching Allocator for Arena
+/// @brief Allocator that keeps freed arena blocks on a size-sorted list so
+///        later requests can reuse them instead of allocating again.
+/// @note  Free() only caches; memory is released by the destructor.
+struct CachingAllocator : DefaultAllocator
+{
+    /// @brief Hand out a cached block that fits the request, else allocate one.
+    /// @param size  - total bytes requested, including the ARENA_BLOCK_ALIGN
+    ///                header unit that TArena places in front of the payload.
+    /// @param align - required alignment of the returned pointer.
+    /// @return base address of a reused or newly allocated block.
+    void* AllocateAligned(size_t size, size_t align)
+    {
+        SWR_ASSERT(size >= sizeof(ArenaBlock));
+
+        // ArenaBlock::blockSize excludes the header unit, so cached blocks
+        // are compared against the payload part of the request.
+        const size_t payloadSize = size - ARENA_BLOCK_ALIGN;
+
+        {
+            // search cached blocks
+            std::lock_guard<std::mutex> l(m_mutex);
+            ArenaBlock* pPrevBlock = &m_cachedBlocks;
+            ArenaBlock* pBlock = m_cachedBlocks.pNext;
+            ArenaBlock* pPotentialBlock = nullptr;
+            ArenaBlock* pPotentialPrev = nullptr;
+
+            while (pBlock)
+            {
+                if (pBlock->blockSize >= payloadSize)
+                {
+                    if (pBlock == AlignUp(pBlock, align))
+                    {
+                        if (pBlock->blockSize == payloadSize)
+                        {
+                            // Won't find a better match
+                            break;
+                        }
+
+                        // We could use this as it is larger than we wanted, but
+                        // continue to search for a better match
+                        pPotentialBlock = pBlock;
+                        pPotentialPrev = pPrevBlock;
+                    }
+                }
+                else
+                {
+                    // Blocks are sorted by size (biggest first)
+                    // So, if we get here, there are no blocks
+                    // large enough, fall through to allocation.
+                    pBlock = nullptr;
+                    break;
+                }
+
+                pPrevBlock = pBlock;
+                pBlock = pBlock->pNext;
+            }
+
+            if (!pBlock)
+            {
+                // Couldn't find an exact match, use next biggest size
+                pBlock = pPotentialBlock;
+                pPrevBlock = pPotentialPrev;
+            }
+
+            if (pBlock)
+            {
+                // Unlink the chosen block from the cache before returning it.
+                SWR_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock);
+                pPrevBlock->pNext = pBlock->pNext;
+                pBlock->pNext = nullptr;
+
+                return pBlock;
+            }
+        }
+
+        return this->DefaultAllocator::AllocateAligned(size, align);
+    }
+
+    /// @brief Put a block back into the cache instead of releasing its memory.
+    /// @param pMem - base address of a block that starts with a filled-in
+    ///               ArenaBlock header; nullptr is ignored.
+    void  Free(void* pMem)
+    {
+        if (pMem)
+        {
+            ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem);
+            // blockSize is unsigned, so only pMem is worth validating here.
+            SWR_ASSERT(pNewBlock->pMem != nullptr);
+
+            std::lock_guard<std::mutex> l(m_mutex);
+            ArenaBlock* pPrevBlock = &m_cachedBlocks;
+            ArenaBlock* pBlock = m_cachedBlocks.pNext;
+
+            // Walk past every cached block that is larger than the new one so
+            // the list stays sorted biggest-first.
+            while (pBlock)
+            {
+                if (pNewBlock->blockSize >= pBlock->blockSize)
+                {
+                    // Insert here
+                    break;
+                }
+                pPrevBlock = pBlock;
+                pBlock = pBlock->pNext;
+            }
+
+            // Insert into list
+            SWR_ASSERT(pPrevBlock);
+            pPrevBlock->pNext = pNewBlock;
+            pNewBlock->pNext = pBlock;
+        }
+    }
+
+    /// @brief Release every block still held in the cache.
+    ~CachingAllocator()
+    {
+        // Free all cached blocks
+        ArenaBlock* pBlock = m_cachedBlocks.pNext;
+        while (pBlock)
+        {
+            ArenaBlock* pNext = pBlock->pNext;
+            this->DefaultAllocator::Free(pBlock);
+            pBlock = pNext;
+        }
+    }
+
+    ArenaBlock m_cachedBlocks;  ///< List head; only its pNext is used.
+    std::mutex m_mutex;         ///< Guards the list in AllocateAligned/Free.
+};
+
+using CachingArena = Arena<CachingAllocator>;
+
 // Draw State
 struct DRAW_STATE
 {
@@ -371,7 +504,7 @@ struct DRAW_STATE
     BACKEND_FUNCS backendFuncs;
     PFN_PROCESS_PRIMS pfnProcessPrims;
 
-    Arena*    pArena;     // This should only be used by API thread.
+    CachingArena* pArena;     // This should only be used by API thread.
 };
 
 // Draw Context
@@ -398,7 +531,7 @@ struct DRAW_CONTEXT
     DispatchQueue* pDispatch;               // Queue for thread groups. (isCompute)
 
     DRAW_STATE* pState;
-    Arena*    pArena;
+    CachingArena* pArena;
 
     uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS];  // Scratch space used for spill fills.
 };
@@ -476,6 +609,8 @@ struct SWR_CONTEXT
 
     // Scratch space for workers.
     uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
+
+    CachingAllocator cachingArenaAllocator;
 };
 
 void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
 void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
index 7e556012e6b..ccf0b70544f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
@@ -49,7 +49,8 @@ struct QUEUE
     static const uint32_t mBlockSizeShift = 6;
     static const uint32_t mBlockSize = 1 << mBlockSizeShift;
 
-    void clear(Arena& arena)
+    template <typename ArenaT>
+    void clear(ArenaT& arena)
     {
         mHead = 0;
         mTail = 0;
@@ -102,7 +103,8 @@ struct QUEUE
         mNumEntries --;
     }
 
-    bool enqueue_try_nosync(Arena& arena, const T* entry)
+    template <typename ArenaT>
+    bool enqueue_try_nosync(ArenaT& arena, const T* entry)
     {
         memcpy(&mCurBlock[mTail], entry, sizeof(T));
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index e780ffbf175..36721e00beb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -881,7 +881,7 @@ static void GeometryShaderStage(
 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
     void **ppStreamCutBuffer)
 {
-    Arena* pArena = pDC->pArena;
+    auto pArena = pDC->pArena;
     SWR_ASSERT(pArena != nullptr);
     SWR_ASSERT(state.gsState.gsEnable);
     // allocate arena space to hold GS output verts
@@ -1813,7 +1813,7 @@ void BinTriangles(
             work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
         }
 
-        Arena* pArena = pDC->pArena;
+        auto pArena = pDC->pArena;
         SWR_ASSERT(pArena != nullptr);
 
         // store active attribs
@@ -1985,7 +1985,7 @@ void BinPoints(
 
             work.pfnWork = RasterizeSimplePoint;
 
-            Arena* pArena = pDC->pArena;
+            auto pArena = pDC->pArena;
             SWR_ASSERT(pArena != nullptr);
 
             // store attributes
@@ -2119,7 +2119,7 @@ void BinPoints(
 
             work.pfnWork = RasterizeTriPoint;
 
-            Arena* pArena = pDC->pArena;
+            auto pArena = pDC->pArena;
             SWR_ASSERT(pArena != nullptr);
 
             // store active attribs
@@ -2336,7 +2336,7 @@ void BinLines(
 
         work.pfnWork = RasterizeLine;
 
-        Arena* pArena = pDC->pArena;
+        auto pArena = pDC->pArena;
         SWR_ASSERT(pArena != nullptr);
 
         // store active attribs
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index ff25e82f0fe..ce8646fb28d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -290,7 +290,7 @@ INLINE void CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
         _ReadWriteBarrier();
 
         // Cleanup memory allocations
-        pDC->pArena->Reset();
+        pDC->pArena->Reset(true);
         pDC->pTileMgr->initialize();
 
         pContext->dcRing.Dequeue();  // Remove from tail
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index f3c24dacb48..89c779e04d9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -56,7 +56,7 @@ void DispatchQueue::operator delete(void *p)
     _aligned_free(p);
 }
 
-MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena)
+MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
 {
 }
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index f3e1373b00f..cf9d2fea32a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -59,7 +59,8 @@ struct MacroTileQueue
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Clear fifo and unlock it.
-    void clear(Arena& arena)
+    template <typename ArenaT>
+    void clear(ArenaT& arena)
     {
         mFifo.clear(arena);
     }
@@ -71,7 +72,8 @@ struct MacroTileQueue
         return mFifo.peek();
     }
 
-    bool enqueue_try_nosync(Arena& arena, const BE_WORK* entry)
+    template <typename ArenaT>
+    bool enqueue_try_nosync(ArenaT& arena, const BE_WORK* entry)
     {
         return mFifo.enqueue_try_nosync(arena, entry);
     }
@@ -104,7 +106,7 @@ private:
 class MacroTileMgr
 {
 public:
-    MacroTileMgr(Arena& arena);
+    MacroTileMgr(CachingArena& arena);
     ~MacroTileMgr()
     {
         for (auto &tile : mTiles)
@@ -142,7 +144,7 @@ public:
     void operator delete (void *p);
 
 private:
-    Arena& mArena;
+    CachingArena& mArena;
     std::unordered_map<uint32_t, MacroTileQueue> mTiles;
 
     // Any tile that has work queued to it is a dirty tile.

From 68314b676968e2cf0f8e94f573fa28e766e48349 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Mon, 21 Mar 2016 14:08:38 -0600
Subject: [PATCH 045/238] swr: [rasterizer jitter] support llvm-svn

---
 .../drivers/swr/rasterizer/jitter/JitManager.h     |  7 +++++--
 .../drivers/swr/rasterizer/jitter/blend_jit.cpp    |  8 +++++++-
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 10 +++++-----
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  2 +-
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp    | 14 ++++++++++++--
 .../swr/rasterizer/jitter/streamout_jit.cpp        |  8 +++++++-
 6 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index c1bccab95ae..4ffb0fbee01 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -53,6 +53,10 @@
 #include "llvm/Config/config.h"
 #endif
 
+#ifndef HAVE_LLVM // fallback when not predefined: major in bits 8 and up, minor in the low byte (3.6 -> 0x306)
+#define HAVE_LLVM ((LLVM_VERSION_MAJOR << 8) | LLVM_VERSION_MINOR)
+#endif
+
 #include "llvm/IR/Verifier.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
 #include "llvm/Support/FileSystem.h"
@@ -60,11 +64,10 @@
 
 #include "llvm/Analysis/Passes.h"
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
 #include "llvm/PassManager.h"
 #else
 #include "llvm/IR/LegacyPassManager.h"
-using namespace llvm::legacy;
 #endif
 
 #include "llvm/CodeGen/Passes.h"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 954524afd3a..2fed2bf4831 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -717,7 +717,13 @@ struct BlendJit : public Builder
 
         JitManager::DumpToFile(blendFunc, "");
 
-        FunctionPassManager passes(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+        FunctionPassManager
+#else
+        llvm::legacy::FunctionPassManager
+#endif
+            passes(JM()->mpCurrentModule);
+
         passes.add(createBreakCriticalEdgesPass());
         passes.add(createCFGSimplificationPass());
         passes.add(createEarlyCSEPass());
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 876fe83511e..c6cf793139c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -234,7 +234,7 @@ Value *Builder::VUNDEF(Type* t)
     return UndefValue::get(VectorType::get(t, mVWidth));
 }
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
 {
     return VINSERT(vec, val, C((int64_t)index));
@@ -521,7 +521,7 @@ CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list
 
     // get a pointer to the first character in the constant string array
     std::vector<Constant*> geplist{C(0),C(0)};
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
     Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
 #else
     Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
@@ -1409,7 +1409,7 @@ Value *Builder::FCLAMP(Value* src, float low, float high)
 Value* Builder::STACKSAVE()
 {
     Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
     return CALL(pfnStackSave);
 #else
     return CALLA(pfnStackSave);
@@ -1467,7 +1467,7 @@ void __cdecl CallPrint(const char* fmt, ...)
 
 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
 {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
     Function *func =
         Intrinsic::getDeclaration(JM()->mpCurrentModule,
                                   Intrinsic::x86_avx_vextractf128_si_256);
@@ -1484,7 +1484,7 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
 
 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
 {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
     Function *func =
         Intrinsic::getDeclaration(JM()->mpCurrentModule,
                                   Intrinsic::x86_avx_vinsertf128_si_256);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 4c9c431179f..f43ef69d1ed 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -59,7 +59,7 @@ Value *VUNDEF_F();
 Value *VUNDEF_I();
 Value *VUNDEF(Type* ty, uint32_t size);
 Value *VUNDEF_IPTR();
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
 Value *VINSERT(Value *vec, Value *val, uint64_t index);
 #endif
 Value *VBROADCAST(Value *src);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 2ca01309d05..2c2c56bd151 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -174,7 +174,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
 
     verifyFunction(*fetch);
 
-    FunctionPassManager setupPasses(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+        FunctionPassManager
+#else
+        llvm::legacy::FunctionPassManager
+#endif
+            setupPasses(JM()->mpCurrentModule);
 
     ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
     setupPasses.add(createBreakCriticalEdgesPass());
@@ -186,7 +191,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
 
     JitManager::DumpToFile(fetch, "se");
 
-    FunctionPassManager optPasses(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+        FunctionPassManager
+#else
+        llvm::legacy::FunctionPassManager
+#endif
+            optPasses(JM()->mpCurrentModule);
 
     ///@todo Haven't touched these either. Need to remove some of these and add others.
     optPasses.add(createCFGSimplificationPass());
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index 6c5f22bc47c..36baa8d794b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -293,7 +293,13 @@ struct StreamOutJit : public Builder
 
         JitManager::DumpToFile(soFunc, "SoFunc");
 
-        FunctionPassManager passes(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+        FunctionPassManager
+#else
+        llvm::legacy::FunctionPassManager
+#endif
+            passes(JM()->mpCurrentModule);
+
         passes.add(createBreakCriticalEdgesPass());
         passes.add(createCFGSimplificationPass());
         passes.add(createEarlyCSEPass());

From ee6be9e92dbdc3dbeb26e0f873c1784d563bf641 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Mon, 21 Mar 2016 17:30:03 -0600
Subject: [PATCH 046/238] swr: [rasterizer core] CachedArena optimizations

Reduce list traversal during Alloc and Free.

Add ability to have multiple lists based on alloc size (not used for now)
---
 .../drivers/swr/rasterizer/common/os.h        |   2 +
 .../drivers/swr/rasterizer/core/arena.h       | 256 +++++++++++-------
 .../drivers/swr/rasterizer/core/context.h     | 113 --------
 3 files changed, 161 insertions(+), 210 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index d4bec908bb4..5794f3f625a 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -54,9 +54,11 @@
 
 #if defined(_WIN32)
 #if defined(_WIN64)
+#define BitScanReverseSizeT BitScanReverse64
 #define BitScanForwardSizeT BitScanForward64
 #define _mm_popcount_sizeT _mm_popcnt_u64
 #else
+#define BitScanReverseSizeT BitScanReverse
 #define BitScanForwardSizeT BitScanForward
 #define _mm_popcount_sizeT _mm_popcnt_u32
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 71fb258f4d4..a2db7b38208 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -51,7 +51,10 @@ public:
     }
 };
 
+static const size_t ARENA_BLOCK_SHIFT = 5;
 static const size_t ARENA_BLOCK_ALIGN = KNOB_SIMD_WIDTH * 4;
+static_assert((1U << ARENA_BLOCK_SHIFT) == ARENA_BLOCK_ALIGN,
+              "Invalid value for ARENA_BLOCK_ALIGN/SHIFT");
 
 struct ArenaBlock
 {
@@ -59,9 +62,158 @@ struct ArenaBlock
     size_t      blockSize = 0;
     ArenaBlock* pNext = nullptr;
 };
-static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, "Increase BLOCK_ALIGN size");
+static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN,
+              "Increase BLOCK_ALIGN size");
 
-template<typename MutexT = std::mutex, typename T = DefaultAllocator>
+// Caching Allocator for Arena
+template<uint32_t NumBucketsT = 1, uint32_t StartBucketBitT = 20>
+struct CachingAllocatorT : DefaultAllocator
+{
+    static uint32_t GetBucketId(size_t blockSize)
+    {
+        uint32_t bucketId = 0;
+
+#if defined(BitScanReverseSizeT)
+        BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT);
+        bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1);
+#endif
+
+        return bucketId;
+    }
+
+    void* AllocateAligned(size_t size, size_t align)
+    {
+        SWR_ASSERT(size >= sizeof(ArenaBlock));
+        SWR_ASSERT(size <= uint32_t(-1));
+
+        size_t blockSize = size - ARENA_BLOCK_ALIGN;
+
+        {
+            // search cached blocks
+            std::lock_guard<std::mutex> l(m_mutex);
+            ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(blockSize)];
+            ArenaBlock* pBlock = pPrevBlock->pNext;
+            ArenaBlock* pPotentialBlock = nullptr;
+            ArenaBlock* pPotentialPrev = nullptr;
+
+            while (pBlock)
+            {
+                if (pBlock->blockSize >= blockSize)
+                {
+                    if (pBlock == AlignUp(pBlock, align))
+                    {
+                        if (pBlock->blockSize == blockSize)
+                        {
+                            // Won't find a better match
+                            break;
+                        }
+
+                        // We could use this as it is larger than we wanted, but
+                        // continue to search for a better match
+                        pPotentialBlock = pBlock;
+                        pPotentialPrev = pPrevBlock;
+                    }
+                }
+                else
+                {
+                    // Blocks are sorted by size (biggest first)
+                    // So, if we get here, there are no blocks 
+                    // large enough, fall through to allocation.
+                    pBlock = nullptr;
+                    break;
+                }
+
+                pPrevBlock = pBlock;
+                pBlock = pBlock->pNext;
+            }
+
+            if (!pBlock)
+            {
+                // Couldn't find an exact match, use next biggest size
+                pBlock = pPotentialBlock;
+                pPrevBlock = pPotentialPrev;
+            }
+
+            if (pBlock)
+            {
+                SWR_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock);
+                pPrevBlock->pNext = pBlock->pNext;
+                pBlock->pNext = nullptr;
+
+                return pBlock;
+            }
+
+            m_totalAllocated += size;
+
+#if 0
+            {
+                static uint32_t count = 0;
+                char buf[128];
+                sprintf_s(buf, "Arena Alloc %d 0x%llx bytes - 0x%llx total\n", ++count, uint64_t(size), uint64_t(m_totalAllocated));
+                OutputDebugStringA(buf);
+            }
+#endif
+        }
+
+        return this->DefaultAllocator::AllocateAligned(size, align);
+    }
+
+    void  Free(void* pMem)
+    {
+        if (pMem)
+        {
+            ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem);
+            SWR_ASSERT(pNewBlock->blockSize >= 0 && pNewBlock->pMem != nullptr);
+
+            std::unique_lock<std::mutex> l(m_mutex);
+            ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)];
+            ArenaBlock* pBlock = pPrevBlock->pNext;
+
+            while (pBlock)
+            {
+                if (pNewBlock->blockSize >= pBlock->blockSize)
+                {
+                    // Insert here
+                    break;
+                }
+                pPrevBlock = pBlock;
+                pBlock = pBlock->pNext;
+            }
+
+            // Insert into list
+            SWR_ASSERT(pPrevBlock);
+            pPrevBlock->pNext = pNewBlock;
+            pNewBlock->pNext = pBlock;
+        }
+    }
+
+    ~CachingAllocatorT()
+    {
+        // Free all cached blocks
+        for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
+        {
+            ArenaBlock* pBlock = m_cachedBlocks[i].pNext;
+            while (pBlock)
+            {
+                ArenaBlock* pNext = pBlock->pNext;
+                this->DefaultAllocator::Free(pBlock);
+                pBlock = pNext;
+            }
+        }
+    }
+
+    // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
+    static const uint32_t   CACHE_NUM_BUCKETS       = NumBucketsT;
+    static const uint32_t   CACHE_START_BUCKET_BIT  = StartBucketBitT;
+
+    ArenaBlock              m_cachedBlocks[CACHE_NUM_BUCKETS];
+    std::mutex              m_mutex;
+
+    size_t                  m_totalAllocated = 0;
+};
+typedef CachingAllocatorT<> CachingAllocator;
+
+template<typename T = DefaultAllocator>
 class TArena
 {
 public:
@@ -91,8 +243,8 @@ public:
             // a new block
         }
 
-        static const size_t ArenaBlockSize = 1024 * 1024;
-        size_t blockSize = std::max<size_t>(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
+        static const size_t ArenaBlockSize = 1024 * 1024 - ARENA_BLOCK_ALIGN;
+        size_t blockSize = std::max(size, ArenaBlockSize);
 
         // Add in one BLOCK_ALIGN unit to store ArenaBlock in.
         blockSize = AlignUp(blockSize + ARENA_BLOCK_ALIGN, ARENA_BLOCK_ALIGN);
@@ -177,101 +329,11 @@ private:
     size_t              m_size      = 0;
 
     /// @note Mutex is only used by sync allocation functions.
-    MutexT              m_mutex;
+    std::mutex          m_mutex;
 
     DefaultAllocator    m_defAllocator;
     T&                  m_allocator;
 };
 
-template<typename T>
-using Arena     = TArena<std::mutex, T>;
-using StdArena  = Arena<DefaultAllocator>;
-
-struct NullMutex
-{
-    void lock() {}
-    void unlock() {}
-};
-
-// Ref counted Arena for ArenaAllocator
-// NOT THREAD SAFE!!
-struct RefArena : TArena<NullMutex>
-{
-    uint32_t AddRef() { return ++m_refCount; }
-    uint32_t Release() { if (--m_refCount) { return m_refCount; } delete this; return 0; }
-
-    void* allocate(std::size_t n)
-    {
-        ++m_numAllocations;
-        return Alloc(n);
-    }
-
-    void deallocate(void* p) { --m_numAllocations; }
-    void clear() { SWR_ASSERT(0 == m_numAllocations); Reset(); }
-
-private:
-    uint32_t m_refCount = 0;
-    uint32_t m_numAllocations = 0;
-};
-
-#if 0 // THIS DOESN'T WORK!!!
-// Arena based replacement for std::allocator
-template <typename T>
-struct ArenaAllocator
-{
-    typedef T value_type;
-    ArenaAllocator()
-    {
-        m_pArena = new RefArena();
-        m_pArena->AddRef();
-    }
-    ~ArenaAllocator()
-    {
-        m_pArena->Release(); m_pArena = nullptr;
-    }
-    ArenaAllocator(const ArenaAllocator& copy)
-    {
-        m_pArena = const_cast<RefArena*>(copy.m_pArena); m_pArena->AddRef();
-    }
-
-
-    template <class U> ArenaAllocator(const ArenaAllocator<U>& copy)
-    {
-        m_pArena = const_cast<RefArena*>(copy.m_pArena); m_pArena->AddRef();
-    }
-    T* allocate(std::size_t n)
-    {
-#if defined(_DEBUG)
-        char buf[32];
-        sprintf_s(buf, "Alloc: %lld\n", n);
-        OutputDebugStringA(buf);
-#endif
-        void* p = m_pArena->allocate(n * sizeof(T));
-        return static_cast<T*>(p);
-    }
-    void deallocate(T* p, std::size_t n)
-    {
-#if defined(_DEBUG)
-        char buf[32];
-        sprintf_s(buf, "Dealloc: %lld\n", n);
-        OutputDebugStringA(buf);
-#endif
-        m_pArena->deallocate(p);
-    }
-    void clear() { m_pArena->clear(); }
-
-    RefArena* m_pArena = nullptr;
-};
-
-template <class T, class U>
-bool operator== (const ArenaAllocator<T>&, const ArenaAllocator<U>&)
-{
-    return true;
-}
-
-template <class T, class U>
-bool operator!= (const ArenaAllocator<T>&, const ArenaAllocator<U>&)
-{
-    return false;
-}
-#endif
+using StdArena      = TArena<DefaultAllocator>;
+using CachingArena  = TArena<CachingAllocator>;
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 6240b2e08d3..b8f15cae4a3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -360,119 +360,6 @@ struct BACKEND_FUNCS
     PFN_OUTPUT_MERGER pfnOutputMerger;
 };
 
-// Caching Allocator for Arena
-struct CachingAllocator : DefaultAllocator
-{
-    void* AllocateAligned(size_t size, size_t align)
-    {
-        SWR_ASSERT(size >= sizeof(ArenaBlock));
-
-        {
-            // search cached blocks
-            std::lock_guard<std::mutex> l(m_mutex);
-            ArenaBlock* pPrevBlock = &m_cachedBlocks;
-            ArenaBlock* pBlock = m_cachedBlocks.pNext;
-            ArenaBlock* pPotentialBlock = nullptr;
-            ArenaBlock* pPotentialPrev = nullptr;
-
-            while (pBlock)
-            {
-                if (pBlock->blockSize >= (size - ARENA_BLOCK_ALIGN))
-                {
-                    if (pBlock == AlignUp(pBlock, align))
-                    {
-                        if (pBlock->blockSize == size)
-                        {
-                            // Won't find a better match
-                            break;
-                        }
-
-                        // We could use this as it is larger than we wanted, but
-                        // continue to search for a better match
-                        pPotentialBlock = pBlock;
-                        pPotentialPrev = pPrevBlock;
-                    }
-                }
-                else
-                {
-                    // Blocks are sorted by size (biggest first)
-                    // So, if we get here, there are no blocks 
-                    // large enough, fall through to allocation.
-                    pBlock = nullptr;
-                    break;
-                }
-
-                pPrevBlock = pBlock;
-                pBlock = pBlock->pNext;
-            }
-
-            if (!pBlock)
-            {
-                // Couldn't find an exact match, use next biggest size
-                pBlock = pPotentialBlock;
-                pPrevBlock = pPotentialPrev;
-            }
-
-            if (pBlock)
-            {
-                SWR_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock);
-                pPrevBlock->pNext = pBlock->pNext;
-                pBlock->pNext = nullptr;
-
-                return pBlock;
-            }
-        }
-
-        return this->DefaultAllocator::AllocateAligned(size, align);
-    }
-
-    void  Free(void* pMem)
-    {
-        if (pMem)
-        {
-            ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem);
-            SWR_ASSERT(pNewBlock->blockSize >= 0 && pNewBlock->pMem != nullptr);
-
-            std::unique_lock<std::mutex> l(m_mutex);
-            ArenaBlock* pPrevBlock = &m_cachedBlocks;
-            ArenaBlock* pBlock = m_cachedBlocks.pNext;
-
-            while (pBlock)
-            {
-                if (pNewBlock->blockSize >= pBlock->blockSize)
-                {
-                    // Insert here
-                    break;
-                }
-                pPrevBlock = pBlock;
-                pBlock = pBlock->pNext;
-            }
-
-            // Insert into list
-            SWR_ASSERT(pPrevBlock);
-            pPrevBlock->pNext = pNewBlock;
-            pNewBlock->pNext = pBlock;
-        }
-    }
-
-    ~CachingAllocator()
-    {
-        // Free all cached blocks
-        ArenaBlock* pBlock = m_cachedBlocks.pNext;
-        while (pBlock)
-        {
-            ArenaBlock* pNext = pBlock->pNext;
-            this->DefaultAllocator::Free(pBlock);
-            pBlock = pNext;
-        }
-    }
-
-    ArenaBlock m_cachedBlocks;
-    std::mutex m_mutex;
-
-};
-
-using CachingArena = Arena<CachingAllocator>;
 
 // Draw State
 struct DRAW_STATE

From ed5b9539191ca700887566a82162c06d94f57497 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Mon, 21 Mar 2016 17:55:46 -0600
Subject: [PATCH 047/238] swr: [rasterizer core] One last pass at Arena
 optimizations

---
 .../drivers/swr/rasterizer/core/arena.h       | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index a2db7b38208..5d08cda6506 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -58,7 +58,6 @@ static_assert((1U << ARENA_BLOCK_SHIFT) == ARENA_BLOCK_ALIGN,
 
 struct ArenaBlock
 {
-    void*       pMem = nullptr;
     size_t      blockSize = 0;
     ArenaBlock* pNext = nullptr;
 };
@@ -163,7 +162,7 @@ struct CachingAllocatorT : DefaultAllocator
         if (pMem)
         {
             ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem);
-            SWR_ASSERT(pNewBlock->blockSize >= 0 && pNewBlock->pMem != nullptr);
+            SWR_ASSERT(pNewBlock->blockSize >= 0);
 
             std::unique_lock<std::mutex> l(m_mutex);
             ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)];
@@ -226,16 +225,18 @@ public:
 
     void* AllocAligned(size_t size, size_t  align)
     {
+        SWR_ASSERT(size);
+        SWR_ASSERT(align <= ARENA_BLOCK_ALIGN);
+
         if (m_pCurBlock)
         {
             ArenaBlock* pCurBlock = m_pCurBlock;
-            m_offset = AlignUp(m_offset, align);
+            size_t offset = AlignUp(m_offset, align);
 
-            if ((m_offset + size) <= pCurBlock->blockSize)
+            if ((offset + size) <= pCurBlock->blockSize)
             {
-                void* pMem = PtrAdd(pCurBlock->pMem, m_offset);
-                m_offset += size;
-                m_size += size;
+                void* pMem = PtrAdd(pCurBlock, offset + ARENA_BLOCK_ALIGN);
+                m_offset = offset + size;
                 return pMem;
             }
 
@@ -247,9 +248,9 @@ public:
         size_t blockSize = std::max(size, ArenaBlockSize);
 
         // Add in one BLOCK_ALIGN unit to store ArenaBlock in.
-        blockSize = AlignUp(blockSize + ARENA_BLOCK_ALIGN, ARENA_BLOCK_ALIGN);
+        blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN);
 
-        void *pMem = m_allocator.AllocateAligned(blockSize, ARENA_BLOCK_ALIGN);    // Arena blocks are always simd byte aligned.
+        void *pMem = m_allocator.AllocateAligned(blockSize + ARENA_BLOCK_ALIGN, ARENA_BLOCK_ALIGN);    // Arena blocks are always simd byte aligned.
         SWR_ASSERT(pMem != nullptr);
 
         ArenaBlock* pNewBlock = new (pMem) ArenaBlock();
@@ -260,8 +261,7 @@ public:
             pNewBlock->pNext = m_pCurBlock;
 
             m_pCurBlock = pNewBlock;
-            m_pCurBlock->pMem = PtrAdd(pMem, ARENA_BLOCK_ALIGN);
-            m_pCurBlock->blockSize = blockSize - ARENA_BLOCK_ALIGN;
+            m_pCurBlock->blockSize = blockSize;
         }
 
         return AllocAligned(size, align);
@@ -316,17 +316,17 @@ public:
                 m_pCurBlock = nullptr;
             }
         }
-
-        m_size = 0;
     }
 
-    size_t Size() const { return m_size; }
+    bool IsEmpty()
+    {
+        return (m_pCurBlock == nullptr) || (m_offset == 0 && m_pCurBlock->pNext == nullptr);
+    }
 
 private:
 
     ArenaBlock*         m_pCurBlock = nullptr;
     size_t              m_offset    = 0;
-    size_t              m_size      = 0;
 
     /// @note Mutex is only used by sync allocation functions.
     std::mutex          m_mutex;

From 51549912d1b1137572a0692972d1059ebb2e3384 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Tue, 22 Mar 2016 09:27:18 -0600
Subject: [PATCH 048/238] swr: [rasterizer core] Reduce Arena blocksize to
 128KB (from 1MB).

With global allocator this doesn't seem to affect performance at all.
Overall memory consumption drops by up to 85%.
---
 src/gallium/drivers/swr/rasterizer/core/arena.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 5d08cda6506..d777c20a4ee 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -212,7 +212,7 @@ struct CachingAllocatorT : DefaultAllocator
 };
 typedef CachingAllocatorT<> CachingAllocator;
 
-template<typename T = DefaultAllocator>
+template<typename T = DefaultAllocator, size_t BlockSizeT = (128 * 1024)>
 class TArena
 {
 public:
@@ -225,7 +225,11 @@ public:
 
     void* AllocAligned(size_t size, size_t  align)
     {
-        SWR_ASSERT(size);
+        if (0 == size)
+        {
+            return nullptr;
+        }
+
         SWR_ASSERT(align <= ARENA_BLOCK_ALIGN);
 
         if (m_pCurBlock)
@@ -244,7 +248,7 @@ public:
             // a new block
         }
 
-        static const size_t ArenaBlockSize = 1024 * 1024 - ARENA_BLOCK_ALIGN;
+        static const size_t ArenaBlockSize = BlockSizeT - ARENA_BLOCK_ALIGN;
         size_t blockSize = std::max(size, ArenaBlockSize);
 
         // Add in one BLOCK_ALIGN unit to store ArenaBlock in.

From 83822d7ed580e764b3e0a6cb773310af2473f062 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Tue, 22 Mar 2016 12:41:13 -0600
Subject: [PATCH 049/238] swr: [rasterizer jitter] add missing include for llvm
 jitevents

---
 src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 734c89792f0..de856c4a095 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -47,6 +47,10 @@
 #include "llvm/Analysis/CFGPrinter.h"
 #include "llvm/IRReader/IRReader.h"
 
+#if LLVM_USE_INTEL_JITEVENTS
+#include "llvm/ExecutionEngine/JITEventListener.h"
+#endif
+
 #include "core/state.h"
 #include "common/containers.hpp"
 

From 813e89c0cc0ea6a6ed4b69303073995b4c4c7666 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Tue, 22 Mar 2016 15:13:29 -0600
Subject: [PATCH 050/238] swr: [rasterizer core] Cleanup state ring arena after
 last draw that references it completes

Rather than waiting for the API thread to re-use it.
---
 src/gallium/drivers/swr/rasterizer/core/api.cpp     | 6 ++++++
 src/gallium/drivers/swr/rasterizer/core/context.h   | 2 ++
 src/gallium/drivers/swr/rasterizer/core/threads.cpp | 8 ++++++--
 3 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 6ebb3f87f7a..591342239d3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -297,6 +297,8 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
 
         // Assign unique drawId for this DC
         pCurDrawContext->drawId = pContext->dcRing.GetHead();
+
+        pCurDrawContext->cleanupState = true;
     }
     else
     {
@@ -1076,6 +1078,8 @@ void DrawInstanced(
         pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
         pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
 
+        pDC->cleanupState = (remainingVerts == numVertsForDraw);
+
         //enqueue DC
         QueueDraw(pContext);
 
@@ -1210,6 +1214,8 @@ void DrawIndexedInstance(
         pDC->FeWork.desc.draw.baseVertex = baseVertex;
         pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
 
+        pDC->cleanupState = (remainingIndices == numIndicesForDraw);
+
         //enqueue DC
         QueueDraw(pContext);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index b8f15cae4a3..39f23372a18 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -402,6 +402,8 @@ struct DRAW_CONTEXT
     CachingArena* pArena;
 
     uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS];  // Scratch space used for spill fills.
+
+    bool  cleanupState; // True if this is the last draw using an entry in the state ring.
 };
 
 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index ce8646fb28d..521a306b96e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -287,11 +287,15 @@ INLINE void CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
 
     if (result == 0)
     {
-        _ReadWriteBarrier();
-
         // Cleanup memory allocations
         pDC->pArena->Reset(true);
         pDC->pTileMgr->initialize();
+        if (pDC->cleanupState)
+        {
+            pDC->pState->pArena->Reset(true);
+        }
+
+        _ReadWriteBarrier();
 
         pContext->dcRing.Dequeue();  // Remove from tail
     }

From 0767e820fd96e8bac2943fa8942bea3ff81b8bd9 Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Tue, 22 Mar 2016 17:28:06 -0600
Subject: [PATCH 051/238] swr: [rasterizer core] Fix Compute workitem
 retirement

---
 .../drivers/swr/rasterizer/core/api.cpp       | 37 +++++++------------
 .../drivers/swr/rasterizer/core/arena.h       |  7 +---
 .../drivers/swr/rasterizer/core/threads.cpp   |  7 ++--
 .../drivers/swr/rasterizer/core/threads.h     |  2 +
 4 files changed, 22 insertions(+), 31 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 591342239d3..7ca182242e5 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -160,20 +160,12 @@ void WakeAllThreads(SWR_CONTEXT *pContext)
 template<bool IsDraw>
 void QueueWork(SWR_CONTEXT *pContext)
 {
-    if (IsDraw)
-    {
-        // Each worker thread looks at a DC for both FE and BE work at different times and so we
-        // multiply threadDone by 2.  When the threadDone counter has reached 0 then all workers
-        // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
-        // then moved on if all work is done.)
-        pContext->pCurDrawContext->threadsDone =
-            pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
-    }
-    else
-    {
-        pContext->pCurDrawContext->threadsDone =
-            pContext->NumWorkerThreads ? pContext->NumWorkerThreads : 1;
-    }
+    // Each worker thread looks at a DC for both FE and BE work at different times and so we
+    // multiply threadDone by 2.  When the threadDone counter has reached 0 then all workers
+    // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
+    // then moved on if all work is done.)
+    pContext->pCurDrawContext->threadsDone =
+        pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
 
     _ReadWriteBarrier();
     {
@@ -201,10 +193,7 @@ void QueueWork(SWR_CONTEXT *pContext)
         }
 
         // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
-        if (!pContext->dcRing.IsEmpty())
-        {
-            pContext->dcRing.Dequeue();
-        }
+        while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
 
         // restore csr
         _mm_setcsr(mxcsr);
@@ -252,8 +241,6 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
         pCurDrawContext->pState = &pContext->dsRing[dsIndex];
 
-        auto& stateArena = *(pCurDrawContext->pState->pArena);
-
         // Copy previous state to current state.
         if (pContext->pPrevDrawContext)
         {
@@ -266,7 +253,9 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
             {
                 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
 
-                stateArena.Reset(true);    // Reset memory.
+                // Should have been cleaned up previously
+                SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
+
                 pCurDrawContext->pState->pPrivateState = nullptr;
 
                 pContext->curStateId++;  // Progress state ring index forward.
@@ -276,16 +265,18 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
                 // If its a split draw then just copy the state pointer over
                 // since its the same draw.
                 pCurDrawContext->pState = pPrevDrawContext->pState;
+                SWR_ASSERT(pPrevDrawContext->cleanupState == false);
             }
         }
         else
         {
-            stateArena.Reset();    // Reset memory.
+            SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
             pContext->curStateId++;  // Progress state ring index forward.
         }
 
+        SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
+
         pCurDrawContext->dependency = 0;
-        pCurDrawContext->pArena->Reset();
         pCurDrawContext->pContext = pContext;
         pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index d777c20a4ee..67d81a44347 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -51,10 +51,7 @@ public:
     }
 };
 
-static const size_t ARENA_BLOCK_SHIFT = 5;
-static const size_t ARENA_BLOCK_ALIGN = KNOB_SIMD_WIDTH * 4;
-static_assert((1U << ARENA_BLOCK_SHIFT) == ARENA_BLOCK_ALIGN,
-              "Invalid value for ARENA_BLOCK_ALIGN/SHIFT");
+static const size_t ARENA_BLOCK_ALIGN = 64;
 
 struct ArenaBlock
 {
@@ -65,7 +62,7 @@ static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN,
               "Increase BLOCK_ALIGN size");
 
 // Caching Allocator for Arena
-template<uint32_t NumBucketsT = 1, uint32_t StartBucketBitT = 20>
+template<uint32_t NumBucketsT = 4, uint32_t StartBucketBitT = 16>
 struct CachingAllocatorT : DefaultAllocator
 {
     static uint32_t GetBucketId(size_t blockSize)
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 521a306b96e..845e28ea497 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -279,11 +279,10 @@ bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastReti
     return (pDC->dependency > lastRetiredDraw);
 }
 
-
-
-INLINE void CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
+INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
 {
     int64_t result = InterlockedDecrement64(&pDC->threadsDone);
+    SWR_ASSERT(result >= 0);
 
     if (result == 0)
     {
@@ -299,6 +298,8 @@ INLINE void CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
 
         pContext->dcRing.Dequeue();  // Remove from tail
     }
+
+    return result;
 }
 
 INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index 6b37e3ac179..6cc8c96a00f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -34,6 +34,7 @@
 typedef std::thread* THREAD_PTR;
 
 struct SWR_CONTEXT;
+struct DRAW_CONTEXT;
 
 struct THREAD_DATA
 {
@@ -63,3 +64,4 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode);
 void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles);
 void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
+int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
\ No newline at end of file

From 090be2e434d6023428faa9842d38f9d5c3cef67a Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Wed, 23 Mar 2016 18:12:11 -0600
Subject: [PATCH 052/238] swr: [rasterizer jitter] Fix logic bug for
 alpha-to-coverage.

---
 .../drivers/swr/rasterizer/jitter/blend_jit.cpp     | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 2fed2bf4831..a64f86006f4 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -576,9 +576,12 @@ struct BlendJit : public Builder
             src1[i] = LOAD(pSrc1, { i });
         }
         Value* currentMask = VIMMED1(-1);
-        if(state.desc.alphaToCoverageEnable)
+        if (state.desc.alphaToCoverageEnable)
         {
-            currentMask = FP_TO_SI(FMUL(src[3], VBROADCAST(C((float)state.desc.numSamples))), mSimdInt32Ty);
+            Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
+            uint32_t bits = (1 << state.desc.numSamples) - 1;
+            currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
+            currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty);
         }
 
         // alpha test
@@ -702,6 +705,12 @@ struct BlendJit : public Builder
             currentMask = AND(sampleMask, currentMask);
         }
 
+        if (state.desc.alphaToCoverageEnable)
+        {
+            Value* sampleMasked = SHL(C(1), sampleNum);
+            currentMask = AND(currentMask, VBROADCAST(sampleMasked));
+        }
+
         if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
            state.desc.oMaskEnable)
         {

From 93c1a2dedfa8b786e969a9ae44765bf6841218ef Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 24 Mar 2016 00:01:23 -0600
Subject: [PATCH 053/238] swr: [rasterizer core] NUMA optimizations...

- Affinitize hot-tile memory to specific NUMA nodes.
- Only do BE work for macrotiles associated with the NUMA node
---
 .../drivers/swr/rasterizer/core/api.cpp       |   2 +-
 .../drivers/swr/rasterizer/core/threads.cpp   | 126 ++++++++++--------
 .../drivers/swr/rasterizer/core/threads.h     |   5 +-
 .../drivers/swr/rasterizer/core/tilemgr.cpp   |   8 +-
 .../drivers/swr/rasterizer/core/tilemgr.h     |  31 ++++-
 5 files changed, 106 insertions(+), 66 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 7ca182242e5..f0f7956b590 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -184,7 +184,7 @@ void QueueWork(SWR_CONTEXT *pContext)
             static TileSet lockedTiles;
             uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
             WorkOnFifoFE(pContext, 0, curDraw[0], 0);
-            WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+            WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
         }
         else
         {
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 845e28ea497..07bc94a1a54 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -349,7 +349,9 @@ void WorkOnFifoBE(
     SWR_CONTEXT *pContext,
     uint32_t workerId,
     uint64_t &curDrawBE,
-    TileSet& lockedTiles)
+    TileSet& lockedTiles,
+    uint32_t numaNode,
+    uint32_t numaMask)
 {
     // Find the first incomplete draw that has pending work. If no such draw is found then
     // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
@@ -390,68 +392,78 @@ void WorkOnFifoBE(
 
         for (uint32_t tileID : macroTiles)
         {
+            // Only work on tiles for this numa node
+            uint32_t x, y;
+            pDC->pTileMgr->getTileIndices(tileID, x, y);
+            if (((x ^ y) & numaMask) != numaNode)
+            {
+                continue;
+            }
+
             MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);
             
-            // can only work on this draw if it's not in use by other threads
-            if (lockedTiles.find(tileID) == lockedTiles.end())
+            if (!tile.getNumQueued())
             {
-                if (tile.getNumQueued())
+                continue;
+            }
+
+            // can only work on this draw if it's not in use by other threads
+            if (lockedTiles.find(tileID) != lockedTiles.end())
+            {
+                continue;
+            }
+
+            if (tile.tryLock())
+            {
+                BE_WORK *pWork;
+
+                RDTSC_START(WorkerFoundWork);
+
+                uint32_t numWorkItems = tile.getNumQueued();
+                SWR_ASSERT(numWorkItems);
+
+                pWork = tile.peek();
+                SWR_ASSERT(pWork);
+                if (pWork->type == DRAW)
                 {
-                    if (tile.tryLock())
-                    {
-                        BE_WORK *pWork;
-
-                        RDTSC_START(WorkerFoundWork);
-
-                        uint32_t numWorkItems = tile.getNumQueued();
-
-                        if (numWorkItems != 0)
-                        {
-                            pWork = tile.peek();
-                            SWR_ASSERT(pWork);
-                            if (pWork->type == DRAW)
-                            {
-                                pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
-                            }
-                        }
-
-                        while ((pWork = tile.peek()) != nullptr)
-                        {
-                            pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
-                            tile.dequeue();
-                        }
-                        RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
-
-                        _ReadWriteBarrier();
-
-                        pDC->pTileMgr->markTileComplete(tileID);
-
-                        // Optimization: If the draw is complete and we're the last one to have worked on it then
-                        // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
-                        if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
-                        {
-                            // We can increment the current BE and safely move to next draw since we know this draw is complete.
-                            curDrawBE++;
-                            CompleteDrawContext(pContext, pDC);
-
-                            lastRetiredDraw++;
-
-                            lockedTiles.clear();
-                            break;
-                        }
-                    }
-                    else
-                    {
-                        // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
-                        lockedTiles.insert(tileID);
-                    }
+                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
                 }
+
+                while ((pWork = tile.peek()) != nullptr)
+                {
+                    pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
+                    tile.dequeue();
+                }
+                RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
+
+                _ReadWriteBarrier();
+
+                pDC->pTileMgr->markTileComplete(tileID);
+
+                // Optimization: If the draw is complete and we're the last one to have worked on it then
+                // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
+                if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
+                {
+                    // We can increment the current BE and safely move to next draw since we know this draw is complete.
+                    curDrawBE++;
+                    CompleteDrawContext(pContext, pDC);
+
+                    lastRetiredDraw++;
+
+                    lockedTiles.clear();
+                    break;
+                }
+            }
+            else
+            {
+                // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
+                lockedTiles.insert(tileID);
             }
         }
     }
 }
 
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode)
 {
     // Try to grab the next DC from the ring
     uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
@@ -547,7 +559,8 @@ DWORD workerThreadMain(LPVOID pData)
 
     RDTSC_INIT(threadId);
 
-    int numaNode = (int)pThreadData->numaId;
+    uint32_t numaNode = pThreadData->numaId;
+    uint32_t numaMask = pContext->threadPool.numaMask;
 
     // flush denormals to 0
     _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
@@ -619,7 +632,7 @@ DWORD workerThreadMain(LPVOID pData)
         }
 
         RDTSC_START(WorkerWorkOnFifoBE);
-        WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles);
+        WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
         RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
 
         WorkOnCompute(pContext, workerId, curDrawBE);
@@ -740,6 +753,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 
     pPool->inThreadShutdown = false;
     pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
+    pPool->numaMask = 0;
 
     if (KNOB_MAX_WORKER_THREADS)
     {
@@ -760,6 +774,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     }
     else
     {
+        pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
+
         uint32_t workerId = 0;
         for (uint32_t n = 0; n < numNodes; ++n)
         {
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index 6cc8c96a00f..821d7dcb16e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -51,6 +51,7 @@ struct THREAD_POOL
 {
     THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
     uint32_t numThreads;
+    uint32_t numaMask;
     volatile bool inThreadShutdown;
     THREAD_DATA *pThreadData;
 };
@@ -61,7 +62,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 
 // Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode);
-void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode);
+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
 void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
 int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
\ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 89c779e04d9..794577270cf 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -119,7 +119,8 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
         if (create)
         {
             uint32_t size = numSamples * mHotTileSize[attachment];
-            hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+            uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
+            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
             hotTile.state = HOTTILE_INVALID;
             hotTile.numSamples = numSamples;
             hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
@@ -139,10 +140,11 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
             SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
                 (hotTile.state == HOTTILE_RESOLVED) ||
                 (hotTile.state == HOTTILE_CLEAR));
-            _aligned_free(hotTile.pBuffer);
+            FreeHotTileMem(hotTile.pBuffer);
 
             uint32_t size = numSamples * mHotTileSize[attachment];
-            hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+            uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
+            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
             hotTile.state = HOTTILE_INVALID;
             hotTile.numSamples = numSamples;
         }
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index cf9d2fea32a..aa561badc1c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -291,11 +291,7 @@ public:
             {
                 for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a)
                 {
-                    if (mHotTiles[x][y].Attachment[a].pBuffer != NULL)
-                    {
-                        _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer);
-                        mHotTiles[x][y].Attachment[a].pBuffer = NULL;
-                    }
+                    FreeHotTileMem(mHotTiles[x][y].Attachment[a].pBuffer);
                 }
             }
         }
@@ -315,5 +311,30 @@ public:
 private:
     HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
     uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS];
+
+    void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode)
+    {
+        void* p = nullptr;
+#if defined(_WIN32)
+        HANDLE hProcess = GetCurrentProcess();
+        p = VirtualAllocExNuma(hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode);
+#else
+        p = _aligned_malloc(size, align);
+#endif
+
+        return p;
+    }
+
+    void FreeHotTileMem(void* pBuffer)
+    {
+        if (pBuffer)
+        {
+#if defined(_WIN32)
+            VirtualFree(pBuffer, 0, MEM_RELEASE);
+#else
+            _aligned_free(pBuffer);
+#endif
+        }
+    }
 };
 

From 74a04840e5e7213e1b317cfee63ce1e236c622fa Mon Sep 17 00:00:00 2001
From: Tim Rowley <timothy.o.rowley@intel.com>
Date: Thu, 24 Mar 2016 11:52:51 -0600
Subject: [PATCH 054/238] swr: [rasterizer jitter] Fix MASKLOADD AVX prototype
 (float -> i32)

---
 src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index c6cf793139c..486dad8f04c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -351,7 +351,7 @@ Value *Builder::MASKLOADD(Value* src,Value* mask)
     else
     {
         Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
-        Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
+        Value* fMask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
         vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth));
     }
     return vResult;

From a8e5edaadfd5df6a473566ff55978aca27a37679 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Fri, 25 Mar 2016 14:06:39 -0600
Subject: [PATCH 055/238] st/xa: emit sampler view declarations in shaders

Fixes recent regressions with the VMware gallium driver.

Reviewed-by: Charmaine Lee <charmainel@vmware.com>
Tested-by: Charmaine Lee <charmainel@vmware.com>
---
 src/gallium/state_trackers/xa/xa_tgsi.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/gallium/state_trackers/xa/xa_tgsi.c b/src/gallium/state_trackers/xa/xa_tgsi.c
index 5d8b8079c4b..a50393d7886 100644
--- a/src/gallium/state_trackers/xa/xa_tgsi.c
+++ b/src/gallium/state_trackers/xa/xa_tgsi.c
@@ -339,6 +339,16 @@ create_yuv_shader(struct pipe_context *pipe, struct ureg_program *ureg)
     u_sampler = ureg_DECL_sampler(ureg, 1);
     v_sampler = ureg_DECL_sampler(ureg, 2);
 
+    ureg_DECL_sampler_view(ureg, 0, TGSI_TEXTURE_2D,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
+    ureg_DECL_sampler_view(ureg, 1, TGSI_TEXTURE_2D,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
+    ureg_DECL_sampler_view(ureg, 2, TGSI_TEXTURE_2D,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
+
     matrow0 = ureg_DECL_constant(ureg, 0);
     matrow1 = ureg_DECL_constant(ureg, 1);
     matrow2 = ureg_DECL_constant(ureg, 2);
@@ -475,6 +485,9 @@ create_fs(struct pipe_context *pipe, unsigned fs_traits)
     }
     if (is_composite) {
 	src_sampler = ureg_DECL_sampler(ureg, 0);
+        ureg_DECL_sampler_view(ureg, 0, TGSI_TEXTURE_2D,
+                               TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+                               TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
 	src_input = ureg_DECL_fs_input(ureg,
 				       TGSI_SEMANTIC_GENERIC, 0,
 				       TGSI_INTERPOLATE_PERSPECTIVE);
@@ -494,12 +507,18 @@ create_fs(struct pipe_context *pipe, unsigned fs_traits)
 
     if (has_mask) {
 	mask_sampler = ureg_DECL_sampler(ureg, 1);
+        ureg_DECL_sampler_view(ureg, 1, TGSI_TEXTURE_2D,
+                               TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+                               TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
 	mask_pos = ureg_DECL_fs_input(ureg,
 				      TGSI_SEMANTIC_GENERIC, 1,
 				      TGSI_INTERPOLATE_PERSPECTIVE);
     }
 #if 0				/* unused right now */
     dst_sampler = ureg_DECL_sampler(ureg, 2);
+    ureg_DECL_sampler_view(ureg, 2, TGSI_TEXTURE_2D,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
     dst_pos = ureg_DECL_fs_input(ureg,
 				 TGSI_SEMANTIC_POSITION, 2,
 				 TGSI_INTERPOLATE_PERSPECTIVE);

From 8683d54d2be82519c31e087e17dd936d13fa9d07 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Thu, 24 Mar 2016 12:11:01 +1100
Subject: [PATCH 056/238] glsl: reduce buffer block duplication

This reduces some of the craziness required for handling buffer
blocks. The problem is that each shader stage holds its own information
about a block in memory. We were copying that information to a
program-wide list, but the per-stage information remained, meaning that
when a binding was updated we needed to update all versions of it.

This changes the per stage blocks to instead point to a single
version of the block information in the program list.

Acked-by: Kenneth Graunke <kenneth@whitecape.org>
---
 .../glsl/link_uniform_initializers.cpp        |  2 +-
 src/compiler/glsl/link_uniforms.cpp           | 12 +--
 src/compiler/glsl/linker.cpp                  | 78 +++++++++++--------
 src/compiler/glsl/standalone_scaffolding.cpp  |  5 --
 src/mesa/main/mtypes.h                        |  9 +--
 src/mesa/main/uniforms.c                      | 33 +-------
 6 files changed, 57 insertions(+), 82 deletions(-)

diff --git a/src/compiler/glsl/link_uniform_initializers.cpp b/src/compiler/glsl/link_uniform_initializers.cpp
index 3609f81771e..7d280ccf7fc 100644
--- a/src/compiler/glsl/link_uniform_initializers.cpp
+++ b/src/compiler/glsl/link_uniform_initializers.cpp
@@ -183,7 +183,7 @@ set_block_binding(gl_shader_program *prog, const char *block_name, int binding)
 
          if (stage_index != -1) {
             struct gl_shader *sh = prog->_LinkedShaders[i];
-            sh->BufferInterfaceBlocks[stage_index].Binding = binding;
+            sh->BufferInterfaceBlocks[stage_index]->Binding = binding;
          }
       }
 }
diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp
index 940cc61181d..807b069e3ed 100644
--- a/src/compiler/glsl/link_uniforms.cpp
+++ b/src/compiler/glsl/link_uniforms.cpp
@@ -954,6 +954,8 @@ link_cross_validate_uniform_block(void *mem_ctx,
           new_block->Uniforms,
           sizeof(*linked_block->Uniforms) * linked_block->NumUniforms);
 
+   linked_block->Name = ralloc_strdup(*linked_blocks, linked_block->Name);
+
    for (unsigned int i = 0; i < linked_block->NumUniforms; i++) {
       struct gl_uniform_buffer_variable *ubo_var =
          &linked_block->Uniforms[i];
@@ -1005,9 +1007,9 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
 
       const unsigned l = strlen(var->name);
       for (unsigned i = 0; i < shader->NumBufferInterfaceBlocks; i++) {
-         for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i].NumUniforms; j++) {
+         for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i]->NumUniforms; j++) {
             if (sentinel) {
-               const char *begin = shader->BufferInterfaceBlocks[i].Uniforms[j].Name;
+               const char *begin = shader->BufferInterfaceBlocks[i]->Uniforms[j].Name;
                const char *end = strchr(begin, sentinel);
 
                if (end == NULL)
@@ -1022,7 +1024,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
                   break;
                }
             } else if (!strcmp(var->name,
-                               shader->BufferInterfaceBlocks[i].Uniforms[j].Name)) {
+                               shader->BufferInterfaceBlocks[i]->Uniforms[j].Name)) {
                found = true;
                var->data.location = j;
                break;
@@ -1148,9 +1150,9 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       sh->num_combined_uniform_components = sh->num_uniform_components;
 
       for (unsigned i = 0; i < sh->NumBufferInterfaceBlocks; i++) {
-         if (!sh->BufferInterfaceBlocks[i].IsShaderStorage) {
+         if (!sh->BufferInterfaceBlocks[i]->IsShaderStorage) {
             sh->num_combined_uniform_components +=
-               sh->BufferInterfaceBlocks[i].UniformBufferSize / 4;
+               sh->BufferInterfaceBlocks[i]->UniformBufferSize / 4;
          }
       }
    }
diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index 76b700d3451..cd35464eeeb 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -1192,11 +1192,11 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
 	 int index = link_cross_validate_uniform_block(prog,
 						       &prog->BufferInterfaceBlocks,
 						       &prog->NumBufferInterfaceBlocks,
-						       &sh->BufferInterfaceBlocks[j]);
+						       sh->BufferInterfaceBlocks[j]);
 
 	 if (index == -1) {
 	    linker_error(prog, "uniform block `%s' has mismatching definitions\n",
-			 sh->BufferInterfaceBlocks[j].Name);
+			 sh->BufferInterfaceBlocks[j]->Name);
 	    return false;
 	 }
 
@@ -1204,6 +1204,23 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
       }
    }
 
+   /* Update per stage block pointers to point to the program list.
+    * FIXME: We should be able to free the per stage blocks here.
+    */
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) {
+	 int stage_index =
+            prog->InterfaceBlockStageIndex[i][j];
+
+	 if (stage_index != -1) {
+	    struct gl_shader *sh = prog->_LinkedShaders[i];
+
+            sh->BufferInterfaceBlocks[stage_index] =
+               &prog->BufferInterfaceBlocks[j];
+	 }
+      }
+   }
+
    return true;
 }
 
@@ -2069,9 +2086,15 @@ link_intrastage_shaders(void *mem_ctx,
    linked->ir = new(linked) exec_list;
    clone_ir_list(mem_ctx, linked->ir, main->ir);
 
-   linked->BufferInterfaceBlocks = uniform_blocks;
+   linked->BufferInterfaceBlocks =
+      ralloc_array(linked, gl_uniform_block *, num_uniform_blocks);
+
+   ralloc_steal(linked, uniform_blocks);
+   for (unsigned i = 0; i < num_uniform_blocks; i++) {
+      linked->BufferInterfaceBlocks[i] = &uniform_blocks[i];
+   }
+
    linked->NumBufferInterfaceBlocks = num_uniform_blocks;
-   ralloc_steal(linked, linked->BufferInterfaceBlocks);
 
    link_fs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
    link_tcs_out_layout_qualifiers(prog, linked, shader_list, num_shaders);
@@ -2869,7 +2892,8 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog)
 	 if (prog->InterfaceBlockStageIndex[j][i] != -1) {
             struct gl_shader *sh = prog->_LinkedShaders[j];
             int stage_index = prog->InterfaceBlockStageIndex[j][i];
-            if (sh && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) {
+            if (sh &&
+                sh->BufferInterfaceBlocks[stage_index]->IsShaderStorage) {
                shader_blocks[j]++;
                total_shader_storage_blocks++;
             } else {
@@ -2986,7 +3010,8 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog)
 
          for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) {
             int stage_index = prog->InterfaceBlockStageIndex[i][j];
-            if (stage_index != -1 && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage)
+            if (stage_index != -1 &&
+                sh->BufferInterfaceBlocks[stage_index]->IsShaderStorage)
                total_shader_storage_blocks++;
          }
 
@@ -4006,20 +4031,22 @@ link_assign_subroutine_types(struct gl_shader_program *prog)
 
 static void
 split_ubos_and_ssbos(void *mem_ctx,
-                     struct gl_uniform_block *blocks,
+                     struct gl_uniform_block **s_blks,
+                     struct gl_uniform_block *p_blks,
                      unsigned num_blocks,
                      struct gl_uniform_block ***ubos,
                      unsigned *num_ubos,
-                     unsigned **ubo_interface_block_indices,
                      struct gl_uniform_block ***ssbos,
-                     unsigned *num_ssbos,
-                     unsigned **ssbo_interface_block_indices)
+                     unsigned *num_ssbos)
 {
    unsigned num_ubo_blocks = 0;
    unsigned num_ssbo_blocks = 0;
 
+   /* Are we spliting the list of blocks for the shader or the program */
+   bool is_shader = p_blks == NULL;
+
    for (unsigned i = 0; i < num_blocks; i++) {
-      if (blocks[i].IsShaderStorage)
+      if (is_shader ? s_blks[i]->IsShaderStorage : p_blks[i].IsShaderStorage)
          num_ssbo_blocks++;
       else
          num_ubo_blocks++;
@@ -4031,24 +4058,13 @@ split_ubos_and_ssbos(void *mem_ctx,
    *ssbos = ralloc_array(mem_ctx, gl_uniform_block *, num_ssbo_blocks);
    *num_ssbos = 0;
 
-   if (ubo_interface_block_indices)
-      *ubo_interface_block_indices =
-         ralloc_array(mem_ctx, unsigned, num_ubo_blocks);
-
-   if (ssbo_interface_block_indices)
-      *ssbo_interface_block_indices =
-         ralloc_array(mem_ctx, unsigned, num_ssbo_blocks);
-
    for (unsigned i = 0; i < num_blocks; i++) {
-      if (blocks[i].IsShaderStorage) {
-         (*ssbos)[*num_ssbos] = &blocks[i];
-         if (ssbo_interface_block_indices)
-            (*ssbo_interface_block_indices)[*num_ssbos] = i;
+      struct gl_uniform_block *blk = is_shader ? s_blks[i] : &p_blks[i];
+      if (blk->IsShaderStorage) {
+         (*ssbos)[*num_ssbos] = blk;
          (*num_ssbos)++;
       } else {
-         (*ubos)[*num_ubos] = &blocks[i];
-         if (ubo_interface_block_indices)
-            (*ubo_interface_block_indices)[*num_ubos] = i;
+         (*ubos)[*num_ubos] = blk;
          (*num_ubos)++;
       }
    }
@@ -4627,25 +4643,23 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
          gl_shader *sh = prog->_LinkedShaders[i];
          split_ubos_and_ssbos(sh,
                               sh->BufferInterfaceBlocks,
+                              NULL,
                               sh->NumBufferInterfaceBlocks,
                               &sh->UniformBlocks,
                               &sh->NumUniformBlocks,
-                              NULL,
                               &sh->ShaderStorageBlocks,
-                              &sh->NumShaderStorageBlocks,
-                              NULL);
+                              &sh->NumShaderStorageBlocks);
       }
    }
 
    split_ubos_and_ssbos(prog,
+                        NULL,
                         prog->BufferInterfaceBlocks,
                         prog->NumBufferInterfaceBlocks,
                         &prog->UniformBlocks,
                         &prog->NumUniformBlocks,
-                        &prog->UboInterfaceBlockIndex,
                         &prog->ShaderStorageBlocks,
-                        &prog->NumShaderStorageBlocks,
-                        &prog->SsboInterfaceBlockIndex);
+                        &prog->NumShaderStorageBlocks);
 
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       if (prog->_LinkedShaders[i] == NULL)
diff --git a/src/compiler/glsl/standalone_scaffolding.cpp b/src/compiler/glsl/standalone_scaffolding.cpp
index d5d214b57cc..e350f702099 100644
--- a/src/compiler/glsl/standalone_scaffolding.cpp
+++ b/src/compiler/glsl/standalone_scaffolding.cpp
@@ -124,11 +124,6 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
       shProg->InterfaceBlockStageIndex[i] = NULL;
    }
 
-   ralloc_free(shProg->UboInterfaceBlockIndex);
-   shProg->UboInterfaceBlockIndex = NULL;
-   ralloc_free(shProg->SsboInterfaceBlockIndex);
-   shProg->SsboInterfaceBlockIndex = NULL;
-
    ralloc_free(shProg->AtomicBuffers);
    shProg->AtomicBuffers = NULL;
    shProg->NumAtomicBuffers = 0;
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 399f4508415..f050dddc4e8 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2306,7 +2306,7 @@ struct gl_shader
     * duplicated.
     */
    unsigned NumBufferInterfaceBlocks;
-   struct gl_uniform_block *BufferInterfaceBlocks;
+   struct gl_uniform_block **BufferInterfaceBlocks;
 
    unsigned NumUniformBlocks;
    struct gl_uniform_block **UniformBlocks;
@@ -2821,13 +2821,6 @@ struct gl_shader_program
     */
    int *InterfaceBlockStageIndex[MESA_SHADER_STAGES];
 
-   /**
-    * Indices into the BufferInterfaceBlocks[] array for Uniform Buffer
-    * Objects and Shader Storage Buffer Objects.
-    */
-   unsigned *UboInterfaceBlockIndex;
-   unsigned *SsboInterfaceBlockIndex;
-
    /**
     * Map of active uniform names to locations
     *
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index b1968b3f795..7dcbdccf442 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -1018,26 +1018,11 @@ _mesa_UniformBlockBinding(GLuint program,
 
    if (shProg->UniformBlocks[uniformBlockIndex]->Binding !=
        uniformBlockBinding) {
-      int i;
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer;
 
-      const int interface_block_index =
-         shProg->UboInterfaceBlockIndex[uniformBlockIndex];
-
-      shProg->BufferInterfaceBlocks[interface_block_index].Binding =
-         uniformBlockBinding;
-
-      for (i = 0; i < MESA_SHADER_STAGES; i++) {
-	 int stage_index =
-            shProg->InterfaceBlockStageIndex[i][interface_block_index];
-
-	 if (stage_index != -1) {
-	    struct gl_shader *sh = shProg->_LinkedShaders[i];
-	    sh->BufferInterfaceBlocks[stage_index].Binding = uniformBlockBinding;
-	 }
-      }
+      shProg->UniformBlocks[uniformBlockIndex]->Binding = uniformBlockBinding;
    }
 }
 
@@ -1076,26 +1061,12 @@ _mesa_ShaderStorageBlockBinding(GLuint program,
 
    if (shProg->ShaderStorageBlocks[shaderStorageBlockIndex]->Binding !=
        shaderStorageBlockBinding) {
-      int i;
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
 
-      const int interface_block_index =
-         shProg->SsboInterfaceBlockIndex[shaderStorageBlockIndex];
-
-      shProg->BufferInterfaceBlocks[interface_block_index].Binding =
+      shProg->ShaderStorageBlocks[shaderStorageBlockIndex]->Binding =
          shaderStorageBlockBinding;
-
-      for (i = 0; i < MESA_SHADER_STAGES; i++) {
-	 int stage_index =
-            shProg->InterfaceBlockStageIndex[i][interface_block_index];
-
-	 if (stage_index != -1) {
-	    struct gl_shader *sh = shProg->_LinkedShaders[i];
-	    sh->BufferInterfaceBlocks[stage_index].Binding = shaderStorageBlockBinding;
-	 }
-      }
    }
 }
 

From fc3b000fef85f3e2a8ddcc648e215ff7dface3e2 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 24 Mar 2016 09:28:49 +1000
Subject: [PATCH 057/238] virgl: drop next shader property for now.

Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/virgl/virgl_tgsi.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/gallium/drivers/virgl/virgl_tgsi.c b/src/gallium/drivers/virgl/virgl_tgsi.c
index 641b0b3e3b5..4a2271f24f4 100644
--- a/src/gallium/drivers/virgl/virgl_tgsi.c
+++ b/src/gallium/drivers/virgl/virgl_tgsi.c
@@ -40,6 +40,7 @@ virgl_tgsi_transform_property(struct tgsi_transform_context *ctx,
    switch (prop->Property.PropertyName) {
    case TGSI_PROPERTY_NUM_CLIPDIST_ENABLED:
    case TGSI_PROPERTY_NUM_CULLDIST_ENABLED:
+   case TGSI_PROPERTY_NEXT_SHADER:
       break;
    default:
       ctx->emit_property(ctx, prop);

From 1fb05a9a0cf12a16429cf7a02777390a9dfa19b3 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sat, 26 Mar 2016 18:35:05 +1100
Subject: [PATCH 058/238] radeon/r600_query.c: Minor style fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeon/r600_query.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index f8b62411722..f9a5721fb97 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -1066,7 +1066,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
 			item_mask = 0x3;
 		}
 
-		while(num_tile_pipes--) {
+		while (num_tile_pipes--) {
 			i = backend_map & item_mask;
 			mask |= (1<<i);
 			backend_map >>= item_width;

From ca22d2f1fdb09e9b77394bc88418251034d344b5 Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sat, 26 Mar 2016 18:35:06 +1100
Subject: [PATCH 059/238] radeon/r600: Fix return type in failure branch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Commit `d4e847ea` introduced a warning about making an
integer from a pointer without a cast; fix it here.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeon/r600_texture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 7322f3ee985..83fc0021227 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -335,7 +335,7 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
 	 */
 	if (resource->target != PIPE_BUFFER &&
 	    (resource->nr_samples > 1 || rtex->is_depth))
-		return NULL;
+		return false;
 
 	if (!res->is_shared) {
 		res->is_shared = true;

From 2df141087a56da20ad50b379b4a63426b870344a Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sat, 26 Mar 2016 18:35:07 +1100
Subject: [PATCH 060/238] mesa/st: Remove GLSLVersion clamping
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

While here, remove intermediate glsl_feature_level variable.

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_extensions.c | 15 +++++----------
 1 file changed, 5 insertions(+), 10 deletions(-)

diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 2fdaba073a2..0d6c6b196a1 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -552,7 +552,6 @@ void st_init_extensions(struct pipe_screen *screen,
                         boolean has_lib_dxtc)
 {
    unsigned i;
-   int glsl_feature_level;
    GLboolean *extension_table = (GLboolean *) extensions;
 
    static const struct st_extension_cap_mapping cap_mapping[] = {
@@ -844,12 +843,8 @@ void st_init_extensions(struct pipe_screen *screen,
                           ARRAY_SIZE(vertex_mapping), PIPE_BUFFER,
                           PIPE_BIND_VERTEX_BUFFER);
 
-   /* Figure out GLSL support. */
-   glsl_feature_level = screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL);
-
-   consts->GLSLVersion = glsl_feature_level;
-   if (glsl_feature_level >= 410)
-      consts->GLSLVersion = 410;
+   /* Figure out GLSL support and set GLSLVersion to it. */
+   consts->GLSLVersion = screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL);
 
    _mesa_override_glsl_version(consts);
 
@@ -858,9 +853,9 @@ void st_init_extensions(struct pipe_screen *screen,
       consts->ForceGLSLVersion = options->force_glsl_version;
    }
 
-   if (glsl_feature_level >= 400)
+   if (consts->GLSLVersion >= 400)
       extensions->ARB_gpu_shader5 = GL_TRUE;
-   if (glsl_feature_level >= 410)
+   if (consts->GLSLVersion >= 410)
       extensions->ARB_shader_precision = GL_TRUE;
 
    /* This extension needs full OpenGL 3.2, but we don't know if that's
@@ -1036,7 +1031,7 @@ void st_init_extensions(struct pipe_screen *screen,
 
    consts->MaxViewports = screen->get_param(screen, PIPE_CAP_MAX_VIEWPORTS);
    if (consts->MaxViewports >= 16) {
-      if (glsl_feature_level >= 400) {
+      if (consts->GLSLVersion >= 400) {
          consts->ViewportBounds.Min = -32768.0;
          consts->ViewportBounds.Max = 32767.0;
       } else {

From 11bd53933e36665efdcfd922ab7c22d51429df9a Mon Sep 17 00:00:00 2001
From: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Date: Sun, 27 Mar 2016 13:05:34 +1100
Subject: [PATCH 061/238] gallium/p_context.h: Make comment more readable
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/include/pipe/p_context.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index ee68fdd6f6f..1c97e82ece5 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -162,7 +162,7 @@ struct pipe_context {
     *               item of that data to store (e.g. for
     *               PIPE_QUERY_PIPELINE_STATISTICS).
     *               When the index is -1, instead of the value of the query
-    *               the driver should instead write a 1/0 to the appropriate
+    *               the driver should instead write a 1 or 0 to the appropriate
     *               location with 1 meaning that the query result is available.
     */
    void (*get_query_result_resource)(struct pipe_context *pipe,

From e2d5a6fac5c2b433cd78c0fc29b420b36c429cb9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mikl=C3=B3s=20M=C3=A1t=C3=A9?= <mtmkls@gmail.com>
Date: Thu, 24 Mar 2016 01:12:55 +0100
Subject: [PATCH 062/238] mesa: optionally associate a gl_program to
 ATI_fragment_shader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

the state tracker will use it

Acked-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Signed-off-by: Miklós Máté <mtmkls@gmail.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/drivers/common/driverfuncs.c |  3 +++
 src/mesa/main/atifragshader.c         | 13 ++++++++++++-
 src/mesa/main/dd.h                    |  5 +++++
 src/mesa/main/mtypes.h                |  1 +
 src/mesa/main/state.c                 | 14 +++++++++++++-
 5 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index e96f92af5bb..2730b7b2f2a 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -117,6 +117,9 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->NewProgram = _mesa_new_program;
    driver->DeleteProgram = _mesa_delete_program;
 
+   /* ATI_fragment_shader */
+   driver->NewATIfs = NULL;
+
    /* simple state commands */
    driver->AlphaFunc = NULL;
    driver->BlendColor = NULL;
diff --git a/src/mesa/main/atifragshader.c b/src/mesa/main/atifragshader.c
index 8fcbff6a7a4..34f45c68008 100644
--- a/src/mesa/main/atifragshader.c
+++ b/src/mesa/main/atifragshader.c
@@ -30,6 +30,7 @@
 #include "main/mtypes.h"
 #include "main/dispatch.h"
 #include "main/atifragshader.h"
+#include "program/program.h"
 
 #define MESA_DEBUG_ATI_FS 0
 
@@ -63,6 +64,7 @@ _mesa_delete_ati_fragment_shader(struct gl_context *ctx, struct ati_fragment_sha
       free(s->Instructions[i]);
       free(s->SetupInst[i]);
    }
+   _mesa_reference_program(ctx, &s->Program, NULL);
    free(s);
 }
 
@@ -321,6 +323,8 @@ _mesa_BeginFragmentShaderATI(void)
          free(ctx->ATIFragmentShader.Current->SetupInst[i]);
    }
 
+   _mesa_reference_program(ctx, &ctx->ATIFragmentShader.Current->Program, NULL);
+
    /* malloc the instructions here - not sure if the best place but its
       a start */
    for (i = 0; i < MAX_NUM_PASSES_ATI; i++) {
@@ -405,7 +409,14 @@ _mesa_EndFragmentShaderATI(void)
    }
 #endif
 
-   if (!ctx->Driver.ProgramStringNotify(ctx, GL_FRAGMENT_SHADER_ATI, NULL)) {
+   if (ctx->Driver.NewATIfs) {
+      struct gl_program *prog = ctx->Driver.NewATIfs(ctx,
+                                                     ctx->ATIFragmentShader.Current);
+      _mesa_reference_program(ctx, &ctx->ATIFragmentShader.Current->Program, prog);
+   }
+
+   if (!ctx->Driver.ProgramStringNotify(ctx, GL_FRAGMENT_SHADER_ATI,
+                                        curProg->Program)) {
       ctx->ATIFragmentShader.Current->isValid = GL_FALSE;
       /* XXX is this the right error? */
       _mesa_error(ctx, GL_INVALID_OPERATION,
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index 60bc8ef4411..d62fee690f4 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -476,6 +476,11 @@ struct dd_function_table {
                                      GLuint id);
    /** Delete a program */
    void (*DeleteProgram)(struct gl_context *ctx, struct gl_program *prog);   
+   /**
+    * Allocate a program to associate with the new ATI fragment shader (optional)
+    */
+   struct gl_program * (*NewATIfs)(struct gl_context *ctx,
+                                   struct ati_fragment_shader *curProg);
    /**
     * Notify driver that a program string (and GPU code) has been specified
     * or modified.  Return GL_TRUE or GL_FALSE to indicate if the program is
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index f050dddc4e8..c2c86a6e0d1 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2196,6 +2196,7 @@ struct ati_fragment_shader
    GLboolean interpinp1;
    GLboolean isValid;
    GLuint swizzlerq;
+   struct gl_program *Program;
 };
 
 /**
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index 917ae4da023..bf6035e0142 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -124,7 +124,8 @@ update_program(struct gl_context *ctx)
     * follows:
     *   1. OpenGL 2.0/ARB vertex/fragment shaders
     *   2. ARB/NV vertex/fragment programs
-    *   3. Programs derived from fixed-function state.
+    *   3. ATI fragment shader
+    *   4. Programs derived from fixed-function state.
     *
     * Note: it's possible for a vertex shader to get used with a fragment
     * program (and vice versa) here, but in practice that shouldn't ever
@@ -152,6 +153,17 @@ update_program(struct gl_context *ctx)
       _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._TexEnvProgram,
 			       NULL);
    }
+   else if (ctx->ATIFragmentShader._Enabled &&
+            ctx->ATIFragmentShader.Current->Program) {
+       /* Use the enabled ATI fragment shader's associated program */
+      _mesa_reference_shader_program(ctx,
+                                     &ctx->_Shader->_CurrentFragmentProgram,
+                                     NULL);
+      _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._Current,
+                               gl_fragment_program(ctx->ATIFragmentShader.Current->Program));
+      _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._TexEnvProgram,
+                               NULL);
+   }
    else if (ctx->FragmentProgram._MaintainTexEnvProgram) {
       /* Use fragment program generated from fixed-function state */
       struct gl_shader_program *f = _mesa_get_fixed_func_fragment_program(ctx);

From d71c1e9e54d379ff312dca7eb4d717e3f20e4099 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mikl=C3=B3s=20M=C3=A1t=C3=A9?= <mtmkls@gmail.com>
Date: Thu, 24 Mar 2016 01:12:56 +0100
Subject: [PATCH 063/238] program: add ATI_fragment_shader to shader stages
 list
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Miklós Máté <mtmkls@gmail.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/program/program.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h
index 24e05974dc3..09e69280d46 100644
--- a/src/mesa/program/program.h
+++ b/src/mesa/program/program.h
@@ -172,6 +172,8 @@ _mesa_program_enum_to_shader_stage(GLenum v)
       return MESA_SHADER_VERTEX;
    case GL_FRAGMENT_PROGRAM_ARB:
       return MESA_SHADER_FRAGMENT;
+   case GL_FRAGMENT_SHADER_ATI:
+      return MESA_SHADER_FRAGMENT;
    case GL_GEOMETRY_PROGRAM_NV:
       return MESA_SHADER_GEOMETRY;
    case GL_TESS_CONTROL_PROGRAM_NV:

From dee274477fb36aebc0ebf3eb1a9c58d875ad7a6f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mikl=C3=B3s=20M=C3=A1t=C3=A9?= <mtmkls@gmail.com>
Date: Thu, 24 Mar 2016 01:12:57 +0100
Subject: [PATCH 064/238] st/mesa: implement GL_ATI_fragment_shader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: fix arithmetic for special opcodes,
 fix fog state, cleanup
v3: simplify handling of special opcodes,
 fix rebinding with different textargets or fog equation,
 lots of formatting fixes
v4: adapt to the compile early, fix later architecture,
 formatting fixes

Signed-off-by: Miklós Máté <mtmkls@gmail.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/Makefile.sources                 |   1 +
 src/mesa/main/atifragshader.h             |   1 +
 src/mesa/state_tracker/st_atifs_to_tgsi.c | 845 ++++++++++++++++++++++
 src/mesa/state_tracker/st_atifs_to_tgsi.h |  67 ++
 src/mesa/state_tracker/st_atom_constbuf.c |  15 +
 src/mesa/state_tracker/st_atom_shader.c   |  65 +-
 src/mesa/state_tracker/st_cb_drawpixels.c |   1 +
 src/mesa/state_tracker/st_cb_program.c    |  31 +
 src/mesa/state_tracker/st_program.c       |  34 +-
 src/mesa/state_tracker/st_program.h       |   8 +
 10 files changed, 1064 insertions(+), 4 deletions(-)
 create mode 100644 src/mesa/state_tracker/st_atifs_to_tgsi.c
 create mode 100644 src/mesa/state_tracker/st_atifs_to_tgsi.h

diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index a6c12c64828..54601a956fd 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -395,6 +395,7 @@ VBO_FILES = \
 	vbo/vbo_split_inplace.c
 
 STATETRACKER_FILES = \
+	state_tracker/st_atifs_to_tgsi.c \
 	state_tracker/st_atom_array.c \
 	state_tracker/st_atom_atomicbuf.c \
 	state_tracker/st_atom_blend.c \
diff --git a/src/mesa/main/atifragshader.h b/src/mesa/main/atifragshader.h
index 59011341018..0e32795da3b 100644
--- a/src/mesa/main/atifragshader.h
+++ b/src/mesa/main/atifragshader.h
@@ -16,6 +16,7 @@ struct gl_context;
 #define MAX_NUM_INSTRUCTIONS_PER_PASS_ATI 8
 #define MAX_NUM_PASSES_ATI                2
 #define MAX_NUM_FRAGMENT_REGISTERS_ATI    6
+#define MAX_NUM_FRAGMENT_CONSTANTS_ATI    8
 
 struct ati_fs_opcode_st
 {
diff --git a/src/mesa/state_tracker/st_atifs_to_tgsi.c b/src/mesa/state_tracker/st_atifs_to_tgsi.c
new file mode 100644
index 00000000000..66f442aee5a
--- /dev/null
+++ b/src/mesa/state_tracker/st_atifs_to_tgsi.c
@@ -0,0 +1,845 @@
+/*
+ * Copyright (C) 2016 Miklós Máté
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "main/mtypes.h"
+#include "main/atifragshader.h"
+#include "main/errors.h"
+#include "program/prog_parameter.h"
+
+#include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_transform.h"
+
+#include "st_program.h"
+#include "st_atifs_to_tgsi.h"
+
+/**
+ * Intermediate state used during shader translation.
+ */
+struct st_translate {
+   struct ureg_program *ureg;          /* TGSI program being built */
+   struct ati_fragment_shader *atifs;  /* shader being translated */
+
+   struct ureg_dst temps[MAX_PROGRAM_TEMPS]; /* declared lazily by get_temp() */
+   struct ureg_src *constants;         /* heap array, one entry per parameter */
+   struct ureg_dst outputs[PIPE_MAX_SHADER_OUTPUTS];
+   struct ureg_src inputs[PIPE_MAX_SHADER_INPUTS];
+   struct ureg_src samplers[PIPE_MAX_SAMPLERS];
+
+   const GLuint *inputMapping;         /* VARYING_SLOT_* -> index in inputs[] */
+   const GLuint *outputMapping;        /* FRAG_RESULT_* -> index in outputs[] */
+
+   unsigned current_pass;              /* pass currently being compiled */
+   /* registers written so far, per pass; unwritten ones read as 0.0 */
+   bool regs_written[MAX_NUM_PASSES_ATI][MAX_NUM_FRAGMENT_REGISTERS_ATI];
+
+   boolean error;                      /* reported with debug_printf when set */
+};
+
+struct instruction_desc {
+   unsigned TGSI_opcode;    /* TGSI_OPCODE_NOP routes to emit_special_inst() */
+   const char *name;        /* mnemonic; also selects the special-case path */
+   unsigned char arg_count; /* number of source arguments that get prepared */
+};
+
+static const struct instruction_desc inst_desc[] = {
+   {TGSI_OPCODE_MOV, "MOV", 1}, /* entries are indexed by opcode - GL_MOV_ATI */
+   {TGSI_OPCODE_NOP, "UND", 0}, /* unused */
+   {TGSI_OPCODE_ADD, "ADD", 2},
+   {TGSI_OPCODE_MUL, "MUL", 2},
+   {TGSI_OPCODE_SUB, "SUB", 2},
+   {TGSI_OPCODE_DP3, "DOT3", 2},
+   {TGSI_OPCODE_DP4, "DOT4", 2},
+   {TGSI_OPCODE_MAD, "MAD", 3},
+   {TGSI_OPCODE_LRP, "LERP", 3},
+   {TGSI_OPCODE_NOP, "CND", 3},     /* expanded by emit_special_inst() */
+   {TGSI_OPCODE_NOP, "CND0", 3},    /* expanded by emit_special_inst() */
+   {TGSI_OPCODE_NOP, "DOT2_ADD", 3} /* expanded by emit_special_inst() */
+};
+
+static struct ureg_dst
+get_temp(struct st_translate *t, unsigned index)
+{
+   struct ureg_dst *slot = &t->temps[index]; /* declared on first use only */
+   *slot = ureg_dst_is_undef(*slot) ? ureg_DECL_temporary(t->ureg) : *slot;
+   return *slot;
+}
+
+static struct ureg_src
+apply_swizzle(struct st_translate *t,
+              struct ureg_src src, GLuint swizzle)
+{
+   struct ureg_dst coord, recip;
+   struct ureg_src ops[3];
+   unsigned divisor;
+
+   /* STR: the coordinate is used unchanged */
+   if (swizzle == GL_SWIZZLE_STR_ATI)
+      return src;
+
+   /* STQ: exchange the third and fourth components */
+   if (swizzle == GL_SWIZZLE_STQ_ATI)
+      return ureg_swizzle(src, TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y,
+                          TGSI_SWIZZLE_W, TGSI_SWIZZLE_Z);
+
+   /* every other swizzle divides (x, y, 1, 1) by one component of src:
+    * the third one for STR_DR, the fourth one otherwise */
+   coord = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI);
+   recip = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + 1);
+
+   /* coord = src * (1,1,0,0) + (0,0,1,1) */
+   ops[0] = src;
+   ops[1] = ureg_imm4f(t->ureg, 1.0f, 1.0f, 0.0f, 0.0f);
+   ops[2] = ureg_imm4f(t->ureg, 0.0f, 0.0f, 1.0f, 1.0f);
+   ureg_insn(t->ureg, TGSI_OPCODE_MAD, &coord, 1, ops, 3);
+
+   divisor = swizzle == GL_SWIZZLE_STR_DR_ATI ? TGSI_SWIZZLE_Z : TGSI_SWIZZLE_W;
+   ops[0] = ureg_scalar(src, divisor);
+   ureg_insn(t->ureg, TGSI_OPCODE_RCP, &recip, 1, &ops[0], 1);
+
+   ops[0] = ureg_src(coord);
+   ops[1] = ureg_src(recip);
+   ureg_insn(t->ureg, TGSI_OPCODE_MUL, &coord, 1, ops, 2);
+   return ureg_src(coord);
+}
+
+static struct ureg_src
+get_source(struct st_translate *t, GLuint src_type)
+{
+   if (src_type >= GL_REG_0_ATI && src_type <= GL_REG_5_ATI) {
+      const unsigned reg = src_type - GL_REG_0_ATI;
+
+      /* a register not yet written in this pass reads as 0.0 */
+      if (!t->regs_written[t->current_pass][reg])
+         return ureg_imm1f(t->ureg, 0.0f);
+      return ureg_src(get_temp(t, reg));
+   }
+   /* CON_n is looked up directly as entry n of t->constants */
+   if (src_type >= GL_CON_0_ATI && src_type <= GL_CON_7_ATI)
+      return t->constants[src_type - GL_CON_0_ATI];
+   if (src_type == GL_ZERO || src_type == GL_ONE)
+      return ureg_imm1f(t->ureg, src_type == GL_ONE ? 1.0f : 0.0f);
+   if (src_type == GL_PRIMARY_COLOR_ARB)
+      return t->inputs[t->inputMapping[VARYING_SLOT_COL0]];
+   if (src_type == GL_SECONDARY_INTERPOLATOR_ATI)
+      return t->inputs[t->inputMapping[VARYING_SLOT_COL1]];
+
+   /* frontend prevents this */
+   unreachable("unknown source");
+}
+
+static struct ureg_src
+prepare_argument(struct st_translate *t, const unsigned argId,
+                 const struct atifragshader_src_register *srcReg)
+{
+   struct ureg_src value = get_source(t, srcReg->Index);
+   /* every argument slot owns a scratch temp placed after the ATI registers */
+   struct ureg_dst arg = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + argId);
+   const GLuint mod = srcReg->argMod;
+   struct ureg_src ops[2];
+
+   /* optional replication of one channel across all four components */
+   if (srcReg->argRep == GL_RED)
+      value = ureg_scalar(value, TGSI_SWIZZLE_X);
+   else if (srcReg->argRep == GL_GREEN)
+      value = ureg_scalar(value, TGSI_SWIZZLE_Y);
+   else if (srcReg->argRep == GL_BLUE)
+      value = ureg_scalar(value, TGSI_SWIZZLE_Z);
+   else if (srcReg->argRep == GL_ALPHA)
+      value = ureg_scalar(value, TGSI_SWIZZLE_W);
+
+   /* work on a copy so that the modifiers below never touch the source */
+   ureg_insn(t->ureg, TGSI_OPCODE_MOV, &arg, 1, &value, 1);
+
+   /* the modifiers are applied one after another, in this fixed order */
+
+   if (mod & GL_COMP_BIT_ATI) {
+      /* arg = 1 - arg */
+      ops[0] = ureg_imm1f(t->ureg, 1.0f);
+      ops[1] = ureg_src(arg);
+      ureg_insn(t->ureg, TGSI_OPCODE_SUB, &arg, 1, ops, 2);
+   }
+
+   if (mod & GL_BIAS_BIT_ATI) {
+      /* arg = arg - 0.5 */
+      ops[0] = ureg_src(arg);
+      ops[1] = ureg_imm1f(t->ureg, 0.5f);
+      ureg_insn(t->ureg, TGSI_OPCODE_SUB, &arg, 1, ops, 2);
+   }
+
+   if (mod & GL_2X_BIT_ATI) {
+      /* arg = arg + arg */
+      ops[0] = ureg_src(arg);
+      ops[1] = ureg_src(arg);
+      ureg_insn(t->ureg, TGSI_OPCODE_ADD, &arg, 1, ops, 2);
+   }
+
+   if (mod & GL_NEGATE_BIT_ATI) {
+      /* arg = arg * -1 */
+      ops[0] = ureg_src(arg);
+      ops[1] = ureg_imm1f(t->ureg, -1.0f);
+      ureg_insn(t->ureg, TGSI_OPCODE_MUL, &arg, 1, ops, 2);
+   }
+
+   return ureg_src(arg);
+}
+
+/* Expand the ops that have no single TGSI opcode: CND, CND0 and DOT2_ADD */
+static void
+emit_special_inst(struct st_translate *t, const struct instruction_desc *desc,
+                  struct ureg_dst *dst, struct ureg_src *args, unsigned argcount)
+{
+   struct ureg_dst scratch;
+   struct ureg_src ops[3];
+
+   if (strcmp(desc->name, "CND") == 0) {
+      scratch = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + 2); /* re-purpose a3 */
+      ops[0] = ureg_imm1f(t->ureg, 0.5f);
+      ops[1] = args[2];
+      ureg_insn(t->ureg, TGSI_OPCODE_SUB, &scratch, 1, ops, 2); /* 0.5 - a3 */
+      ops[0] = ureg_src(scratch);
+      ops[1] = args[0];
+      ops[2] = args[1];
+      ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, ops, 3); /* CMP(tmp, a1, a2) */
+   } else if (strcmp(desc->name, "CND0") == 0) {
+      ops[0] = args[2];
+      ops[1] = args[1];
+      ops[2] = args[0];
+      ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, ops, 3); /* CMP(a3, a2, a1) */
+   } else if (strcmp(desc->name, "DOT2_ADD") == 0) {
+      /* note: DP2A is not implemented in most pipe drivers */
+      scratch = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI); /* re-purpose a1 */
+      ops[0] = args[0];
+      ops[1] = args[1];
+      ureg_insn(t->ureg, TGSI_OPCODE_DP2, &scratch, 1, ops, 2);
+      ops[0] = ureg_src(scratch);
+      ops[1] = ureg_scalar(args[2], TGSI_SWIZZLE_Z);
+      ureg_insn(t->ureg, TGSI_OPCODE_ADD, dst, 1, ops, 2);
+   }
+}
+
+static void
+emit_arith_inst(struct st_translate *t,
+                const struct instruction_desc *desc,
+                struct ureg_dst *dst, struct ureg_src *args, unsigned argcount)
+{
+   /* a NOP table entry means the op has no direct TGSI equivalent */
+   if (desc->TGSI_opcode == TGSI_OPCODE_NOP)
+      emit_special_inst(t, desc, dst, args, argcount);
+   else
+      ureg_insn(t->ureg, desc->TGSI_opcode, dst, 1, args, argcount);
+}
+
+static void
+emit_dstmod(struct st_translate *t,
+            struct ureg_dst dst, GLuint dstMod)
+{
+   /* Applies the destination modifier of an arithmetic op: an optional
+    * scale followed by an optional saturate, as one in-place MUL. */
+   const GLuint scale = dstMod & ~GL_SATURATE_BIT_ATI;
+   struct ureg_src ops[2];
+   float factor;
+
+   /* no modifier at all: nothing to emit */
+   if (dstMod == GL_NONE)
+      return;
+
+   /* pick the multiplier selected by the scale bits; anything that is
+    * not exactly one known scale bit (e.g. saturate alone) scales by 1 */
+   if (scale == GL_2X_BIT_ATI)
+      factor = 2.0f;
+   else if (scale == GL_4X_BIT_ATI)
+      factor = 4.0f;
+   else if (scale == GL_8X_BIT_ATI)
+      factor = 8.0f;
+   else if (scale == GL_HALF_BIT_ATI)
+      factor = 0.5f;
+   else if (scale == GL_QUARTER_BIT_ATI)
+      factor = 0.25f;
+   else if (scale == GL_EIGHTH_BIT_ATI)
+      factor = 0.125f;
+   else
+      factor = 1.0f;
+
+   /* the operands are captured before dst gains its saturate flag */
+   ops[0] = ureg_src(dst);
+   ops[1] = ureg_imm1f(t->ureg, factor);
+
+   /* saturation is folded into the destination of the multiply */
+   if (dstMod & GL_SATURATE_BIT_ATI)
+      dst = ureg_saturate(dst);
+
+   /* dst = dst * factor, written in place */
+   ureg_insn(t->ureg, TGSI_OPCODE_MUL, &dst, 1, ops, 2);
+}
+
+/**
+ * Compile one setup instruction (sample or coordinate pass) to TGSI instructions.
+ */
+static void
+compile_setupinst(struct st_translate *t,
+                  const unsigned r,
+                  const struct atifs_setupinst *texinst)
+{
+   const GLuint pass_tex = texinst->src;
+   struct ureg_dst dst[1];
+   struct ureg_src src[2];
+
+   if (!texinst->Opcode)
+      return;
+
+   dst[0] = get_temp(t, r);
+
+   if (pass_tex >= GL_TEXTURE0_ARB && pass_tex <= GL_TEXTURE7_ARB) {
+      unsigned attr = pass_tex - GL_TEXTURE0_ARB + VARYING_SLOT_TEX0;
+      src[0] = t->inputs[t->inputMapping[attr]];
+   } else if (pass_tex >= GL_REG_0_ATI && pass_tex <= GL_REG_5_ATI) {
+      unsigned reg = pass_tex - GL_REG_0_ATI;
+
+      /* the frontend already validated that REG is only allowed in second pass */
+      if (t->regs_written[0][reg])
+         src[0] = ureg_src(t->temps[reg]);
+      else
+         src[0] = ureg_imm1f(t->ureg, 0.0f);
+   } else {
+      /* frontend prevents this; do not read src[0] uninitialized */
+      unreachable("unknown setup source");
+   }
+   src[0] = apply_swizzle(t, src[0], texinst->swizzle);
+
+   if (texinst->Opcode == ATI_FRAGMENT_SHADER_SAMPLE_OP) {
+      /* by default texture and sampler indexes are the same */
+      src[1] = t->samplers[r];
+      /* the texture target is still unknown, it will be fixed in the draw call */
+      ureg_tex_insn(t->ureg, TGSI_OPCODE_TEX, dst, 1, TGSI_TEXTURE_2D,
+                    NULL, 0, src, 2);
+   } else if (texinst->Opcode == ATI_FRAGMENT_SHADER_PASS_OP) {
+      ureg_insn(t->ureg, TGSI_OPCODE_MOV, dst, 1, src, 1);
+   }
+
+   t->regs_written[t->current_pass][r] = true;
+}
+
+/**
+ * Compile one arithmetic operation COLOR&ALPHA pair into TGSI instructions;
+ * optype 0 is the color op, optype 1 the alpha op (which writes only W).
+ */
+static void
+compile_instruction(struct st_translate *t,
+                    const struct atifs_instruction *inst)
+{
+   unsigned optype;
+
+   for (optype = 0; optype < 2; optype++) { /* color, alpha */
+      const struct instruction_desc *desc;
+      struct ureg_dst dst[1];
+      struct ureg_src args[3]; /* arguments for the main operation */
+      unsigned arg;
+      unsigned dstreg = inst->DstReg[optype].Index - GL_REG_0_ATI;
+
+      if (!inst->Opcode[optype])
+         continue;
+
+      desc = &inst_desc[inst->Opcode[optype] - GL_MOV_ATI];
+
+      /* prepare the arguments */
+      for (arg = 0; arg < desc->arg_count; arg++) {
+         if (arg >= inst->ArgCount[optype]) {
+            _mesa_warning(0, "Using 0 for missing argument %u of %s\n",
+                          arg, desc->name);
+            args[arg] = ureg_imm1f(t->ureg, 0.0f);
+         } else {
+            args[arg] = prepare_argument(t, arg,
+                                         &inst->SrcReg[optype][arg]);
+         }
+      }
+
+      /* prepare dst */
+      dst[0] = get_temp(t, dstreg);
+
+      if (optype) {
+         dst[0] = ureg_writemask(dst[0], TGSI_WRITEMASK_W);
+      } else {
+         GLuint dstMask = inst->DstReg[optype].dstMask;
+         if (dstMask == GL_NONE) {
+            dst[0] = ureg_writemask(dst[0], TGSI_WRITEMASK_XYZ);
+         } else {
+            dst[0] = ureg_writemask(dst[0], dstMask); /* the enum values match */
+         }
+      }
+
+      /* emit the main instruction, then its destination modifier */
+      emit_arith_inst(t, desc, dst, args, arg);
+      emit_dstmod(t, *dst, inst->DstReg[optype].dstMod);
+
+      t->regs_written[t->current_pass][dstreg] = true;
+   }
+}
+
+static void
+finalize_shader(struct st_translate *t, unsigned numPasses)
+{
+   struct ureg_dst dst[1] = { { 0 } };
+   struct ureg_src src[1] = { { 0 } };
+   /* numPasses == 0 must not index regs_written[numPasses-1] out of bounds */
+   if (numPasses > 0 && t->regs_written[numPasses-1][0]) {
+      /* copy the result into the OUT slot */
+      dst[0] = t->outputs[t->outputMapping[FRAG_RESULT_COLOR]];
+      src[0] = ureg_src(t->temps[0]);
+      ureg_insn(t->ureg, TGSI_OPCODE_MOV, dst, 1, src, 1);
+   }
+
+   /* signal the end of the program */
+   ureg_insn(t->ureg, TGSI_OPCODE_END, dst, 0, src, 0);
+}
+
+/**
+ * Called when a new variant is needed: translates the ATI fragment shader
+ * into ureg.  Returns PIPE_OK, or PIPE_ERROR_OUT_OF_MEMORY on failure.
+ */
+enum pipe_error
+st_translate_atifs_program(
+   struct ureg_program *ureg,
+   struct ati_fragment_shader *atifs,
+   struct gl_program *program,
+   GLuint numInputs,
+   const GLuint inputMapping[],
+   const ubyte inputSemanticName[],
+   const ubyte inputSemanticIndex[],
+   const GLuint interpMode[],
+   GLuint numOutputs,
+   const GLuint outputMapping[],
+   const ubyte outputSemanticName[],
+   const ubyte outputSemanticIndex[])
+{
+   enum pipe_error ret = PIPE_OK;
+
+   unsigned pass, i, r;
+   /* numOutputs and outputSemanticName are accepted but not read below */
+   struct st_translate translate, *t;
+   t = &translate;
+   memset(t, 0, sizeof *t);
+
+   t->inputMapping = inputMapping;
+   t->outputMapping = outputMapping;
+   t->ureg = ureg;
+   t->atifs = atifs;
+
+   /*
+    * Declare input attributes.
+    */
+   for (i = 0; i < numInputs; i++) {
+      t->inputs[i] = ureg_DECL_fs_input(ureg,
+                                        inputSemanticName[i],
+                                        inputSemanticIndex[i],
+                                        interpMode[i]);
+   }
+
+   /*
+    * Declare output attributes:
+    *  we always have numOutputs=1 and it's FRAG_RESULT_COLOR
+    */
+   t->outputs[0] = ureg_DECL_output(ureg,
+                                    TGSI_SEMANTIC_COLOR,
+                                    outputSemanticIndex[0]);
+
+   /* Emit constants and immediates.  Mesa uses a single index space
+    * for these, so we put all the translated regs in t->constants.
+    */
+   if (program->Parameters) {
+      t->constants = calloc(program->Parameters->NumParameters,
+                            sizeof t->constants[0]);
+      if (t->constants == NULL) {
+         ret = PIPE_ERROR_OUT_OF_MEMORY;
+         goto out;
+      }
+
+      for (i = 0; i < program->Parameters->NumParameters; i++) {
+         switch (program->Parameters->Parameters[i].Type) {
+         case PROGRAM_STATE_VAR:
+         case PROGRAM_UNIFORM:
+            t->constants[i] = ureg_DECL_constant(ureg, i);
+            break;
+         case PROGRAM_CONSTANT:
+            t->constants[i] =
+               ureg_DECL_immediate(ureg,
+                                   (const float*)program->Parameters->ParameterValues[i],
+                                   4);
+            break;
+         default:
+            break;
+         }
+      }
+   }
+
+   /* texture samplers */
+   for (i = 0; i < MAX_NUM_FRAGMENT_REGISTERS_ATI; i++) {
+      if (program->SamplersUsed & (1 << i)) {
+         t->samplers[i] = ureg_DECL_sampler(ureg, i);
+         /* the texture target is still unknown, it will be fixed in the draw call */
+         ureg_DECL_sampler_view(ureg, i, TGSI_TEXTURE_2D,
+                                TGSI_RETURN_TYPE_FLOAT,
+                                TGSI_RETURN_TYPE_FLOAT,
+                                TGSI_RETURN_TYPE_FLOAT,
+                                TGSI_RETURN_TYPE_FLOAT);
+      }
+   }
+
+   /* emit instructions: per pass, all setup instructions come first */
+   for (pass = 0; pass < atifs->NumPasses; pass++) {
+      t->current_pass = pass;
+      for (r = 0; r < MAX_NUM_FRAGMENT_REGISTERS_ATI; r++) {
+         struct atifs_setupinst *texinst = &atifs->SetupInst[pass][r];
+         compile_setupinst(t, r, texinst);
+      }
+      for (i = 0; i < atifs->numArithInstr[pass]; i++) {
+         struct atifs_instruction *inst = &atifs->Instructions[pass][i];
+         compile_instruction(t, inst);
+      }
+   }
+
+   finalize_shader(t, atifs->NumPasses);
+
+out:
+   free(t->constants); /* still NULL when nothing was allocated */
+
+   if (t->error) {
+      debug_printf("%s: translate error flag set\n", __func__);
+   }
+
+   return ret;
+}
+
+/**
+ * Called in ProgramStringNotify, we need to fill the metadata of the
+ * gl_program attached to the ati_fragment_shader
+ */
+void
+st_init_atifs_prog(struct gl_context *ctx, struct gl_program *prog)
+{
+   /* we know this is st_fragment_program, because of st_new_ati_fs() */
+   struct st_fragment_program *stfp = (struct st_fragment_program *) prog;
+   struct ati_fragment_shader *atifs = stfp->ati_fs;
+
+   unsigned pass, i, r, optype, arg;
+
+   static const gl_state_index fog_params_state[STATE_LENGTH] =
+      {STATE_INTERNAL, STATE_FOG_PARAMS_OPTIMIZED, 0, 0, 0};
+   static const gl_state_index fog_color[STATE_LENGTH] =
+      {STATE_FOG_COLOR, 0, 0, 0, 0};
+
+   prog->InputsRead = 0;
+   prog->OutputsWritten = BITFIELD64_BIT(FRAG_RESULT_COLOR);
+   prog->SamplersUsed = 0;
+   prog->Parameters = _mesa_new_parameter_list();
+
+   /* fill in InputsRead, SamplersUsed, TexturesUsed */
+   for (pass = 0; pass < atifs->NumPasses; pass++) {
+      for (r = 0; r < MAX_NUM_FRAGMENT_REGISTERS_ATI; r++) {
+         struct atifs_setupinst *texinst = &atifs->SetupInst[pass][r];
+         GLuint pass_tex = texinst->src;
+         bool from_texcoord = pass_tex >= GL_TEXTURE0_ARB && pass_tex <= GL_TEXTURE7_ARB;
+         if (texinst->Opcode == ATI_FRAGMENT_SHADER_SAMPLE_OP) {
+            /* mark which texcoords are used; the coordinate may also come
+             * from a register, which reads no varying */
+            if (from_texcoord)
+               prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + pass_tex - GL_TEXTURE0_ARB);
+            /* by default there is 1:1 mapping between samplers and textures */
+            prog->SamplersUsed |= (1 << r);
+            /* the target is unknown here, it will be fixed in the draw call */
+            prog->TexturesUsed[r] = TEXTURE_2D_BIT;
+         } else if (texinst->Opcode == ATI_FRAGMENT_SHADER_PASS_OP && from_texcoord) {
+            prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + pass_tex - GL_TEXTURE0_ARB);
+         }
+      }
+   }
+   for (pass = 0; pass < atifs->NumPasses; pass++) {
+      for (i = 0; i < atifs->numArithInstr[pass]; i++) {
+         struct atifs_instruction *inst = &atifs->Instructions[pass][i];
+
+         for (optype = 0; optype < 2; optype++) { /* color, alpha */
+            if (inst->Opcode[optype]) {
+               for (arg = 0; arg < inst->ArgCount[optype]; arg++) {
+                  GLint index = inst->SrcReg[optype][arg].Index;
+                  if (index == GL_PRIMARY_COLOR_EXT) {
+                     prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_COL0);
+                  } else if (index == GL_SECONDARY_INTERPOLATOR_ATI) {
+                     /* note: ATI_fragment_shader.txt never specifies what
+                      * GL_SECONDARY_INTERPOLATOR_ATI is, swrast uses
+                      * VARYING_SLOT_COL1 for this input */
+                     prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_COL1);
+                  }
+               }
+            }
+         }
+      }
+   }
+   /* we may need fog */
+   prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_FOGC);
+
+   /* we always have the ATI_fs constants, and the fog params */
+   for (i = 0; i < MAX_NUM_FRAGMENT_CONSTANTS_ATI; i++) {
+      _mesa_add_parameter(prog->Parameters, PROGRAM_UNIFORM,
+                          NULL, 4, GL_FLOAT, NULL, NULL);
+   }
+   _mesa_add_state_reference(prog->Parameters, fog_params_state);
+   _mesa_add_state_reference(prog->Parameters, fog_color);
+
+   prog->NumInstructions = 0;
+   prog->NumTemporaries = MAX_NUM_FRAGMENT_REGISTERS_ATI + 3; /* 3 input temps for arith ops */
+   prog->NumParameters = MAX_NUM_FRAGMENT_CONSTANTS_ATI + 2; /* 2 state variables for fog */
+}
+
+
+struct tgsi_atifs_transform {
+   struct tgsi_transform_context base; /* first member: the context is cast to this struct */
+   struct tgsi_shader_info info;       /* filled by tgsi_scan_shader() on the input tokens */
+   const struct st_fp_variant_key *key; /* supplies texture_targets[] and fog */
+   bool first_instruction_emitted;     /* set once the first instruction was seen */
+   unsigned fog_factor_temp;           /* index of the temporary holding the fog factor */
+   unsigned fog_clamp_imm;             /* index of the (1, 0, 0, 0) clamp immediate */
+};
+
+static inline struct tgsi_atifs_transform *
+tgsi_atifs_transform(struct tgsi_transform_context *tctx)
+{
+   return (struct tgsi_atifs_transform *)tctx; /* base is the first member */
+}
+
+/* Fill source operand i of inst; copied from st_cb_drawpixels_shader.c */
+static void
+set_src(struct tgsi_full_instruction *inst, unsigned i, unsigned file, unsigned index,
+        unsigned x, unsigned y, unsigned z, unsigned w)
+{
+   inst->Src[i].Register.File  = file;
+   inst->Src[i].Register.Index = index;
+   inst->Src[i].Register.SwizzleX = x;
+   inst->Src[i].Register.SwizzleY = y;
+   inst->Src[i].Register.SwizzleZ = z;
+   inst->Src[i].Register.SwizzleW = w;
+}
+/* shorthand: SET_SRC(..., X, Y, Z, W) pastes the TGSI_SWIZZLE_ prefix on */
+#define SET_SRC(inst, i, file, index, x, y, z, w) \
+   set_src(inst, i, file, index, TGSI_SWIZZLE_##x, TGSI_SWIZZLE_##y, \
+           TGSI_SWIZZLE_##z, TGSI_SWIZZLE_##w)
+
+static void
+transform_decl(struct tgsi_transform_context *tctx,
+               struct tgsi_full_declaration *decl)
+{
+   const struct st_fp_variant_key *key = tgsi_atifs_transform(tctx)->key;
+
+   /* Sampler views were declared as 2D at translation time; patch in the
+    * target recorded in the variant key, where a zero entry means "keep". */
+   if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW &&
+       key->texture_targets[decl->Range.First] != 0) {
+      decl->SamplerView.Resource = key->texture_targets[decl->Range.First];
+   }
+
+   tctx->emit_declaration(tctx, decl);
+}
+
+static void
+transform_instr(struct tgsi_transform_context *tctx,
+                struct tgsi_full_instruction *current_inst)
+{
+   struct tgsi_atifs_transform *ctx = tgsi_atifs_transform(tctx);
+   /* one-time setup, emitted ahead of the first instruction */
+   if (ctx->first_instruction_emitted)
+      goto transform_inst;
+
+   ctx->first_instruction_emitted = true;
+
+   if (ctx->key->fog) {
+      /* add a new temp for the fog factor */
+      ctx->fog_factor_temp = ctx->info.file_max[TGSI_FILE_TEMPORARY] + 1;
+      tgsi_transform_temp_decl(tctx, ctx->fog_factor_temp);
+
+      /* add immediates for clamp */
+      ctx->fog_clamp_imm = ctx->info.immediate_count;
+      tgsi_transform_immediate_decl(tctx, 1.0f, 0.0f, 0.0f, 0.0f);
+   }
+
+transform_inst:
+   if (current_inst->Instruction.Opcode == TGSI_OPCODE_TEX) {
+      /* fix texture target */
+      unsigned newtarget = ctx->key->texture_targets[current_inst->Src[1].Register.Index];
+      if (newtarget)
+         current_inst->Texture.Texture = newtarget;
+
+   } else if (ctx->key->fog && current_inst->Instruction.Opcode == TGSI_OPCODE_MOV &&
+              current_inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
+      struct tgsi_full_instruction inst;
+      unsigned i;
+      int fogc_index = -1;
+      /* fog is blended into temporary 0 before the MOV to the output */
+      /* find FOGC input */
+      for (i = 0; i < ctx->info.num_inputs; i++) {
+         if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FOG) {
+            fogc_index = i;
+            break;
+         }
+      }
+      if (fogc_index < 0) {
+         /* should never be reached, because fog coord input is always declared */
+         tctx->emit_instruction(tctx, current_inst);
+         return;
+      }
+
+      /* compute the 1 component fog factor f */
+      if (ctx->key->fog == 1) {
+         /* LINEAR formula: f = (end - z) / (end - start)
+          * with optimized parameters:
+          *    f = MAD(fogcoord, oparams.x, oparams.y)
+          */
+         inst = tgsi_default_full_instruction();
+         inst.Instruction.Opcode = TGSI_OPCODE_MAD;
+         inst.Instruction.NumDstRegs = 1;
+         inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+         inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+         inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+         inst.Instruction.NumSrcRegs = 3;
+         SET_SRC(&inst, 0, TGSI_FILE_INPUT, fogc_index, X, Y, Z, W);
+         SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI, X, X, X, X);
+         SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI, Y, Y, Y, Y);
+         tctx->emit_instruction(tctx, &inst);
+      } else if (ctx->key->fog == 2) {
+         /* EXP formula: f = exp(-dens * z)
+          * with optimized parameters:
+          *    f = MUL(fogcoord, oparams.z); f= EX2(-f)
+          */
+         inst = tgsi_default_full_instruction();
+         inst.Instruction.Opcode = TGSI_OPCODE_MUL;
+         inst.Instruction.NumDstRegs = 1;
+         inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+         inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+         inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+         inst.Instruction.NumSrcRegs = 2;
+         SET_SRC(&inst, 0, TGSI_FILE_INPUT, fogc_index, X, Y, Z, W);
+         SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI, Z, Z, Z, Z);
+         tctx->emit_instruction(tctx, &inst);
+
+         inst = tgsi_default_full_instruction();
+         inst.Instruction.Opcode = TGSI_OPCODE_EX2;
+         inst.Instruction.NumDstRegs = 1;
+         inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+         inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+         inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+         inst.Instruction.NumSrcRegs = 1;
+         SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W);
+         inst.Src[0].Register.Negate = 1;
+         tctx->emit_instruction(tctx, &inst);
+      } else if (ctx->key->fog == 3) {
+         /* EXP2 formula: f = exp(-(dens * z)^2)
+          * with optimized parameters:
+          *    f = MUL(fogcoord, oparams.w); f=MUL(f, f); f= EX2(-f)
+          */
+         inst = tgsi_default_full_instruction();
+         inst.Instruction.Opcode = TGSI_OPCODE_MUL;
+         inst.Instruction.NumDstRegs = 1;
+         inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+         inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+         inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+         inst.Instruction.NumSrcRegs = 2;
+         SET_SRC(&inst, 0, TGSI_FILE_INPUT, fogc_index, X, Y, Z, W);
+         SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI, W, W, W, W);
+         tctx->emit_instruction(tctx, &inst);
+
+         inst = tgsi_default_full_instruction();
+         inst.Instruction.Opcode = TGSI_OPCODE_MUL;
+         inst.Instruction.NumDstRegs = 1;
+         inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+         inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+         inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+         inst.Instruction.NumSrcRegs = 2;
+         SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W);
+         SET_SRC(&inst, 1, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W);
+         tctx->emit_instruction(tctx, &inst);
+
+         inst = tgsi_default_full_instruction();
+         inst.Instruction.Opcode = TGSI_OPCODE_EX2;
+         inst.Instruction.NumDstRegs = 1;
+         inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+         inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+         inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+         inst.Instruction.NumSrcRegs = 1;
+         SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W);
+         inst.Src[0].Register.Negate ^= 1;
+         tctx->emit_instruction(tctx, &inst);
+      }
+      /* f = CLAMP(f, 0.0, 1.0) */
+      inst = tgsi_default_full_instruction();
+      inst.Instruction.Opcode = TGSI_OPCODE_CLAMP;
+      inst.Instruction.NumDstRegs = 1;
+      inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+      inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+      inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+      inst.Instruction.NumSrcRegs = 3;
+      SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W);
+      SET_SRC(&inst, 1, TGSI_FILE_IMMEDIATE, ctx->fog_clamp_imm, Y, Y, Y, Y); /* 0.0 */
+      SET_SRC(&inst, 2, TGSI_FILE_IMMEDIATE, ctx->fog_clamp_imm, X, X, X, X); /* 1.0 */
+      tctx->emit_instruction(tctx, &inst);
+
+      /* REG0 = LRP(f, REG0, fogcolor); NOTE(review): W reads f.y, not f.x -- confirm intent */
+      inst = tgsi_default_full_instruction();
+      inst.Instruction.Opcode = TGSI_OPCODE_LRP;
+      inst.Instruction.NumDstRegs = 1;
+      inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+      inst.Dst[0].Register.Index = 0;
+      inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+      inst.Instruction.NumSrcRegs = 3;
+      SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, X, X, Y);
+      SET_SRC(&inst, 1, TGSI_FILE_TEMPORARY, 0, X, Y, Z, W);
+      SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI + 1, X, Y, Z, W);
+      tctx->emit_instruction(tctx, &inst);
+   }
+   /* the (possibly patched) original instruction is always emitted last */
+   tctx->emit_instruction(tctx, current_inst);
+}
+
+/*
+ * Post-process the translated tokens for one variant: patch the texture
+ * targets and splice in the fog code.  Returns NULL on allocation failure.
+ */
+const struct tgsi_token *
+st_fixup_atifs(const struct tgsi_token *tokens,
+               const struct st_fp_variant_key *key)
+{
+   /* headroom for the declarations and instructions the transform adds */
+   const int newlen = tgsi_num_tokens(tokens) + 30;
+   struct tgsi_token *newtoks = tgsi_alloc_tokens(newlen);
+   struct tgsi_atifs_transform ctx;
+
+   if (!newtoks)
+      return NULL;
+
+   memset(&ctx, 0, sizeof(ctx));
+   ctx.key = key;
+   ctx.base.transform_declaration = transform_decl;
+   ctx.base.transform_instruction = transform_instr;
+   tgsi_scan_shader(tokens, &ctx.info);
+
+   tgsi_transform_shader(tokens, newtoks, newlen, &ctx.base);
+
+   return newtoks;
+}
+
diff --git a/src/mesa/state_tracker/st_atifs_to_tgsi.h b/src/mesa/state_tracker/st_atifs_to_tgsi.h
new file mode 100644
index 00000000000..c1b6758ba02
--- /dev/null
+++ b/src/mesa/state_tracker/st_atifs_to_tgsi.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2016 Miklós Máté
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef ST_ATIFS_TO_TGSI_H
+#define ST_ATIFS_TO_TGSI_H
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+#include "main/glheader.h"
+#include "pipe/p_defines.h"
+
+struct gl_context;
+struct gl_program;
+struct ureg_program;
+struct tgsi_token;
+struct ati_fragment_shader;
+struct st_fp_variant_key;
+
+enum pipe_error
+st_translate_atifs_program(
+    struct ureg_program *ureg,
+    struct ati_fragment_shader *atifs,
+    struct gl_program *program,
+    GLuint numInputs,
+    const GLuint inputMapping[],
+    const ubyte inputSemanticName[],
+    const ubyte inputSemanticIndex[],
+    const GLuint interpMode[],
+    GLuint numOutputs,
+    const GLuint outputMapping[],
+    const ubyte outputSemanticName[],
+    const ubyte outputSemanticIndex[]);
+
+
+void
+st_init_atifs_prog(struct gl_context *ctx, struct gl_program *prog);
+
+const struct tgsi_token *
+st_fixup_atifs(const struct tgsi_token *tokens,
+               const struct st_fp_variant_key *key);
+
+#if defined __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* ST_ATIFS_TO_TGSI_H */
diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index 407dfd31c80..4d9b344111b 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -64,6 +64,21 @@ void st_upload_constants( struct st_context *st,
           shader_type == PIPE_SHADER_TESS_EVAL ||
           shader_type == PIPE_SHADER_COMPUTE);
 
+   /* update the ATI constants before rendering */
+   struct ati_fragment_shader *ati_fs = st->fp->ati_fs;
+   if (shader_type == PIPE_SHADER_FRAGMENT && ati_fs) {
+      unsigned c;
+
+      for (c = 0; c < MAX_NUM_FRAGMENT_CONSTANTS_ATI; c++) {
+         if (ati_fs->LocalConstDef & (1 << c))
+            memcpy(params->ParameterValues[c],
+                   ati_fs->Constants[c], sizeof(GLfloat) * 4);
+         else
+            memcpy(params->ParameterValues[c],
+                   st->ctx->ATIFragmentShader.GlobalConstants[c], sizeof(GLfloat) * 4);
+      }
+   }
+
    /* update constants */
    if (params && params->NumParameters) {
       struct pipe_constant_buffer cb;
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 709f0cbcb91..d0c2429dcef 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -38,18 +38,69 @@
 #include "main/imports.h"
 #include "main/mtypes.h"
 #include "main/framebuffer.h"
+#include "main/texobj.h"
+#include "main/texstate.h"
 #include "program/program.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_simple_shaders.h"
 #include "cso_cache/cso_context.h"
+#include "util/u_debug.h"
 
 #include "st_context.h"
 #include "st_atom.h"
 #include "st_program.h"
 
 
+/** Compress the fog function enums into a 2-bit value */
+static GLuint
+translate_fog_mode(GLenum mode)
+{
+   switch (mode) {
+   case GL_LINEAR: return 1;
+   case GL_EXP:    return 2;
+   case GL_EXP2:   return 3;
+   default:
+      return 0;
+   }
+}
+
+static unsigned
+get_texture_target(struct gl_context *ctx, const unsigned unit)
+{
+   struct gl_texture_object *texObj = _mesa_get_tex_unit(ctx, unit)->_Current;
+   gl_texture_index index;
+
+   if (texObj) {
+      index = _mesa_tex_target_to_index(ctx, texObj->Target);
+   } else {
+      /* fallback for missing texture */
+      index = TEXTURE_2D_INDEX;
+   }
+
+   /* Map mesa texture target to TGSI texture target.
+    * Copied from st_mesa_to_tgsi.c, the shadow part is omitted */
+   switch(index) {
+   case TEXTURE_2D_MULTISAMPLE_INDEX: return TGSI_TEXTURE_2D_MSAA;
+   case TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX: return TGSI_TEXTURE_2D_ARRAY_MSAA;
+   case TEXTURE_BUFFER_INDEX: return TGSI_TEXTURE_BUFFER;
+   case TEXTURE_1D_INDEX:   return TGSI_TEXTURE_1D;
+   case TEXTURE_2D_INDEX:   return TGSI_TEXTURE_2D;
+   case TEXTURE_3D_INDEX:   return TGSI_TEXTURE_3D;
+   case TEXTURE_CUBE_INDEX: return TGSI_TEXTURE_CUBE;
+   case TEXTURE_CUBE_ARRAY_INDEX: return TGSI_TEXTURE_CUBE_ARRAY;
+   case TEXTURE_RECT_INDEX: return TGSI_TEXTURE_RECT;
+   case TEXTURE_1D_ARRAY_INDEX:   return TGSI_TEXTURE_1D_ARRAY;
+   case TEXTURE_2D_ARRAY_INDEX:   return TGSI_TEXTURE_2D_ARRAY;
+   case TEXTURE_EXTERNAL_INDEX:   return TGSI_TEXTURE_2D;
+   default:
+      debug_assert(0);
+      return TGSI_TEXTURE_1D;
+   }
+}
+
+
 /**
  * Update fragment program state/atom.  This involves translating the
  * Mesa fragment program into a gallium fragment program and binding it.
@@ -79,6 +130,18 @@ update_fp( struct st_context *st )
       st->ctx->Multisample.MinSampleShadingValue *
       _mesa_geometric_samples(st->ctx->DrawBuffer) > 1;
 
+   if (stfp->ati_fs) {
+      unsigned u;
+
+      if (st->ctx->Fog.Enabled) {
+         key.fog = translate_fog_mode(st->ctx->Fog.Mode);
+      }
+
+      for (u = 0; u < MAX_NUM_FRAGMENT_REGISTERS_ATI; u++) {
+         key.texture_targets[u] = get_texture_target(st->ctx, u);
+      }
+   }
+
    st->fp_variant = st_get_fp_variant(st, stfp, &key);
 
    st_reference_fragprog(st, &st->fp, stfp);
@@ -91,7 +154,7 @@ update_fp( struct st_context *st )
 const struct st_tracked_state st_update_fp = {
    "st_update_fp",					/* name */
    {							/* dirty */
-      _NEW_BUFFERS | _NEW_MULTISAMPLE,			/* mesa */
+      _NEW_BUFFERS | _NEW_MULTISAMPLE | _NEW_FOG,	/* mesa */
       ST_NEW_FRAGMENT_PROGRAM                           /* st */
    },
    update_fp  					/* update */
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 09f4d8e00d1..01ed5441d11 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -1302,6 +1302,7 @@ blit_copy_pixels(struct gl_context *ctx, GLint srcx, GLint srcy,
        !ctx->FragmentProgram.Enabled &&
        !ctx->VertexProgram.Enabled &&
        !ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT] &&
+       !ctx->ATIFragmentShader._Enabled &&
        ctx->DrawBuffer->_NumColorDrawBuffers == 1 &&
        !ctx->Query.CondRenderQuery &&
        !ctx->Query.CurrentOcclusionObject) {
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index 27cc0f3d154..d79cfe239e4 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -46,6 +46,7 @@
 #include "st_mesa_to_tgsi.h"
 #include "st_cb_program.h"
 #include "st_glsl_to_tgsi.h"
+#include "st_atifs_to_tgsi.h"
 
 
 
@@ -302,6 +303,22 @@ st_program_string_notify( struct gl_context *ctx,
       if (st->cp == stcp)
          st->dirty_cp.st |= ST_NEW_COMPUTE_PROGRAM;
    }
+   else if (target == GL_FRAGMENT_SHADER_ATI) {
+      assert(prog);
+
+      struct st_fragment_program *stfp = (struct st_fragment_program *) prog;
+      assert(stfp->ati_fs);
+      assert(stfp->ati_fs->Program == prog);
+
+      st_init_atifs_prog(ctx, prog);
+
+      st_release_fp_variants(st, stfp);
+      if (!st_translate_fragment_program(st, stfp))
+         return false;
+
+      if (st->fp == stfp)
+         st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
+   }
 
    if (ST_DEBUG & DEBUG_PRECOMPILE ||
        st->shader_has_one_variant[stage])
@@ -310,6 +327,19 @@ st_program_string_notify( struct gl_context *ctx,
    return GL_TRUE;
 }
 
+/**
+ * Called via ctx->Driver.NewATIfs()
+ * Called in glEndFragmentShaderATI()
+ */
+static struct gl_program *
+st_new_ati_fs(struct gl_context *ctx, struct ati_fragment_shader *curProg)
+{
+   struct gl_program *prog = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB,
+         curProg->Id);
+   struct st_fragment_program *stfp = (struct st_fragment_program *)prog;
+   stfp->ati_fs = curProg;
+   return prog;
+}
 
 /**
  * Plug in the program and shader-related device driver functions.
@@ -322,6 +352,7 @@ st_init_program_functions(struct dd_function_table *functions)
    functions->NewProgram = st_new_program;
    functions->DeleteProgram = st_delete_program;
    functions->ProgramStringNotify = st_program_string_notify;
+   functions->NewATIfs = st_new_ati_fs;
    
    functions->LinkShader = st_link_shader;
 }
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 80dcfd82743..94dc48971ec 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -53,6 +53,7 @@
 #include "st_context.h"
 #include "st_program.h"
 #include "st_mesa_to_tgsi.h"
+#include "st_atifs_to_tgsi.h"
 #include "cso_cache/cso_context.h"
 
 
@@ -811,7 +812,22 @@ st_translate_fragment_program(struct st_context *st,
 
       free_glsl_to_tgsi_visitor(stfp->glsl_to_tgsi);
       stfp->glsl_to_tgsi = NULL;
-   } else
+   } else if (stfp->ati_fs)
+      st_translate_atifs_program(ureg,
+                                 stfp->ati_fs,
+                                 &stfp->Base.Base,
+                                 /* inputs */
+                                 fs_num_inputs,
+                                 inputMapping,
+                                 input_semantic_name,
+                                 input_semantic_index,
+                                 interpMode,
+                                 /* outputs */
+                                 fs_num_outputs,
+                                 outputMapping,
+                                 fs_output_semantic_name,
+                                 fs_output_semantic_index);
+   else
       st_translate_mesa_program(st->ctx,
                                 TGSI_PROCESSOR_FRAGMENT,
                                 ureg,
@@ -849,6 +865,16 @@ st_create_fp_variant(struct st_context *st,
 
    assert(!(key->bitmap && key->drawpixels));
 
+   /* Fix texture targets and add fog for ATI_fs */
+   if (stfp->ati_fs) {
+      const struct tgsi_token *tokens = st_fixup_atifs(tgsi.tokens, key);
+
+      if (tokens)
+         tgsi.tokens = tokens;
+      else
+         fprintf(stderr, "mesa: cannot post-process ATI_fs\n");
+   }
+
    /* Emulate features. */
    if (key->clamp_color || key->persample_shading) {
       const struct tgsi_token *tokens;
@@ -858,9 +884,11 @@ st_create_fp_variant(struct st_context *st,
 
       tokens = tgsi_emulate(tgsi.tokens, flags);
 
-      if (tokens)
+      if (tokens) {
+         if (tgsi.tokens != stfp->tgsi.tokens)
+            tgsi_free_tokens(tgsi.tokens);
          tgsi.tokens = tokens;
-      else
+      } else
          fprintf(stderr, "mesa: cannot emulate deprecated features\n");
    }
 
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index 028fba99a74..7c90fd74e14 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -35,6 +35,7 @@
 #define ST_PROGRAM_H
 
 #include "main/mtypes.h"
+#include "main/atifragshader.h"
 #include "program/program.h"
 #include "pipe/p_state.h"
 #include "st_context.h"
@@ -65,6 +66,12 @@ struct st_fp_variant_key
 
    /** for ARB_sample_shading */
    GLuint persample_shading:1;
+
+   /** needed for ATI_fragment_shader */
+   GLuint fog:2;
+
+   /** needed for ATI_fragment_shader */
+   char texture_targets[MAX_NUM_FRAGMENT_REGISTERS_ATI];
 };
 
 
@@ -99,6 +106,7 @@ struct st_fragment_program
    struct gl_fragment_program Base;
    struct pipe_shader_state tgsi;
    struct glsl_to_tgsi_visitor* glsl_to_tgsi;
+   struct ati_fragment_shader *ati_fs;
 
    struct st_fp_variant *variants;
 };

From 920fbecf57bc8e81db029a52f6ef1c9344d8ddab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mikl=C3=B3s=20M=C3=A1t=C3=A9?= <mtmkls@gmail.com>
Date: Thu, 24 Mar 2016 01:12:58 +0100
Subject: [PATCH 065/238] st/mesa: enable GL_ATI_fragment_shader
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Miklós Máté <mtmkls@gmail.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 docs/relnotes/11.3.0.html              | 1 +
 src/mesa/state_tracker/st_extensions.c | 1 +
 2 files changed, 2 insertions(+)

diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
index acd8e11e3fc..508fbd34901 100644
--- a/docs/relnotes/11.3.0.html
+++ b/docs/relnotes/11.3.0.html
@@ -48,6 +48,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_shader_atomic_counter_ops on nvc0</li>
 <li>GL_ARB_shader_image_load_store on radeonsi</li>
 <li>GL_ARB_shader_image_size on radeonsi</li>
+<li>GL_ATI_fragment_shader on all Gallium drivers</li>
 <li>GL_OES_texture_border_clamp and GL_EXT_texture_border_clamp on all drivers that support GL_ARB_texture_border_clamp</li>
 <li>GL_OES_shader_image_atomic on all drivers that support GL_ARB_shader_image_load_store</li>
 </ul>
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 0d6c6b196a1..44d93e30b4d 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -810,6 +810,7 @@ void st_init_extensions(struct pipe_screen *screen,
    extensions->EXT_texture_env_dot3 = GL_TRUE;
    extensions->EXT_vertex_array_bgra = GL_TRUE;
 
+   extensions->ATI_fragment_shader = GL_TRUE;
    extensions->ATI_texture_env_combine3 = GL_TRUE;
 
    extensions->MESA_pack_invert = GL_TRUE;

From baab345b192d207236253ce67b320fb32fa67625 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mikl=C3=B3s=20M=C3=A1t=C3=A9?= <mtmkls@gmail.com>
Date: Thu, 24 Mar 2016 01:13:00 +0100
Subject: [PATCH 066/238] st/mesa: fix handling the fallback texture
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes a crash when post-processing is enabled in SW:KotOR.

v2: fix const-ness
v3: move assignment into the if() block

Signed-off-by: Miklós Máté <mtmkls@gmail.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_atom_sampler.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c
index 82dcf5ee0ca..a1cfa1c34c5 100644
--- a/src/mesa/state_tracker/st_atom_sampler.c
+++ b/src/mesa/state_tracker/st_atom_sampler.c
@@ -133,18 +133,19 @@ convert_sampler(struct st_context *st,
 {
    const struct gl_texture_object *texobj;
    struct gl_context *ctx = st->ctx;
-   struct gl_sampler_object *msamp;
+   const struct gl_sampler_object *msamp;
    GLenum texBaseFormat;
 
    texobj = ctx->Texture.Unit[texUnit]._Current;
    if (!texobj) {
       texobj = _mesa_get_fallback_texture(ctx, TEXTURE_2D_INDEX);
+      msamp = &texobj->Sampler;
+   } else {
+      msamp = _mesa_get_samplerobj(ctx, texUnit);
    }
 
    texBaseFormat = _mesa_texture_base_format(texobj);
 
-   msamp = _mesa_get_samplerobj(ctx, texUnit);
-
    memset(sampler, 0, sizeof(*sampler));
    sampler->wrap_s = gl_wrap_xlate(msamp->WrapS);
    sampler->wrap_t = gl_wrap_xlate(msamp->WrapT);

From 50d653c2bbe7e6a7c0893b13ad6e57b68f55f8e4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mikl=C3=B3s=20M=C3=A1t=C3=A9?= <mtmkls@gmail.com>
Date: Thu, 24 Mar 2016 01:13:02 +0100
Subject: [PATCH 067/238] mesa: optimize out the realloc from
 glCopyTexImagexD()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

v2: comment about the purpose of the code
v3: also compare texFormat,
 add a perf debug message,
 formatting fixes

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Signed-off-by: Miklós Máté <mtmkls@gmail.com>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/main/teximage.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 616a92953e7..5af0a2ec198 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -3484,6 +3484,24 @@ formats_differ_in_component_sizes(mesa_format f1, mesa_format f2)
    return GL_FALSE;
 }
 
+static bool
+can_avoid_reallocation(struct gl_texture_image *texImage, GLenum internalFormat,
+                       mesa_format texFormat, GLint x, GLint y, GLsizei width,
+                       GLsizei height, GLint border)
+{
+   if (texImage->InternalFormat != internalFormat)
+      return false;
+   if (texImage->TexFormat != texFormat)
+      return false;
+   if (texImage->Border != border)
+      return false;
+   if (texImage->Width2 != width)
+      return false;
+   if (texImage->Height2 != height)
+      return false;
+   return true;
+}
+
 /**
  * Implement the glCopyTexImage1/2D() functions.
  */
@@ -3527,6 +3545,24 @@ copyteximage(struct gl_context *ctx, GLuint dims,
    texFormat = _mesa_choose_texture_format(ctx, texObj, target, level,
                                            internalFormat, GL_NONE, GL_NONE);
 
+   /* First check if reallocating the texture buffer can be avoided.
+    * Without the realloc the copy can be 20x faster.
+    */
+   _mesa_lock_texture(ctx, texObj);
+   {
+      texImage = _mesa_select_tex_image(texObj, target, level);
+      if (texImage && can_avoid_reallocation(texImage, internalFormat, texFormat,
+                                             x, y, width, height, border)) {
+         _mesa_unlock_texture(ctx, texObj);
+         return _mesa_copy_texture_sub_image(ctx, dims, texObj, target, level,
+                                             0, 0, 0, x, y, width, height,
+                                             "CopyTexImage");
+      }
+   }
+   _mesa_unlock_texture(ctx, texObj);
+   _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_LOW, "glCopyTexImage "
+                    "can't avoid reallocating texture storage\n");
+
    rb = _mesa_get_read_renderbuffer_for_format(ctx, internalFormat);
 
    if (_mesa_is_gles3(ctx)) {

From 21c479256a4f195bce89bc313ab9367deef20bb1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Mon, 21 Mar 2016 12:18:40 +0100
Subject: [PATCH 068/238] st/mesa: only minify height if target != 1D array in
 st_finalize_texture

The st_texture_object documentation says:
  "the number of 1D array layers will be in height0"

We can't minify that.

Spotted by luck. No app is known to hit this issue.

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/mesa/state_tracker/st_cb_texture.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 460c1790663..3980f5d2f51 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -2886,12 +2886,17 @@ st_finalize_texture(struct gl_context *ctx,
          /* Need to import images in main memory or held in other textures.
           */
          if (stImage && stObj->pt != stImage->pt) {
+            GLuint height = stObj->height0;
             GLuint depth = stObj->depth0;
+
+            if (stObj->base.Target != GL_TEXTURE_1D_ARRAY)
+               height = u_minify(height, level);
             if (stObj->base.Target == GL_TEXTURE_3D)
                depth = u_minify(depth, level);
+
             if (level == 0 ||
                 (stImage->base.Width == u_minify(stObj->width0, level) &&
-                 stImage->base.Height == u_minify(stObj->height0, level) &&
+                 stImage->base.Height == height &&
                  stImage->base.Depth == depth)) {
                /* src image fits expected dest mipmap level size */
                copy_image_data_to_texture(st, stObj, level, stImage);

From 6262d6125a175d64045083341e74985652f44f17 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Sun, 27 Mar 2016 19:11:09 +0200
Subject: [PATCH 069/238] gallium/util: fix up inaccurate behavior of
 util_framebuffer_state_equal (v2)

v2: move the nr_cbufs check above the loop

Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu> (v1)
---
 src/gallium/auxiliary/util/u_framebuffer.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/gallium/auxiliary/util/u_framebuffer.c b/src/gallium/auxiliary/util/u_framebuffer.c
index 2e0ef749e82..49b391d8162 100644
--- a/src/gallium/auxiliary/util/u_framebuffer.c
+++ b/src/gallium/auxiliary/util/u_framebuffer.c
@@ -55,16 +55,16 @@ util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst,
        dst->height != src->height)
       return FALSE;
 
-   for (i = 0; i < Elements(src->cbufs); i++) {
+   if (dst->nr_cbufs != src->nr_cbufs) {
+      return FALSE;
+   }
+
+   for (i = 0; i < src->nr_cbufs; i++) {
       if (dst->cbufs[i] != src->cbufs[i]) {
          return FALSE;
       }
    }
 
-   if (dst->nr_cbufs != src->nr_cbufs) {
-      return FALSE;
-   }
-
    if (dst->zsbuf != src->zsbuf) {
       return FALSE;
    }

From 6711f159d94f37f3f45d36a8cb172cfc00692875 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 20 Mar 2016 13:11:01 -0400
Subject: [PATCH 070/238] nv50/ir: saturate depth writes

Apparently there's no post-FS clamping logic, so we have to do this by
hand. The depth will never be outside of the 0..1 range, even on
floating point zeta buffers, so this should be safe.

Fixes dEQP-GLES3.functional.fbo.depth.*clamp.* which tests writing
invalid values on various zeta buffer formats.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 611d5f9c3ed..4f012cd3b91 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -3536,8 +3536,11 @@ Converter::exportOutputs()
          Symbol *sym = mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32,
                                 info->out[i].slot[c] * 4);
          Value *val = oData.load(sub.cur->values, i, c, NULL);
-         if (val)
+         if (val) {
+            if (info->out[i].sn == TGSI_SEMANTIC_POSITION)
+               mkOp1(OP_SAT, TYPE_F32, val, val);
             mkStore(OP_EXPORT, TYPE_F32, sym, NULL, val);
+         }
       }
    }
 }

From f667d15561820ee9dd8e836d43cce3ee52a4780e Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sun, 20 Mar 2016 17:26:13 -0400
Subject: [PATCH 071/238] nvc0/ir: fix picking of coordinates from tex
 instruction for textureGrad

On Fermi, there's an argument in front of the coords that combines array
and indirect handle, while on Kepler the array and the indirect handle
are separate (and in front of the coords). We were previously only
accounting for the array bit of it; if there was an indirect access, it
wouldn't be counted in the formula.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Cc: "11.1 11.2" <mesa-stable@lists.freedesktop.org>
---
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp        | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index e8f8e30918b..122a6b74279 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -874,7 +874,17 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
    Value *zero = bld.loadImm(bld.getSSA(), 0);
    int l, c;
    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
-   const int array = i->tex.target.isArray();
+
+   // This function is invoked after handleTEX lowering, so we have to expect
+   // the arguments in the order that the hw wants them. For Fermi, array and
+   // indirect are both in the leading arg, while for Kepler, array and
+   // indirect are separate (and both precede the coordinates). Maxwell is
+   // handled in a separate function.
+   unsigned array;
+   if (targ->getChipset() < NVISA_GK104_CHIPSET)
+      array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
+   else
+      array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);
 
    i->op = OP_TEX; // no need to clone dPdx/dPdy later
 

From 41100b6b44e747b9003937f123fce571fd3dec46 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 26 Mar 2016 22:32:43 -0400
Subject: [PATCH 072/238] nvc0: disable primitive restart and index bias during
 blits

Back in the dawn of time, we used to do immediate uploads for the vertex
data, and all was well. However Maxwell dropped support for immediate
vertex data, so we started feeding in a VBO (in all cases). But we
forgot to disable some things that apply in such cases, specifically
primitive restart and index bias. The latter was causing WoW and other
Blizzard games trouble as they use a pattern where they draw with a base
vertex (aka index bias), followed by texture uploads (aka blits,
internally).

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=91526
Cc: "11.1 11.2" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Tested-by: Karol Herbst <nouveau@karolherbst.de>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_surface.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index e8b3a4d549a..3ebacb6b234 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -1303,6 +1303,17 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
    }
    nvc0->state.num_vtxelts = 2;
 
+   if (nvc0->state.prim_restart) {
+      IMMED_NVC0(push, NVC0_3D(PRIM_RESTART_ENABLE), 0);
+      nvc0->state.prim_restart = 0;
+   }
+
+   if (nvc0->state.index_bias) {
+      IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_BASE), 0);
+      IMMED_NVC0(push, NVC0_3D(VERTEX_ID_BASE), 0);
+      nvc0->state.index_bias = 0;
+   }
+
    for (i = 0; i < info->dst.box.depth; ++i, z += dz) {
       if (info->dst.box.z + i) {
          BEGIN_NVC0(push, NVC0_3D(LAYER), 1);

From b9f1affb2e52a7a75d968af117a03b01866546cb Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 28 Mar 2016 00:52:00 -0400
Subject: [PATCH 073/238] nvc0: make sure to disable fetches from
 previously-set VBOs when blitting

We disable the vertex attributes, but disable the VBO fetch details as
well, just in case. Not known to fix anything.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_surface.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index 3ebacb6b234..e657204128e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -1295,6 +1295,8 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
                       NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32 |
                       NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST);
    }
+   for (i = 1; i < n; ++i)
+      IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 0);
    if (nvc0->state.instance_elts) {
       nvc0->state.instance_elts = 0;
       BEGIN_NVC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_PER_INSTANCE), 2);

From 3ca034cada87aea58a92113cb38cf92a97d70c55 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Sat, 6 Feb 2016 09:09:52 -0500
Subject: [PATCH 074/238] freedreno/ir3: fix compiler warn

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 54315d2f592..88f6e168558 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1109,7 +1109,7 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 	default:
 		compile_error(ctx, "Unhandled store deref type: %u\n",
 				darr->deref_array_type);
-		break;
+		return;
 	}
 
 	for (int i = 0; i < intr->num_components; i++) {

From b4c72b792caecd8be271af20de92d24b4ae7da4c Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Mon, 21 Mar 2016 19:55:37 -0400
Subject: [PATCH 075/238] freedreno/ir3: fix for load_front_face intrinsic

Seems like trying to widen in the same instruction as the add.s does a
non-sign-extending widen.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
---
 src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 88f6e168558..3d656d4a34d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1258,7 +1258,14 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 			ctx->frag_face = create_input(b, 0);
 			ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
 		}
-		dst[0] = ir3_ADD_S(b, ctx->frag_face, 0, create_immed(b, 1), 0);
+		/* for fragface, we always get -1 or 0, but that is inverse
+		 * of what nir expects (where ~0 is true).  Unfortunately
+		 * trying to widen from half to full in add.s seems to do a
+		 * non-sign-extending widen (resulting in something that
+		 * gets interpreted as float Inf??)
+		 */
+		dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
+		dst[0] = ir3_ADD_S(b, dst[0], 0, create_immed(b, 1), 0);
 		break;
 	case nir_intrinsic_discard_if:
 	case nir_intrinsic_discard: {

From dd5f0950e4105b022d6c909e0a39fe38426312c4 Mon Sep 17 00:00:00 2001
From: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Date: Mon, 28 Mar 2016 17:01:49 +0200
Subject: [PATCH 076/238] mesa/st: Fix NULL access if no fragment shader is
 bound
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Bas Nieuwenhuizen <bas@basnieuwenhuizen.nl>
Signed-off-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_atom_constbuf.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index 4d9b344111b..a980dbedac5 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -65,8 +65,8 @@ void st_upload_constants( struct st_context *st,
           shader_type == PIPE_SHADER_COMPUTE);
 
    /* update the ATI constants before rendering */
-   struct ati_fragment_shader *ati_fs = st->fp->ati_fs;
-   if (shader_type == PIPE_SHADER_FRAGMENT && ati_fs) {
+   if (shader_type == PIPE_SHADER_FRAGMENT && st->fp->ati_fs) {
+      struct ati_fragment_shader *ati_fs = st->fp->ati_fs;
       unsigned c;
 
       for (c = 0; c < MAX_NUM_FRAGMENT_CONSTANTS_ATI; c++) {

From c0a9cbea4d7a6cede911cf82b57d7612f48d48a2 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 24 Mar 2016 13:57:57 -0400
Subject: [PATCH 077/238] glx: Unbreak generating some of the xorg glx headers

Broken by:

    commit 9ace0b542241c77ae82a0835ac8a09e2a7510eaf
    Author: Dylan Baker <baker.dylan.c@gmail.com>
    Date:   Wed May 20 15:49:11 2015 -0700

	glapi: glX_proto_size.py: use argparse instead of getopt

Which changed most, but not all, callers to use --header-tag instead of
-h.

Reviewed-by: Dylan Baker <baker.dylan.c@gmail.com>
Signed-off-by: Adam Jackson <ajax@redhat.com>
---
 src/mapi/glapi/gen/Makefile.am | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 8421af48854..0d5e408c10b 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -330,7 +330,7 @@ $(XORG_GLX_DIR)/indirect_dispatch.h: glX_proto_recv.py gl_and_glX_API.xml $(COMM
 
 $(XORG_GLX_DIR)/indirect_size_get.h: glX_proto_size.py $(COMMON_GLX)
 	$(PYTHON_GEN) $< -f $(srcdir)/gl_API.xml -m size_h \
-           --only-get -h '_INDIRECT_SIZE_GET_H_' \
+           --only-get --header-tag '_INDIRECT_SIZE_GET_H_' \
 	  | $(INDENT) $(XORG_INDENT_FLAGS) > $@
 
 $(XORG_GLX_DIR)/indirect_size_get.c: glX_proto_size.py $(COMMON_GLX)
@@ -339,7 +339,7 @@ $(XORG_GLX_DIR)/indirect_size_get.c: glX_proto_size.py $(COMMON_GLX)
 
 $(XORG_GLX_DIR)/indirect_reqsize.h: glX_proto_size.py $(COMMON_GLX)
 	$(PYTHON_GEN) $< -f $(srcdir)/gl_API.xml -m reqsize_h \
-           --only-get -h '_INDIRECT_SIZE_GET_H_' \
+           --only-get --header-tag '_INDIRECT_SIZE_GET_H_' \
 	  | $(INDENT) $(XORG_INDENT_FLAGS) > $@
 
 $(XORG_GLX_DIR)/indirect_reqsize.c: glX_proto_size.py $(COMMON_GLX)

From ce3f0b23d1ee3e854780d01effb90a7c39134ffd Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 24 Mar 2016 13:57:57 -0400
Subject: [PATCH 078/238] glapi/glx: Emit direct GL calls instead of dispatch
 lookup

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Signed-off-by: Adam Jackson <ajax@redhat.com>
---
 src/mapi/glapi/gen/Makefile.am       | 11 ----------
 src/mapi/glapi/gen/glX_proto_recv.py | 33 +++++++++-------------------
 2 files changed, 10 insertions(+), 34 deletions(-)

diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 0d5e408c10b..1e346321292 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -89,18 +89,7 @@ EXTRA_DIST= \
 XORG_GLX_DIR = $(XORG_BASE)/glx
 XORG_GLAPI_DIR = $(XORG_BASE)/glx
 
-XORG_GLAPI_OUTPUTS = \
-	$(XORG_GLAPI_DIR)/glprocs.h \
-	$(XORG_GLAPI_DIR)/glapitable.h \
-	$(XORG_GLAPI_DIR)/dispatch.h
-
-if HAVE_APPLEDRI
-XORG_GLAPI_OUTPUTS += \
-	$(XORG_GLAPI_DIR)/glapi_gentable.c
-endif
-
 XORG_OUTPUTS = \
-	$(XORG_GLAPI_OUTPUTS) \
 	$(XORG_GLX_DIR)/indirect_dispatch.c \
 	$(XORG_GLX_DIR)/indirect_dispatch_swap.c \
 	$(XORG_GLX_DIR)/indirect_dispatch.h \
diff --git a/src/mapi/glapi/gen/glX_proto_recv.py b/src/mapi/glapi/gen/glX_proto_recv.py
index 5d95f278a91..916da944689 100644
--- a/src/mapi/glapi/gen/glX_proto_recv.py
+++ b/src/mapi/glapi/gen/glX_proto_recv.py
@@ -80,21 +80,14 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
 
 
     def printRealHeader(self):
-        print '#include <X11/Xmd.h>'
-        print '#include <GL/gl.h>'
-        print '#include <GL/glxproto.h>'
-
         print '#include <inttypes.h>'
+        print '#include "glxserver.h"'
         print '#include "indirect_size.h"'
         print '#include "indirect_size_get.h"'
         print '#include "indirect_dispatch.h"'
-        print '#include "glxserver.h"'
         print '#include "glxbyteorder.h"'
         print '#include "indirect_util.h"'
         print '#include "singlesize.h"'
-        print '#include "glapi.h"'
-        print '#include "glapitable.h"'
-        print '#include "dispatch.h"'
         print ''
         print '#define __GLX_PAD(x)  (((x) + 3) & ~3)'
         print ''
@@ -238,13 +231,7 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
             list.append( '%s        %s' % (indent, location) )
 
 
-        if len( list ):
-            print '%s    %sCALL_%s( GET_DISPATCH(), (' % (indent, retval_assign, f.name)
-            print string.join( list, ",\n" )
-            print '%s    ) );' % (indent)
-        else:
-            print '%s    %sCALL_%s( GET_DISPATCH(), () );' % (indent, retval_assign, f.name)
-        return
+        print '%s    %sgl%s(%s);' % (indent, retval_assign, f.name, string.join(list, ',\n'))
 
 
     def common_func_print_just_start(self, f, indent):
@@ -507,18 +494,18 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
             # the must NEVER be byte-swapped.
 
             if not (img.img_type == "GL_BITMAP" and img.img_format == "GL_COLOR_INDEX"):
-                print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SWAP_BYTES,   hdr->swapBytes) );'
+                print '    glPixelStorei(GL_UNPACK_SWAP_BYTES, hdr->swapBytes);'
 
-            print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_LSB_FIRST,    hdr->lsbFirst) );'
+            print '    glPixelStorei(GL_UNPACK_LSB_FIRST, hdr->lsbFirst);'
 
-            print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_ROW_LENGTH,   (GLint) %shdr->rowLength%s) );' % (pre, post)
+            print '    glPixelStorei(GL_UNPACK_ROW_LENGTH, (GLint) %shdr->rowLength%s);' % (pre, post)
             if img.depth:
-                print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_IMAGE_HEIGHT, (GLint) %shdr->imageHeight%s) );' % (pre, post)
-            print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SKIP_ROWS,    (GLint) %shdr->skipRows%s) );' % (pre, post)
+                print '    glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, (GLint) %shdr->imageHeight%s);' % (pre, post)
+            print '    glPixelStorei(GL_UNPACK_SKIP_ROWS, (GLint) %shdr->skipRows%s);' % (pre, post)
             if img.depth:
-                print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SKIP_IMAGES,  (GLint) %shdr->skipImages%s) );' % (pre, post)
-            print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SKIP_PIXELS,  (GLint) %shdr->skipPixels%s) );' % (pre, post)
-            print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_ALIGNMENT,    (GLint) %shdr->alignment%s) );' % (pre, post)
+                print '    glPixelStorei(GL_UNPACK_SKIP_IMAGES, (GLint) %shdr->skipImages%s);' % (pre, post)
+            print '    glPixelStorei(GL_UNPACK_SKIP_PIXELS, (GLint) %shdr->skipPixels%s);' % (pre, post)
+            print '    glPixelStorei(GL_UNPACK_ALIGNMENT, (GLint) %shdr->alignment%s);' % (pre, post)
             print ''
 
 

From c2f0bc2537b8e3d8a8dc4b8fb2e7c54b89cab60d Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 24 Mar 2016 13:57:57 -0400
Subject: [PATCH 079/238] glapi/glx: Thunk non-ABI calls through GetProcAddress

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Signed-off-by: Adam Jackson <ajax@redhat.com>
---
 src/mapi/glapi/gen/glX_proto_recv.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/mapi/glapi/gen/glX_proto_recv.py b/src/mapi/glapi/gen/glX_proto_recv.py
index 916da944689..1cfa8c5c142 100644
--- a/src/mapi/glapi/gen/glX_proto_recv.py
+++ b/src/mapi/glapi/gen/glX_proto_recv.py
@@ -117,6 +117,9 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
 
         return
 
+    def fptrType(self, name):
+	fptr = "pfngl" + name + "proc"
+	return fptr.upper()
 
     def printFunction(self, f, name):
         if (f.glx_sop or f.glx_vendorpriv) and (len(f.get_images()) != 0):
@@ -134,6 +137,9 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
 
         print '{'
 
+        if not f.is_abi():
+            print '    %s %s = __glGetProcAddress("gl%s");' % (self.fptrType(name), name, name)
+
         if f.glx_rop or f.vectorequiv:
             self.printRenderFunction(f)
         elif f.glx_sop or f.glx_vendorpriv:
@@ -218,6 +224,7 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
 
     def emit_function_call(self, f, retval_assign, indent):
         list = []
+        prefix = "gl" if f.is_abi() else ""
 
         for param in f.parameterIterator():
             if param.is_padding:
@@ -230,8 +237,7 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
 
             list.append( '%s        %s' % (indent, location) )
 
-
-        print '%s    %sgl%s(%s);' % (indent, retval_assign, f.name, string.join(list, ',\n'))
+        print '%s    %s%s%s(%s);' % (indent, retval_assign, prefix, f.name, string.join(list, ',\n'))
 
 
     def common_func_print_just_start(self, f, indent):

From 2b8492d63ea14f36da00803620e89483743e77e7 Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 24 Mar 2016 13:57:57 -0400
Subject: [PATCH 080/238] glapi/glx: Treat xserver generated targets as .PHONY

Meaning, always rebuild them when asked instead of bothering to look at
timestamps (and then wondering why nothing happened when you said make).

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Signed-off-by: Adam Jackson <ajax@redhat.com>
---
 src/mapi/glapi/gen/Makefile.am | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 1e346321292..fff6805fbcd 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -100,6 +100,8 @@ XORG_OUTPUTS = \
 	$(XORG_GLX_DIR)/indirect_size_get.h \
 	$(XORG_GLX_DIR)/indirect_table.c
 
+.PHONY: $(XORG_OUTPUTS)
+
 ######################################################################
 
 API_XML = \

From 668b6ddfc5cb0b0d7e0e394804d80094e4f8e623 Mon Sep 17 00:00:00 2001
From: Rhys Kidd <rhyskidd@gmail.com>
Date: Sat, 19 Mar 2016 18:37:57 -0400
Subject: [PATCH 081/238] vc4: Remove unused include from
 vc4_nir_lower_txf_ms.c

Found with grep and inspection. Test compiled on RPi hw.
Assists any future effort to remove TGSI as an intermediate stage.

Signed-off-by: Rhys Kidd <rhyskidd@gmail.com>
Signed-off-by: Eric Anholt <eric@anholt.net>
---
 src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
index a2d89ef3349..8b65cac5084 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
@@ -23,7 +23,6 @@
 
 #include "vc4_qir.h"
 #include "kernel/vc4_packet.h"
-#include "tgsi/tgsi_info.h"
 #include "compiler/nir/nir_builder.h"
 
 /** @file vc4_nir_lower_txf_ms.c

From de505f7d7bfff3fe031242589c8986ded201c837 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 24 Mar 2016 16:21:35 -0700
Subject: [PATCH 082/238] i965: Whack UAV bit when FS discards and there are no
 color writes.

dEQP-GLES31.functional.fbo.no_attachments.* draws a quad with no
framebuffer attachments, using a shader that discards based on
gl_FragCoord.  It uses occlusion queries to inspect whether pixels
are rendered or not.

Unfortunately, the hardware is not dispatching any pixel shaders,
so discards never happen, and the full quad of pixels increments
PS_DEPTH_COUNT, making the occlusion query results bogus.

To understand why, we have to delve into the WM_INT internal
signalling mechanism's formulas.

The "WM_INT::Pixel Shader Kill Pixel" signal is defined as:

    3DSTATE_WM::ForceKillPixel == ON ||
    (3DSTATE_WM::ForceKillPixel != Off &&
     !WM_INT::WM_HZ_OP &&
     3DSTATE_WM::EDSC_Mode != PREPS &&
     (WM_INT::Depth Write Enable || WM_INT::Stencil Write Enable) &&
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
     (3DSTATE_PS_EXTRA::PixelShaderKillsPixels ||
      3DSTATE_PS_EXTRA:: oMask Present to RenderTarget ||
      3DSTATE_PS_BLEND::AlphaToCoverageEnable ||
      3DSTATE_PS_BLEND::AlphaTestEnable ||
      3DSTATE_WM_CHROMAKEY::ChromaKeyKillEnable))

Because there is no depth or stencil buffer, writes to those buffers
are disabled.  So the highlighted condition is false, making the whole
"Kill Pixel" condition false.  This then feeds into the following
"WM_INT::ThreadDispatchEnable" condition:

    3DSTATE_WM::ForceThreadDispatch != OFF &&
    !WM_INT::WM_HZ_OP &&
    3DSTATE_PS_EXTRA::PixelShaderValid &&
    (3DSTATE_PS_EXTRA::PixelShaderHasUAV ||
     WM_INT::Pixel Shader Kill Pixel ||
     WM_INT::RTIndependentRasterizationEnable ||
     (!3DSTATE_PS_EXTRA::PixelShaderDoesNotWriteRT &&
      3DSTATE_PS_BLEND::HasWriteableRT) ||
     (WM_INT::Pixel Shader Computed Depth Mode != PSCDEPTH_OFF &&
      (WM_INT::Depth Test Enable || WM_INT::Depth Write Enable)) ||
     (3DSTATE_PS_EXTRA::Computed Stencil && WM_INT::Stencil Test Enable) ||
     (3DSTATE_WM::EDSC_Mode == 1 && (WM_INT::Depth Test Enable ||
                                     WM_INT::Depth Write Enable ||
                                     WM_INT::Stencil Test Enable)))

Given that there's no depth/stencil testing, no writeable render target,
and the hardware thinks kill pixel doesn't happen, all of these
conditions are false.  We have to whack some bit to make PS invocations
happen.  There are many options.

Curro suggested using the UAV bit.  There's some precedent for doing
that - we set it for fragment shaders that do SSBO/image/atomic writes
when no color buffer writes are enabled.  We can simply include discard
here too.

Fixes 64 dEQP-GLES31.functional.fbo.no_attachments.* tests.

v2: Add a comment suggested and written by Jason Ekstrand.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/mesa/drivers/dri/i965/gen8_ps_state.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index b9a06e7b2c7..7dfd4bfb8de 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -91,10 +91,15 @@ gen8_upload_ps_extra(struct brw_context *brw,
     * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
     * difference so we may just disable it here.
     *
+    * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
+    * take into account KillPixels when no depth or stencil writes are enabled.
+    * In order for occlusion queries to work correctly with no attachments, we
+    * need to force-enable here.
+    *
     * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR
     */
-   if (_mesa_active_fragment_shader_has_side_effects(&brw->ctx) &&
-       !brw_color_buffer_write_enabled(brw))
+   if ((_mesa_active_fragment_shader_has_side_effects(ctx) ||
+        prog_data->uses_kill) && !brw_color_buffer_write_enabled(brw))
       dw1 |= GEN8_PSX_SHADER_HAS_UAV;
 
    if (prog_data->computed_stencil) {

From 72473658c51d5e074ce219c1e6385a4cce29f467 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Fri, 25 Mar 2016 15:33:35 -0700
Subject: [PATCH 083/238] i965: Fix brw_render_cache_set_check_flush's
 PIPE_CONTROLs.

Our driver uses the brw_render_cache mechanism to track buffers we've
rendered to and are about to sample from.

Previously, we did a single PIPE_CONTROL with the following bits set:
- Render Target Flush
- Depth Cache Flush
- Texture Cache Invalidate
- VF Cache Invalidate
- Instruction Cache Invalidate
- CS Stall

This combined both "top of pipe" invalidations and "bottom of pipe"
flushes, which isn't how the hardware is intended to be programmed.

The "top of pipe" invalidations may happen right away, without any
guarantees that rendering using those caches has completed.  That
rendering may continue altering the caches.  The "bottom of pipe"
flushes do wait for the rendering to complete.  The CS stall also
prevents further work from happening until data is flushed out.

What we wanted to do was wait for rendering to complete, flush the new
data out of the render and depth caches, wait, then invalidate any
stale data in read-only caches.  We can accomplish this by doing the
"bottom of pipe" flushes with a CS stall, then the "top of pipe"
invalidations as a second PIPE_CONTROL.  The flushes will wait until
the rendering is complete, and the CS stall will prevent the second
PIPE_CONTROL with the invalidations from executing until the first
is done.

Fixes dEQP-GLES3.functional.texture.specification.teximage2d_pbo
subtests on Braswell and Skylake.  These tests hit the meta PBO
texture upload path, which binds the PBO as a texture and samples
from it, while rendering to the destination texture.  The tests
then sample from the texture.

For now, we leave Gen4-5 alone.  It probably needs work too, but
apparently it hasn't even been setting the (G45+) TC invalidation
bit at all...

v2: Add Sandybridge post-sync non-zero workaround, for safety.

Cc: mesa-stable@lists.freedesktop.org
Suggested-by: Francisco Jerez <currojerez@riseup.net>
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_pipe_control.c |  2 --
 src/mesa/drivers/dri/i965/intel_fbo.c        | 23 +++++++++++++++++++-
 2 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
index b41e28e1ec8..4672efdffc3 100644
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -338,8 +338,6 @@ brw_emit_mi_flush(struct brw_context *brw)
       }
       brw_emit_pipe_control_flush(brw, flags);
    }
-
-   brw_render_cache_set_clear(brw);
 }
 
 int
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index b7b679686e5..7eb21acc40b 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -1065,7 +1065,28 @@ brw_render_cache_set_check_flush(struct brw_context *brw, drm_intel_bo *bo)
    if (!_mesa_set_search(brw->render_cache, bo))
       return;
 
-   brw_emit_mi_flush(brw);
+   if (brw->gen >= 6) {
+      if (brw->gen == 6) {
+         /* [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
+          * Flush Enable = 1, a PIPE_CONTROL with any non-zero
+          * post-sync-op is required.
+          */
+         brw_emit_post_sync_nonzero_flush(brw);
+      }
+
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                  PIPE_CONTROL_CS_STALL);
+
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE);
+   } else {
+      brw_emit_mi_flush(brw);
+   }
+
+   brw_render_cache_set_clear(brw);
 }
 
 /**

From 0faf26e6a0a34c3544644852802484f2404cc83e Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Wed, 23 Mar 2016 11:56:39 -0700
Subject: [PATCH 084/238] i965: Always use BRW_TEXCOORDMODE_CUBE when seamless
 filtering.

When using seamless cube map mode and NEAREST filtering, we explicitly
overrode the wrap modes to CLAMP_TO_EDGE.  This was to implement the
following spec text:

   "If NEAREST filtering is done within a miplevel, always apply
    wrap mode CLAMP_TO_EDGE."

However, textureGather() ignores the sampler's filtering mode, and
instead returns the four pixels that would be blended by LINEAR
filtering.  This implies that we should do proper seamless filtering,
and include pixels from adjacent cube faces.

It turns out that we can simply delete the NEAREST -> CLAMP_TO_EDGE
overrides.  Normal cube map sampling works by first selecting the
face, and then nearest filtering fetches the closest texel.  If the
nearest texel was on a different face, then that face would have been
chosen.  So it should always be within the face anyway, which
effectively performs CLAMP_TO_EDGE.

Fixes 86 dEQP-GLES31.texture.gather.basic.cube.* tests.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Suggested-by: Ian Romanick <idr@freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/mesa/drivers/dri/i965/brw_sampler_state.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index c20a02817f9..3bd22c7559f 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -460,9 +460,7 @@ brw_update_sampler_state(struct brw_context *brw,
       /* Cube maps must use the same wrap mode for all three coordinate
        * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
        */
-      if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
-         (sampler->MinFilter != GL_NEAREST ||
-          sampler->MagFilter != GL_NEAREST)) {
+      if (tex_cube_map_seamless || sampler->CubeMapSeamless) {
 	 wrap_s = BRW_TEXCOORDMODE_CUBE;
 	 wrap_t = BRW_TEXCOORDMODE_CUBE;
 	 wrap_r = BRW_TEXCOORDMODE_CUBE;

From 60d6a8989ab44cf47accee6bc692ba6fb98f6a9f Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Mon, 7 Mar 2016 23:54:53 -0800
Subject: [PATCH 085/238] i965: Set address rounding bits for GL_NEAREST
 filtering as well.

Yuanhan Liu decided these were useful for linear filtering in
commit 76669381 (circa 2011).  Prior to that, we never set them;
it seems he tried to preserve that behavior for nearest filtering.

It turns out they're useful for nearest filtering, too: setting
these fixes the following dEQP-GLES3 tests:

functional.fbo.blit.rect.nearest_consistency_mag
functional.fbo.blit.rect.nearest_consistency_mag_reverse_src_x
functional.fbo.blit.rect.nearest_consistency_mag_reverse_src_y
functional.fbo.blit.rect.nearest_consistency_mag_reverse_dst_x
functional.fbo.blit.rect.nearest_consistency_mag_reverse_dst_y
functional.fbo.blit.rect.nearest_consistency_mag_reverse_src_dst_x
functional.fbo.blit.rect.nearest_consistency_mag_reverse_src_dst_y
functional.fbo.blit.rect.nearest_consistency_min
functional.fbo.blit.rect.nearest_consistency_min_reverse_src_x
functional.fbo.blit.rect.nearest_consistency_min_reverse_src_y
functional.fbo.blit.rect.nearest_consistency_min_reverse_dst_x
functional.fbo.blit.rect.nearest_consistency_min_reverse_dst_y
functional.fbo.blit.rect.nearest_consistency_min_reverse_src_dst_x
functional.fbo.blit.rect.nearest_consistency_min_reverse_src_dst_y
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_mag
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_mag_reverse_src_x
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_mag_reverse_src_y
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_mag_reverse_dst_x
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_mag_reverse_dst_y
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_mag_reverse_src_dst_x
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_mag_reverse_src_dst_y
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_min
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_min_reverse_src_x
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_min_reverse_src_y
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_min_reverse_dst_x
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_min_reverse_dst_y
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_min_reverse_src_dst_x
functional.fbo.blit.rect.nearest_consistency_out_of_bounds_min_reverse_src_dst_y

Apparently, BLORP has always set these bits unconditionally.

However, setting them unconditionally appears to regress tests using
texture projection, 3D samplers, integer formats, and vertex shaders,
all in combination, such as:

functional.shaders.texture_functions.textureprojlod.isampler3d_vertex

Setting them on Gen4-5 appears to regress Piglit's
tests/spec/arb_sampler_objects/framebufferblit.

Honestly, it looks like the real problem here is a lack of precision.
I'm just hacking around problems here (as embarrassing as it is).

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jordan Justen <jordan.l.justen@intel.com>
---
 src/mesa/drivers/dri/i965/brw_sampler_state.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index 3bd22c7559f..7bd21f7aaf0 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -436,14 +436,17 @@ brw_update_sampler_state(struct brw_context *brw,
       }
    }
 
-   /* Set address rounding bits if not using nearest filtering. */
+   /* Set address rounding bits.  The conditions are empirically
+    * derived in order to pass test cases.
+    */
+   bool round_nearest = brw->gen >= 6 && target != GL_TEXTURE_3D;
    unsigned address_rounding = 0;
-   if (min_filter != BRW_MAPFILTER_NEAREST) {
+   if (min_filter != BRW_MAPFILTER_NEAREST || round_nearest) {
       address_rounding |= BRW_ADDRESS_ROUNDING_ENABLE_U_MIN |
                           BRW_ADDRESS_ROUNDING_ENABLE_V_MIN |
                           BRW_ADDRESS_ROUNDING_ENABLE_R_MIN;
    }
-   if (mag_filter != BRW_MAPFILTER_NEAREST) {
+   if (mag_filter != BRW_MAPFILTER_NEAREST || round_nearest) {
       address_rounding |= BRW_ADDRESS_ROUNDING_ENABLE_U_MAG |
                           BRW_ADDRESS_ROUNDING_ENABLE_V_MAG |
                           BRW_ADDRESS_ROUNDING_ENABLE_R_MAG;

From b8b3af2932039c6105d61f6922157a250ed8b79a Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 28 Mar 2016 12:43:01 +0200
Subject: [PATCH 086/238] nvc0: use a different offset for buffers and surfaces

To avoid overwriting buffer and surface information, we need to use
a different offset for each in the driver constant buffer. Currently,
OP_SUQ is only supported for buffers, but this will be updated slightly
for image support.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Acked-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir_driver.h  |  1 +
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 81 ++++++++++++++-----
 .../nouveau/codegen/nv50_ir_lowering_nvc0.h   | 12 ++-
 .../drivers/nouveau/nvc0/nvc0_program.c       |  8 +-
 4 files changed, 74 insertions(+), 28 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index 21523a27761..54c53c98325 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -177,6 +177,7 @@ struct nv50_ir_prog_info
       bool nv50styleSurfaces;    /* generate gX[] access for raw buffers */
       uint16_t texBindBase;      /* base address for tex handles (nve4) */
       uint16_t suInfoBase;       /* base address for surface info (nve4) */
+      uint16_t bufInfoBase;      /* base address for buffer info */
       uint16_t sampleInfoBase;   /* base address for sample positions */
       uint8_t msInfoCBSlot;      /* cX[] used for multisample info */
       uint16_t msInfoBase;       /* base address for multisample info */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 122a6b74279..68a30ecb8d7 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1073,7 +1073,7 @@ bool
 NVC0LoweringPass::handleSUQ(Instruction *suq)
 {
    suq->op = OP_MOV;
-   suq->setSrc(0, loadResLength32(suq->getIndirect(0, 1),
+   suq->setSrc(0, loadBufLength32(suq->getIndirect(0, 1),
                                   suq->getSrc(0)->reg.fileIndex * 16));
    suq->setIndirect(0, 0, NULL);
    suq->setIndirect(0, 1, NULL);
@@ -1190,7 +1190,7 @@ NVC0LoweringPass::handleATOM(Instruction *atom)
       return true;
    default:
       assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
-      base = loadResInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
+      base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
       assert(base->reg.size == 8);
       if (ptr)
          base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
@@ -1250,19 +1250,20 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
 }
 
 inline Value *
-NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
+NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
 {
    uint8_t b = prog->driver->io.auxCBSlot;
-   off += prog->driver->io.suInfoBase;
+   off += base;
+
    return bld.
       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
 }
 
 inline Value *
-NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
+NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
 {
    uint8_t b = prog->driver->io.auxCBSlot;
-   off += prog->driver->io.suInfoBase;
+   off += base;
 
    if (ptr)
       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
@@ -1272,10 +1273,10 @@ NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
 }
 
 inline Value *
-NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
+NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
 {
    uint8_t b = prog->driver->io.auxCBSlot;
-   off += prog->driver->io.suInfoBase;
+   off += base;
 
    if (ptr)
       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
@@ -1284,6 +1285,42 @@ NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U64, off + 8), ptr);
 }
 
+inline Value *
+NVC0LoweringPass::loadSuInfo32(Value *ptr, uint32_t off)
+{
+   return loadResInfo32(ptr, off, prog->driver->io.suInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadSuInfo64(Value *ptr, uint32_t off)
+{
+   return loadResInfo64(ptr, off, prog->driver->io.suInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadSuLength32(Value *ptr, uint32_t off)
+{
+   return loadResLength32(ptr, off, prog->driver->io.suInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadBufInfo32(Value *ptr, uint32_t off)
+{
+   return loadResInfo32(ptr, off, prog->driver->io.bufInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
+{
+   return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
+{
+   return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
+}
+
 inline Value *
 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
 {
@@ -1364,8 +1401,8 @@ NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
 
    Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
 
-   Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0));
-   Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1));
+   Value *ms_x = loadSuInfo32(NULL, base + NVE4_SU_INFO_MS(0));
+   Value *ms_y = loadSuInfo32(NULL, base + NVE4_SU_INFO_MS(1));
 
    bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
    bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
@@ -1418,9 +1455,9 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
    for (c = 0; c < arg; ++c) {
       src[c] = bld.getScratch();
       if (c == 0 && raw)
-         v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
+         v = loadSuInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
       else
-         v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
+         v = loadSuInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
       bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
          ->subOp = getSuClampSubOp(su, c);
    }
@@ -1442,16 +1479,16 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
          bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
    } else
    if (dim == 3) {
-      v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
+      v = loadSuInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
       bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
          ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
 
-      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
+      v = loadSuInfo32(NULL, base + NVE4_SU_INFO_PITCH);
       bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
          ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
    } else {
       assert(dim == 2);
-      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
+      v = loadSuInfo32(NULL, base + NVE4_SU_INFO_PITCH);
       bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
          ->subOp = su->tex.target.isArray() ?
          NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
@@ -1462,7 +1499,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
       if (raw) {
          bf = src[0];
       } else {
-         v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
+         v = loadSuInfo32(NULL, base + NVE4_SU_INFO_FMT);
          bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
             ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
       }
@@ -1479,7 +1516,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
       case 2:
          z = off;
          if (!su->tex.target.isArray()) {
-            z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
+            z = loadSuInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
             subOp = NV50_IR_SUBOP_SUBFM_3D;
          }
          break;
@@ -1494,7 +1531,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
    }
 
    // part 2
-   v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR);
+   v = loadSuInfo32(NULL, base + NVE4_SU_INFO_ADDR);
 
    if (su->tex.target == TEX_TARGET_BUFFER) {
       eau = v;
@@ -1503,7 +1540,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
    }
    // add array layer offset
    if (su->tex.target.isArray()) {
-      v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
+      v = loadSuInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
       if (dim == 1)
          bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
             ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
@@ -1543,7 +1580,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
 
    // let's just set it 0 for raw access and hope it works
    v = raw ?
-      bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
+      bld.mkImm(0) : loadSuInfo32(NULL, base + NVE4_SU_INFO_FMT);
 
    // get rid of old coordinate sources, make space for fmt info and predicate
    su->moveSources(arg, 3 - arg);
@@ -2014,12 +2051,12 @@ NVC0LoweringPass::visit(Instruction *i)
          i->op = OP_VFETCH;
       } else if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
          Value *ind = i->getIndirect(0, 1);
-         Value *ptr = loadResInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
+         Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
          // XXX come up with a way not to do this for EVERY little access but
          // rather to batch these up somehow. Unfortunately we've lost the
          // information about the field width by the time we get here.
          Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
-         Value *length = loadResLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
+         Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
          Value *pred = new_LValue(func, FILE_PREDICATE);
          if (i->src(0).isIndirect(0)) {
             bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index 6eb8aff3036..d2cb23f45d5 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -117,9 +117,15 @@ private:
 
    void readTessCoord(LValue *dst, int c);
 
-   Value *loadResInfo32(Value *ptr, uint32_t off);
-   Value *loadResInfo64(Value *ptr, uint32_t off);
-   Value *loadResLength32(Value *ptr, uint32_t off);
+   Value *loadResInfo32(Value *ptr, uint32_t off, uint16_t base);
+   Value *loadResInfo64(Value *ptr, uint32_t off, uint16_t base);
+   Value *loadResLength32(Value *ptr, uint32_t off, uint16_t base);
+   Value *loadSuInfo32(Value *ptr, uint32_t off);
+   Value *loadSuInfo64(Value *ptr, uint32_t off);
+   Value *loadSuLength32(Value *ptr, uint32_t off);
+   Value *loadBufInfo32(Value *ptr, uint32_t off);
+   Value *loadBufInfo64(Value *ptr, uint32_t off);
+   Value *loadBufLength32(Value *ptr, uint32_t off);
    Value *loadMsInfo32(Value *ptr, uint32_t off);
    Value *loadTexHandle(Value *ptr, unsigned int slot);
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index b7c6faf9cde..a3433f4a10a 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -544,20 +544,22 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
          info->io.texBindBase = NVE4_CP_INPUT_TEX(0);
          info->io.suInfoBase = NVE4_CP_INPUT_SUF(0);
          info->prop.cp.gridInfoBase = NVE4_CP_INPUT_GRID_INFO(0);
+         info->io.bufInfoBase = 0; /* TODO */
       } else {
-         info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0);
+         info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
+         info->io.suInfoBase = 0; /* TODO */
       }
       info->io.msInfoCBSlot = 0;
       info->io.msInfoBase = NVE4_CP_INPUT_MS_OFFSETS;
    } else {
       if (chipset >= NVISA_GK104_CHIPSET) {
          info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
-         info->io.suInfoBase = 0; /* TODO */
       }
       info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO;
-      info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0);
+      info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
       info->io.msInfoCBSlot = 15;
       info->io.msInfoBase = 0; /* TODO */
+      info->io.suInfoBase = 0; /* TODO */
    }
 
    info->assignSlots = nvc0_program_assign_varying_slots;

From 86d87d10474d1c5c5683acb28d4491e877432a90 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Sun, 27 Mar 2016 14:51:02 +1100
Subject: [PATCH 087/238] mesa: remove initialized field from uniform storage

The only place this was used was in a gallium debug function that
had to be manually enabled.

Reviewed-by: Edward O'Callaghan <eocallaghan@alterapraxis.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/compiler/glsl/ir_uniform.h                |  5 ---
 .../glsl/link_uniform_initializers.cpp        |  4 --
 src/compiler/glsl/link_uniforms.cpp           |  1 -
 src/mesa/main/shaderapi.c                     |  3 +-
 src/mesa/main/uniform_query.cpp               |  4 --
 src/mesa/state_tracker/st_draw.c              | 37 -------------------
 6 files changed, 1 insertion(+), 53 deletions(-)

diff --git a/src/compiler/glsl/ir_uniform.h b/src/compiler/glsl/ir_uniform.h
index 1854279925b..e72e7b42c57 100644
--- a/src/compiler/glsl/ir_uniform.h
+++ b/src/compiler/glsl/ir_uniform.h
@@ -105,11 +105,6 @@ struct gl_uniform_storage {
     */
    unsigned array_elements;
 
-   /**
-    * Has this uniform ever been set?
-    */
-   bool initialized;
-
    struct gl_opaque_uniform_index opaque[MESA_SHADER_STAGES];
 
    /**
diff --git a/src/compiler/glsl/link_uniform_initializers.cpp b/src/compiler/glsl/link_uniform_initializers.cpp
index 7d280ccf7fc..870bc5bfebd 100644
--- a/src/compiler/glsl/link_uniform_initializers.cpp
+++ b/src/compiler/glsl/link_uniform_initializers.cpp
@@ -162,8 +162,6 @@ set_opaque_binding(void *mem_ctx, gl_shader_program *prog,
             }
          }
       }
-
-      storage->initialized = true;
    }
 }
 
@@ -267,8 +265,6 @@ set_uniform_initializer(void *mem_ctx, gl_shader_program *prog,
          }
       }
    }
-
-   storage->initialized = true;
 }
 }
 
diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp
index 807b069e3ed..cd487ab6dd0 100644
--- a/src/compiler/glsl/link_uniforms.cpp
+++ b/src/compiler/glsl/link_uniforms.cpp
@@ -799,7 +799,6 @@ private:
 
       this->uniforms[id].name = ralloc_strdup(this->uniforms, name);
       this->uniforms[id].type = base_type;
-      this->uniforms[id].initialized = 0;
       this->uniforms[id].num_driver_storage = 0;
       this->uniforms[id].driver_storage = NULL;
       this->uniforms[id].atomic_buffer_index = -1;
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 32fad56f651..ba2607221d9 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -2568,7 +2568,6 @@ _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
       memcpy(&uni->storage[0], &indices[i],
              sizeof(GLuint) * uni_count);
 
-      uni->initialized = true;
       _mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count);
       i += uni_count;
    } while(i < count);
@@ -2742,7 +2741,7 @@ _mesa_shader_init_subroutine_defaults(struct gl_shader *sh)
 
       for (j = 0; j < uni_count; j++)
          memcpy(&uni->storage[j], &val, sizeof(int));
-      uni->initialized = true;
+
       _mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count);
    }
 }
diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp
index 2ced201ebca..ab5c3cd9249 100644
--- a/src/mesa/main/uniform_query.cpp
+++ b/src/mesa/main/uniform_query.cpp
@@ -815,8 +815,6 @@ _mesa_uniform(struct gl_context *ctx, struct gl_shader_program *shProg,
       }
    }
 
-   uni->initialized = true;
-
    _mesa_propagate_uniforms_to_driver_storage(uni, offset, count);
 
    /* If the uniform is a sampler, do the extra magic necessary to propagate
@@ -1030,8 +1028,6 @@ _mesa_uniform_matrix(struct gl_context *ctx, struct gl_shader_program *shProg,
       }
    }
 
-   uni->initialized = true;
-
    _mesa_propagate_uniforms_to_driver_storage(uni, offset, count);
 }
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index fdd59a383a9..3db5749725e 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -126,35 +126,6 @@ setup_index_buffer(struct st_context *st,
 }
 
 
-/**
- * Prior to drawing, check that any uniforms referenced by the
- * current shader have been set.  If a uniform has not been set,
- * issue a warning.
- */
-static void
-check_uniforms(struct gl_context *ctx)
-{
-   struct gl_shader_program **shProg = ctx->_Shader->CurrentProgram;
-   unsigned j;
-
-   for (j = 0; j < 3; j++) {
-      unsigned i;
-
-      if (shProg[j] == NULL || !shProg[j]->LinkStatus)
-	 continue;
-
-      for (i = 0; i < shProg[j]->NumUniformStorage; i++) {
-         const struct gl_uniform_storage *u = &shProg[j]->UniformStorage[i];
-         if (!u->initialized) {
-            _mesa_warning(ctx,
-                          "Using shader with uninitialized uniform: %s",
-                          u->name);
-         }
-      }
-   }
-}
-
-
 /**
  * Translate OpenGL primtive type (GL_POINTS, GL_TRIANGLE_STRIP, etc) to
  * the corresponding Gallium type.
@@ -203,14 +174,6 @@ st_draw_vbo(struct gl_context *ctx,
    /* Validate state. */
    if (st->dirty.st || st->dirty.mesa || ctx->NewDriverState) {
       st_validate_state(st, ST_PIPELINE_RENDER);
-
-#if 0
-      if (MESA_VERBOSE & VERBOSE_GLSL) {
-         check_uniforms(ctx);
-      }
-#else
-      (void) check_uniforms;
-#endif
    }
 
    if (st->vertex_array_out_of_memory) {

From 4ed4a2af8628e6adaa04fbe489d2d95747cf3634 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Mon, 28 Mar 2016 16:57:19 -0700
Subject: [PATCH 088/238] glsl: Delete initialized field from uniform storage
 test.

Timothy deleted this field.  Fixes "make check".

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
---
 .../tests/set_uniform_initializer_tests.cpp   | 19 -------------------
 1 file changed, 19 deletions(-)

diff --git a/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp b/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp
index 0b1f66cb342..a36ffdc58be 100644
--- a/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp
+++ b/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp
@@ -115,7 +115,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
    prog->UniformStorage[index_to_set].name = (char *) name;
    prog->UniformStorage[index_to_set].type = type;
    prog->UniformStorage[index_to_set].array_elements = array_size;
-   prog->UniformStorage[index_to_set].initialized = false;
    for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
       prog->UniformStorage[index_to_set].opaque[sh].index = ~0;
       prog->UniformStorage[index_to_set].opaque[sh].active = false;
@@ -136,7 +135,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
       prog->UniformStorage[i].name = (char *) "invalid slot";
       prog->UniformStorage[i].type = glsl_type::void_type;
       prog->UniformStorage[i].array_elements = 0;
-      prog->UniformStorage[i].initialized = false;
       for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
          prog->UniformStorage[i].opaque[sh].index = ~0;
          prog->UniformStorage[i].opaque[sh].active = false;
@@ -149,21 +147,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
    return red_zone_components;
 }
 
-/**
- * Verify that the correct uniform is marked as having been initialized.
- */
-static void
-verify_initialization(struct gl_shader_program *prog, unsigned actual_index)
-{
-   for (unsigned i = 0; i < prog->NumUniformStorage; i++) {
-      if (i == actual_index) {
-	 EXPECT_TRUE(prog->UniformStorage[actual_index].initialized);
-      } else {
-	 EXPECT_FALSE(prog->UniformStorage[i].initialized);
-      }
-   }
-}
-
 static void
 non_array_test(void *mem_ctx, struct gl_shader_program *prog,
 	       unsigned actual_index, const char *name,
@@ -181,7 +164,6 @@ non_array_test(void *mem_ctx, struct gl_shader_program *prog,
 
    linker::set_uniform_initializer(mem_ctx, prog, name, type, val, 0xF00F);
 
-   verify_initialization(prog, actual_index);
    verify_data(prog->UniformStorage[actual_index].storage, 0, val,
 	       red_zone_components, 0xF00F);
 }
@@ -338,7 +320,6 @@ array_test(void *mem_ctx, struct gl_shader_program *prog,
    linker::set_uniform_initializer(mem_ctx, prog, name, element_type, val,
                                    0xF00F);
 
-   verify_initialization(prog, actual_index);
    verify_data(prog->UniformStorage[actual_index].storage, array_size,
 	       val, red_zone_components, 0xF00F);
 }

From 659beca666c4e90ab5f366b231a94ed437898b80 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 27 Feb 2016 16:04:51 -0500
Subject: [PATCH 089/238] mesa: properly return GetTexLevelParameter queries
 for buffer textures

This fixes all failures with dEQP tests in this area. While
ARB_texture_buffer_object explicitly says that GetTexLevelParameter & co
should not be supported, GL 3.1 reverses this decision and allows all of
these queries there.

Conversely, there is no text that forbids the buffer-specific queries
from being used with non-buffer images.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 src/mesa/main/texparam.c | 54 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 52 insertions(+), 2 deletions(-)

diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index 9350ca5c035..8a3e02f0552 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -1447,6 +1447,29 @@ get_tex_level_parameter_image(struct gl_context *ctx,
          *params = img->FixedSampleLocations;
          break;
 
+      /* There is never a buffer data store here, but these pnames still have
+       * to work.
+       */
+
+      /* GL_ARB_texture_buffer_object */
+      case GL_TEXTURE_BUFFER_DATA_STORE_BINDING:
+         if (!ctx->Extensions.ARB_texture_buffer_object)
+            goto invalid_pname;
+         *params = 0;
+         break;
+
+      /* GL_ARB_texture_buffer_range */
+      case GL_TEXTURE_BUFFER_OFFSET:
+         if (!ctx->Extensions.ARB_texture_buffer_range)
+            goto invalid_pname;
+         *params = 0;
+         break;
+      case GL_TEXTURE_BUFFER_SIZE:
+         if (!ctx->Extensions.ARB_texture_buffer_range)
+            goto invalid_pname;
+         *params = 0;
+         break;
+
       default:
          goto invalid_pname;
    }
@@ -1468,13 +1491,24 @@ get_tex_level_parameter_buffer(struct gl_context *ctx,
 {
    const struct gl_buffer_object *bo = texObj->BufferObject;
    mesa_format texFormat = texObj->_BufferObjectFormat;
+   int bytes = MAX2(1, _mesa_get_format_bytes(texFormat));
    GLenum internalFormat = texObj->BufferObjectFormat;
    GLenum baseFormat = _mesa_get_format_base_format(texFormat);
    const char *suffix = dsa ? "ture" : "";
 
    if (!bo) {
       /* undefined texture buffer object */
-      *params = pname == GL_TEXTURE_COMPONENTS ? 1 : 0;
+      switch (pname) {
+      case GL_TEXTURE_FIXED_SAMPLE_LOCATIONS:
+         *params = GL_TRUE;
+         break;
+      case GL_TEXTURE_INTERNAL_FORMAT:
+         *params = internalFormat;
+         break;
+      default:
+         *params = 0;
+         break;
+      }
       return;
    }
 
@@ -1483,10 +1517,13 @@ get_tex_level_parameter_buffer(struct gl_context *ctx,
          *params = bo->Name;
          break;
       case GL_TEXTURE_WIDTH:
-         *params = bo->Size;
+         *params = ((texObj->BufferSize == -1) ? bo->Size : texObj->BufferSize)
+            / bytes;
          break;
       case GL_TEXTURE_HEIGHT:
       case GL_TEXTURE_DEPTH:
+         *params = 1;
+         break;
       case GL_TEXTURE_BORDER:
       case GL_TEXTURE_SHARED_SIZE:
       case GL_TEXTURE_COMPRESSED:
@@ -1536,6 +1573,19 @@ get_tex_level_parameter_buffer(struct gl_context *ctx,
          *params = (texObj->BufferSize == -1) ? bo->Size : texObj->BufferSize;
          break;
 
+      /* GL_ARB_texture_multisample */
+      case GL_TEXTURE_SAMPLES:
+         if (!ctx->Extensions.ARB_texture_multisample)
+            goto invalid_pname;
+         *params = 0;
+         break;
+
+      case GL_TEXTURE_FIXED_SAMPLE_LOCATIONS:
+         if (!ctx->Extensions.ARB_texture_multisample)
+            goto invalid_pname;
+         *params = GL_TRUE;
+         break;
+
       /* GL_ARB_texture_compression */
       case GL_TEXTURE_COMPRESSED_IMAGE_SIZE:
          /* Always illegal for GL_TEXTURE_BUFFER */

From 74b76c08a3732b0ca337998780d01d67e7fd554b Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 27 Feb 2016 16:06:42 -0500
Subject: [PATCH 090/238] mesa: add OES_texture_buffer and EXT_texture_buffer
 extension to table

We need to add a new bit since the GL ES exts require functionality from
a combination of texture buffer extensions as well as image support (for
imageBuffer). Additionally, not all GPUs support all the texture
buffer functionality (e.g. rgb32 isn't supported by nv50).

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/extensions_table.h | 2 ++
 src/mesa/main/mtypes.h           | 1 +
 2 files changed, 3 insertions(+)

diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 54a5bb057a3..7885aefcc28 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -228,6 +228,7 @@ EXT(EXT_texture                             , dummy_true
 EXT(EXT_texture3D                           , dummy_true                             , GLL,  x ,  x ,  x , 1996)
 EXT(EXT_texture_array                       , EXT_texture_array                      , GLL, GLC,  x ,  x , 2006)
 EXT(EXT_texture_border_clamp                , ARB_texture_border_clamp               ,  x ,  x ,  x , ES2, 2014)
+EXT(EXT_texture_buffer                      , OES_texture_buffer                     ,  x ,  x ,  x ,  31, 2014)
 EXT(EXT_texture_compression_dxt1            , ANGLE_texture_compression_dxt          , GLL, GLC, ES1, ES2, 2004)
 EXT(EXT_texture_compression_latc            , EXT_texture_compression_latc           , GLL,  x ,  x ,  x , 2006)
 EXT(EXT_texture_compression_rgtc            , ARB_texture_compression_rgtc           , GLL, GLC,  x ,  x , 2004)
@@ -339,6 +340,7 @@ EXT(OES_stencil_wrap                        , dummy_true
 EXT(OES_surfaceless_context                 , dummy_true                             ,  x ,  x , ES1, ES2, 2012)
 EXT(OES_texture_3D                          , dummy_true                             ,  x ,  x ,  x , ES2, 2005)
 EXT(OES_texture_border_clamp                , ARB_texture_border_clamp               ,  x ,  x ,  x , ES2, 2014)
+EXT(OES_texture_buffer                      , OES_texture_buffer                     ,  x ,  x ,  x ,  31, 2014)
 EXT(OES_texture_cube_map                    , ARB_texture_cube_map                   ,  x ,  x , ES1,  x , 2007)
 EXT(OES_texture_env_crossbar                , ARB_texture_env_crossbar               ,  x ,  x , ES1,  x , 2005)
 EXT(OES_texture_float                       , OES_texture_float                      ,  x ,  x ,  x , ES2, 2005)
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index c2c86a6e0d1..ae0c8a84dda 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3895,6 +3895,7 @@ struct gl_extensions
    GLboolean EXT_timer_query;
    GLboolean EXT_vertex_array_bgra;
    GLboolean OES_standard_derivatives;
+   GLboolean OES_texture_buffer;
    /* vendor extensions */
    GLboolean AMD_performance_monitor;
    GLboolean AMD_pinned_memory;

From 720670a615590e37a7e85852527a590778e6f273 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 27 Feb 2016 16:13:50 -0500
Subject: [PATCH 091/238] glsl: add OES_texture_buffer and EXT_texture_buffer
 support

Expose the samplerBuffer/imageBuffer types, and allow the various
functions to operate on them.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/compiler/glsl/builtin_functions.cpp  | 26 +++++++++++++-----------
 src/compiler/glsl/builtin_types.cpp      | 22 ++++++++++++++------
 src/compiler/glsl/glcpp/glcpp-parse.y    |  4 ++++
 src/compiler/glsl/glsl_lexer.ll          | 12 +++++------
 src/compiler/glsl/glsl_parser_extras.cpp |  2 ++
 src/compiler/glsl/glsl_parser_extras.h   |  4 ++++
 6 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/src/compiler/glsl/builtin_functions.cpp b/src/compiler/glsl/builtin_functions.cpp
index ff6b628eb64..62f07b2460e 100644
--- a/src/compiler/glsl/builtin_functions.cpp
+++ b/src/compiler/glsl/builtin_functions.cpp
@@ -129,12 +129,6 @@ v130_fs_only(const _mesa_glsl_parse_state *state)
           state->stage == MESA_SHADER_FRAGMENT;
 }
 
-static bool
-v140(const _mesa_glsl_parse_state *state)
-{
-   return state->is_version(140, 0);
-}
-
 static bool
 v140_or_es3(const _mesa_glsl_parse_state *state)
 {
@@ -183,6 +177,14 @@ v110_lod(const _mesa_glsl_parse_state *state)
    return !state->es_shader && lod_exists_in_stage(state);
 }
 
+static bool
+texture_buffer(const _mesa_glsl_parse_state *state)
+{
+   return state->is_version(140, 320) ||
+      state->EXT_texture_buffer_enable ||
+      state->OES_texture_buffer_enable;
+}
+
 static bool
 shader_texture_lod(const _mesa_glsl_parse_state *state)
 {
@@ -1581,9 +1583,9 @@ builtin_builder::create_builtins()
                 _textureSize(v130, glsl_type::ivec2_type, glsl_type::usampler2DRect_type),
                 _textureSize(v130, glsl_type::ivec2_type, glsl_type::sampler2DRectShadow_type),
 
-                _textureSize(v140, glsl_type::int_type,   glsl_type::samplerBuffer_type),
-                _textureSize(v140, glsl_type::int_type,   glsl_type::isamplerBuffer_type),
-                _textureSize(v140, glsl_type::int_type,   glsl_type::usamplerBuffer_type),
+                _textureSize(texture_buffer, glsl_type::int_type,   glsl_type::samplerBuffer_type),
+                _textureSize(texture_buffer, glsl_type::int_type,   glsl_type::isamplerBuffer_type),
+                _textureSize(texture_buffer, glsl_type::int_type,   glsl_type::usamplerBuffer_type),
                 _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::sampler2DMS_type),
                 _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::isampler2DMS_type),
                 _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::usampler2DMS_type),
@@ -1855,9 +1857,9 @@ builtin_builder::create_builtins()
                 _texelFetch(v130, glsl_type::ivec4_type, glsl_type::isampler2DArray_type, glsl_type::ivec3_type),
                 _texelFetch(v130, glsl_type::uvec4_type, glsl_type::usampler2DArray_type, glsl_type::ivec3_type),
 
-                _texelFetch(v140, glsl_type::vec4_type,  glsl_type::samplerBuffer_type,  glsl_type::int_type),
-                _texelFetch(v140, glsl_type::ivec4_type, glsl_type::isamplerBuffer_type, glsl_type::int_type),
-                _texelFetch(v140, glsl_type::uvec4_type, glsl_type::usamplerBuffer_type, glsl_type::int_type),
+                _texelFetch(texture_buffer, glsl_type::vec4_type,  glsl_type::samplerBuffer_type,  glsl_type::int_type),
+                _texelFetch(texture_buffer, glsl_type::ivec4_type, glsl_type::isamplerBuffer_type, glsl_type::int_type),
+                _texelFetch(texture_buffer, glsl_type::uvec4_type, glsl_type::usamplerBuffer_type, glsl_type::int_type),
 
                 _texelFetch(texture_multisample, glsl_type::vec4_type,  glsl_type::sampler2DMS_type,  glsl_type::ivec2_type),
                 _texelFetch(texture_multisample, glsl_type::ivec4_type, glsl_type::isampler2DMS_type, glsl_type::ivec2_type),
diff --git a/src/compiler/glsl/builtin_types.cpp b/src/compiler/glsl/builtin_types.cpp
index ee24bd5e411..d250234f652 100644
--- a/src/compiler/glsl/builtin_types.cpp
+++ b/src/compiler/glsl/builtin_types.cpp
@@ -179,7 +179,7 @@ static const struct builtin_type_versions {
    T(sampler2DArray,                  130, 300)
    T(samplerCubeArray,                400, 999)
    T(sampler2DRect,                   140, 999)
-   T(samplerBuffer,                   140, 999)
+   T(samplerBuffer,                   140, 320)
    T(sampler2DMS,                     150, 310)
    T(sampler2DMSArray,                150, 999)
 
@@ -191,7 +191,7 @@ static const struct builtin_type_versions {
    T(isampler2DArray,                 130, 300)
    T(isamplerCubeArray,               400, 999)
    T(isampler2DRect,                  140, 999)
-   T(isamplerBuffer,                  140, 999)
+   T(isamplerBuffer,                  140, 320)
    T(isampler2DMS,                    150, 310)
    T(isampler2DMSArray,               150, 999)
 
@@ -203,7 +203,7 @@ static const struct builtin_type_versions {
    T(usampler2DArray,                 130, 300)
    T(usamplerCubeArray,               400, 999)
    T(usampler2DRect,                  140, 999)
-   T(usamplerBuffer,                  140, 999)
+   T(usamplerBuffer,                  140, 320)
    T(usampler2DMS,                    150, 310)
    T(usampler2DMSArray,               150, 999)
 
@@ -222,7 +222,7 @@ static const struct builtin_type_versions {
    T(image3D,                         420, 310)
    T(image2DRect,                     420, 999)
    T(imageCube,                       420, 310)
-   T(imageBuffer,                     420, 999)
+   T(imageBuffer,                     420, 320)
    T(image1DArray,                    420, 999)
    T(image2DArray,                    420, 310)
    T(imageCubeArray,                  420, 999)
@@ -233,7 +233,7 @@ static const struct builtin_type_versions {
    T(iimage3D,                        420, 310)
    T(iimage2DRect,                    420, 999)
    T(iimageCube,                      420, 310)
-   T(iimageBuffer,                    420, 999)
+   T(iimageBuffer,                    420, 320)
    T(iimage1DArray,                   420, 999)
    T(iimage2DArray,                   420, 310)
    T(iimageCubeArray,                 420, 999)
@@ -244,7 +244,7 @@ static const struct builtin_type_versions {
    T(uimage3D,                        420, 310)
    T(uimage2DRect,                    420, 999)
    T(uimageCube,                      420, 310)
-   T(uimageBuffer,                    420, 999)
+   T(uimageBuffer,                    420, 320)
    T(uimage1DArray,                   420, 999)
    T(uimage2DArray,                   420, 310)
    T(uimageCubeArray,                 420, 999)
@@ -371,6 +371,16 @@ _mesa_glsl_initialize_types(struct _mesa_glsl_parse_state *state)
       add_type(symbols, glsl_type::uimage2DMSArray_type);
    }
 
+   if (state->EXT_texture_buffer_enable || state->OES_texture_buffer_enable) {
+      add_type(symbols, glsl_type::samplerBuffer_type);
+      add_type(symbols, glsl_type::isamplerBuffer_type);
+      add_type(symbols, glsl_type::usamplerBuffer_type);
+
+      add_type(symbols, glsl_type::imageBuffer_type);
+      add_type(symbols, glsl_type::iimageBuffer_type);
+      add_type(symbols, glsl_type::uimageBuffer_type);
+   }
+
    if (state->has_atomic_counters()) {
       add_type(symbols, glsl_type::atomic_uint_type);
    }
diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y
index 007b70b020d..fbbf85bfdae 100644
--- a/src/compiler/glsl/glcpp/glcpp-parse.y
+++ b/src/compiler/glsl/glcpp/glcpp-parse.y
@@ -2390,6 +2390,10 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
                     add_builtin_define(parser, "GL_EXT_gpu_shader5", 1);
                     add_builtin_define(parser, "GL_OES_gpu_shader5", 1);
                  }
+                 if (extensions->OES_texture_buffer) {
+                    add_builtin_define(parser, "GL_EXT_texture_buffer", 1);
+                    add_builtin_define(parser, "GL_OES_texture_buffer", 1);
+                 }
               }
 	   }
 	} else {
diff --git a/src/compiler/glsl/glsl_lexer.ll b/src/compiler/glsl/glsl_lexer.ll
index 1f122654340..883c58f0da9 100644
--- a/src/compiler/glsl/glsl_lexer.ll
+++ b/src/compiler/glsl/glsl_lexer.ll
@@ -369,7 +369,7 @@ image2D         KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l
 image3D         KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGE3D);
 image2DRect     KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE2DRECT);
 imageCube       KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGECUBE);
-imageBuffer     KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGEBUFFER);
+imageBuffer     KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, IMAGEBUFFER);
 image1DArray    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE1DARRAY);
 image2DArray    KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGE2DARRAY);
 imageCubeArray  KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGECUBEARRAY);
@@ -380,7 +380,7 @@ iimage2D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l
 iimage3D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGE3D);
 iimage2DRect    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DRECT);
 iimageCube      KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGECUBE);
-iimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGEBUFFER);
+iimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, IIMAGEBUFFER);
 iimage1DArray   KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE1DARRAY);
 iimage2DArray   KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DARRAY);
 iimageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGECUBEARRAY);
@@ -391,7 +391,7 @@ uimage2D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l
 uimage3D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGE3D);
 uimage2DRect    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DRECT);
 uimageCube      KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGECUBE);
-uimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGEBUFFER);
+uimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, UIMAGEBUFFER);
 uimage1DArray   KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE1DARRAY);
 uimage2DArray   KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DARRAY);
 uimageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGECUBEARRAY);
@@ -565,15 +565,15 @@ common		KEYWORD(130, 300, 0, 0, COMMON);
 partition	KEYWORD(130, 300, 0, 0, PARTITION);
 active		KEYWORD(130, 300, 0, 0, ACTIVE);
 superp		KEYWORD(130, 100, 0, 0, SUPERP);
-samplerBuffer	KEYWORD(130, 300, 140, 0, SAMPLERBUFFER);
+samplerBuffer	KEYWORD_WITH_ALT(130, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, SAMPLERBUFFER);
 filter		KEYWORD(130, 300, 0, 0, FILTER);
 row_major	KEYWORD_WITH_ALT(130, 0, 140, 0, yyextra->ARB_uniform_buffer_object_enable && !yyextra->es_shader, ROW_MAJOR);
 
     /* Additional reserved words in GLSL 1.40 */
 isampler2DRect	KEYWORD(140, 300, 140, 0, ISAMPLER2DRECT);
 usampler2DRect	KEYWORD(140, 300, 140, 0, USAMPLER2DRECT);
-isamplerBuffer	KEYWORD(140, 300, 140, 0, ISAMPLERBUFFER);
-usamplerBuffer	KEYWORD(140, 300, 140, 0, USAMPLERBUFFER);
+isamplerBuffer	KEYWORD_WITH_ALT(140, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, ISAMPLERBUFFER);
+usamplerBuffer	KEYWORD_WITH_ALT(140, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, USAMPLERBUFFER);
 
     /* Additional reserved words in GLSL ES 3.00 */
 resource	KEYWORD(0, 300, 0, 0, RESOURCE);
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index 9fcca211a99..1d9bfd6aaba 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -611,6 +611,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(OES_shader_image_atomic,        false, true,      ARB_shader_image_load_store),
    EXT(OES_standard_derivatives,       false, true,      OES_standard_derivatives),
    EXT(OES_texture_3D,                 false, true,      dummy_true),
+   EXT(OES_texture_buffer,             false, true,      OES_texture_buffer),
    EXT(OES_texture_storage_multisample_2d_array, false, true, ARB_texture_multisample),
 
    /* All other extensions go here, sorted alphabetically.
@@ -627,6 +628,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(EXT_shader_integer_mix,         true,  true,      EXT_shader_integer_mix),
    EXT(EXT_shader_samples_identical,   true,  true,      EXT_shader_samples_identical),
    EXT(EXT_texture_array,              true,  false,     EXT_texture_array),
+   EXT(EXT_texture_buffer,             false, true,      OES_texture_buffer),
 };
 
 #undef EXT
diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h
index 12a3a46928c..24195f97f18 100644
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -603,6 +603,8 @@ struct _mesa_glsl_parse_state {
    bool OES_standard_derivatives_warn;
    bool OES_texture_3D_enable;
    bool OES_texture_3D_warn;
+   bool OES_texture_buffer_enable;
+   bool OES_texture_buffer_warn;
    bool OES_texture_storage_multisample_2d_array_enable;
    bool OES_texture_storage_multisample_2d_array_warn;
 
@@ -632,6 +634,8 @@ struct _mesa_glsl_parse_state {
    bool EXT_shader_samples_identical_warn;
    bool EXT_texture_array_enable;
    bool EXT_texture_array_warn;
+   bool EXT_texture_buffer_enable;
+   bool EXT_texture_buffer_warn;
    /*@}*/
 
    /** Extensions supported by the OpenGL implementation. */

From b4c0c514b10ed85b50e4fc3bbd9c740db21e5720 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 27 Feb 2016 16:16:28 -0500
Subject: [PATCH 092/238] mesa: add OES_texture_buffer and EXT_texture_buffer
 support

Allow ES 3.1 contexts to access the texture buffer functionality.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 docs/GL3.txt                            |  2 +-
 src/mapi/glapi/gen/apiexec.py           |  4 +--
 src/mapi/glapi/gen/es_EXT.xml           | 36 +++++++++++++++++++++++
 src/mesa/main/bufferobj.c               |  4 +--
 src/mesa/main/get.c                     |  4 +--
 src/mesa/main/get_hash_params.py        | 20 ++++++-------
 src/mesa/main/tests/dispatch_sanity.cpp |  4 +++
 src/mesa/main/teximage.c                | 20 ++++++++-----
 src/mesa/main/texobj.c                  |  8 ++---
 src/mesa/main/texparam.c                | 39 +++++++++++++------------
 10 files changed, 94 insertions(+), 47 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 88c14c4c67d..03ebf70fb0f 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -269,7 +269,7 @@ GLES3.2, GLSL ES 3.2
   GL_OES_shader_multisample_interpolation               not started (based on parts of GL_ARB_gpu_shader5, which is done)
   GL_OES_tessellation_shader                            not started (based on GL_ARB_tessellation_shader, which is done for some drivers)
   GL_OES_texture_border_clamp                           DONE (all drivers)
-  GL_OES_texture_buffer                                 not started (based on GL_ARB_texture_buffer_object, GL_ARB_texture_buffer_range, and GL_ARB_texture_buffer_object_rgb32 that are all done)
+  GL_OES_texture_buffer                                 DONE (core only)
   GL_OES_texture_cube_map_array                         not started (based on GL_ARB_texture_cube_map_array, which is done for all drivers)
   GL_OES_texture_stencil8                               DONE (all drivers that support GL_ARB_texture_stencil8)
   GL_OES_texture_storage_multisample_2d_array           DONE (all drivers that support GL_ARB_texture_multisample)
diff --git a/src/mapi/glapi/gen/apiexec.py b/src/mapi/glapi/gen/apiexec.py
index 2a8043264eb..b4f4cf6831b 100644
--- a/src/mapi/glapi/gen/apiexec.py
+++ b/src/mapi/glapi/gen/apiexec.py
@@ -68,7 +68,7 @@ class exec_info():
 functions = {
     # OpenGL 3.1 / GL_ARB_texture_buffer_object.  Mesa only exposes this
     # extension with core profile.
-    "TexBuffer": exec_info(core=31),
+    "TexBuffer": exec_info(core=31, es2=31),
 
     # OpenGL 3.2 / GL_OES_geometry_shader.
     "FramebufferTexture": exec_info(core=32, es2=31),
@@ -146,7 +146,7 @@ functions = {
 
     # OpenGL 4.3 / GL_ARB_texture_buffer_range.  Mesa can expose the extension
     # with OpenGL 3.1.
-    "TexBufferRange": exec_info(core=31),
+    "TexBufferRange": exec_info(core=31, es2=31),
 
     # OpenGL 4.3 / GL_ARB_framebuffer_no_attachments.  Mesa can expose the
     # extension with OpenGL 3.0.
diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index 178f7c027bc..8f8f997b20d 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -847,6 +847,24 @@
 
 </category>
 
+<category name="GL_EXT_texture_buffer" number="183">
+
+    <function name="TexBufferEXT" es2="3.1" alias="TexBuffer">
+        <param name="target" type="GLenum"/>
+        <param name="internalFormat" type="GLenum"/>
+        <param name="buffer" type="GLuint"/>
+    </function>
+
+    <function name="TexBufferRangeEXT" es2="3.1" alias="TexBufferRange">
+        <param name="target" type="GLenum"/>
+        <param name="internalformat" type="GLenum"/>
+        <param name="buffer" type="GLuint"/>
+        <param name="offset" type="GLintptr"/>
+        <param name="size" type="GLsizeiptr"/>
+    </function>
+
+</category>
+
 <category name="GL_EXT_draw_elements_base_vertex" number="204">
 
     <function name="DrawElementsBaseVertexEXT" alias="DrawElementsBaseVertex"
@@ -891,6 +909,24 @@
 
 </category>
 
+<category name="GL_OES_texture_buffer" number="216">
+
+    <function name="TexBufferOES" es2="3.1" alias="TexBuffer">
+        <param name="target" type="GLenum"/>
+        <param name="internalFormat" type="GLenum"/>
+        <param name="buffer" type="GLuint"/>
+    </function>
+
+    <function name="TexBufferRangeOES" es2="3.1" alias="TexBufferRange">
+        <param name="target" type="GLenum"/>
+        <param name="internalformat" type="GLenum"/>
+        <param name="buffer" type="GLuint"/>
+        <param name="offset" type="GLintptr"/>
+        <param name="size" type="GLsizeiptr"/>
+    </function>
+
+</category>
+
 <category name="GL_OES_draw_elements_base_vertex" number="219">
 
     <function name="DrawElementsBaseVertexOES" alias="DrawElementsBaseVertex"
diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 9aec42508a7..731b62ebe21 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -148,8 +148,8 @@ get_buffer_target(struct gl_context *ctx, GLenum target)
       }
       break;
    case GL_TEXTURE_BUFFER:
-      if (ctx->API == API_OPENGL_CORE &&
-          ctx->Extensions.ARB_texture_buffer_object) {
+      if (_mesa_has_ARB_texture_buffer_object(ctx) ||
+          _mesa_has_OES_texture_buffer(ctx)) {
          return &ctx->Texture.BufferObject;
       }
       break;
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index b0fadc93aef..88efd3ee642 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -1907,8 +1907,8 @@ tex_binding_to_index(const struct gl_context *ctx, GLenum binding)
          || _mesa_is_gles3(ctx)
          ? TEXTURE_2D_ARRAY_INDEX : -1;
    case GL_TEXTURE_BINDING_BUFFER:
-      return ctx->API == API_OPENGL_CORE &&
-             ctx->Extensions.ARB_texture_buffer_object ?
+      return (_mesa_has_ARB_texture_buffer_object(ctx) ||
+              _mesa_has_OES_texture_buffer(ctx)) ?
              TEXTURE_BUFFER_INDEX : -1;
    case GL_TEXTURE_BINDING_CUBE_MAP_ARRAY:
       return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_cube_map_array
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 12c21899cb1..62968fc0300 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -535,6 +535,16 @@ descriptor=[
 
 # GL_ARB_gpu_shader5 / GL_OES_geometry_shader
   [ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5_or_oes_geometry_shader" ],
+
+# GL_ARB_texture_buffer_object / GL_OES_texture_buffer
+  [ "MAX_TEXTURE_BUFFER_SIZE_ARB", "CONTEXT_INT(Const.MaxTextureBufferSize), extra_texture_buffer_object" ],
+  [ "TEXTURE_BINDING_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
+  [ "TEXTURE_BUFFER_DATA_STORE_BINDING_ARB", "LOC_CUSTOM, TYPE_INT, TEXTURE_BUFFER_INDEX, extra_texture_buffer_object" ],
+  [ "TEXTURE_BUFFER_FORMAT_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
+  [ "TEXTURE_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
+
+# GL_ARB_texture_buffer_range
+  [ "TEXTURE_BUFFER_OFFSET_ALIGNMENT", "CONTEXT_INT(Const.TextureBufferOffsetAlignment), extra_ARB_texture_buffer_range" ],
 ]},
 
 # Remaining enums are only in OpenGL
@@ -805,13 +815,6 @@ descriptor=[
 # GL_ARB_color_buffer_float
   [ "RGBA_FLOAT_MODE_ARB", "BUFFER_FIELD(Visual.floatMode, TYPE_BOOLEAN), extra_core_ARB_color_buffer_float_and_new_buffers" ],
 
-# GL_ARB_texture_buffer_object
-  [ "MAX_TEXTURE_BUFFER_SIZE_ARB", "CONTEXT_INT(Const.MaxTextureBufferSize), extra_texture_buffer_object" ],
-  [ "TEXTURE_BINDING_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
-  [ "TEXTURE_BUFFER_DATA_STORE_BINDING_ARB", "LOC_CUSTOM, TYPE_INT, TEXTURE_BUFFER_INDEX, extra_texture_buffer_object" ],
-  [ "TEXTURE_BUFFER_FORMAT_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
-  [ "TEXTURE_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
-
 # GL 3.0
   [ "CONTEXT_FLAGS", "CONTEXT_INT(Const.ContextFlags), extra_version_30" ],
 
@@ -871,9 +874,6 @@ descriptor=[
 
 # Enums restricted to OpenGL Core profile
 { "apis": ["GL_CORE"], "params": [
-# GL_ARB_texture_buffer_range
-  [ "TEXTURE_BUFFER_OFFSET_ALIGNMENT", "CONTEXT_INT(Const.TextureBufferOffsetAlignment), extra_ARB_texture_buffer_range" ],
-
 # GL_ARB_viewport_array
   [ "MAX_VIEWPORTS", "CONTEXT_INT(Const.MaxViewports), extra_ARB_viewport_array" ],
   [ "VIEWPORT_SUBPIXEL_BITS", "CONTEXT_INT(Const.ViewportSubpixelBits), extra_ARB_viewport_array" ],
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index 09b97c33074..309e574ac32 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -2450,6 +2450,10 @@ const struct function gles3_functions_possible[] = {
    { "glGetSamplerParameterIivOES", 30, -1 },
    { "glGetSamplerParameterIuivOES", 30, -1 },
 
+   /* GL_OES_texture_buffer */
+   { "glTexBufferOES", 31, -1 },
+   { "glTexBufferRangeOES", 31, -1 },
+
    { NULL, 0, -1 }
 };
 
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 5af0a2ec198..6ac6fb109d3 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -499,8 +499,8 @@ _mesa_max_texture_levels(struct gl_context *ctx, GLenum target)
       return ctx->Extensions.ARB_texture_cube_map_array
          ? ctx->Const.MaxCubeTextureLevels : 0;
    case GL_TEXTURE_BUFFER:
-      return ctx->API == API_OPENGL_CORE &&
-             ctx->Extensions.ARB_texture_buffer_object ? 1 : 0;
+      return (_mesa_has_ARB_texture_buffer_object(ctx) ||
+              _mesa_has_OES_texture_buffer(ctx)) ? 1 : 0;
    case GL_TEXTURE_2D_MULTISAMPLE:
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
@@ -4717,7 +4717,7 @@ _mesa_CompressedTextureSubImage3D(GLuint texture, GLint level, GLint xoffset,
 static mesa_format
 get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
 {
-   if (ctx->API != API_OPENGL_CORE) {
+   if (ctx->API == API_OPENGL_COMPAT) {
       switch (internalFormat) {
       case GL_ALPHA8:
          return MESA_FORMAT_A_UNORM8;
@@ -4804,8 +4804,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
       }
    }
 
-   if (ctx->API == API_OPENGL_CORE &&
-       ctx->Extensions.ARB_texture_buffer_object_rgb32) {
+   if (_mesa_has_ARB_texture_buffer_object_rgb32(ctx) ||
+       _mesa_has_OES_texture_buffer(ctx)) {
       switch (internalFormat) {
       case GL_RGB32F:
          return MESA_FORMAT_RGB_FLOAT32;
@@ -4822,6 +4822,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
    case GL_RGBA8:
       return MESA_FORMAT_R8G8B8A8_UNORM;
    case GL_RGBA16:
+      if (_mesa_is_gles(ctx))
+         return MESA_FORMAT_NONE;
       return MESA_FORMAT_RGBA_UNORM16;
    case GL_RGBA16F_ARB:
       return MESA_FORMAT_RGBA_FLOAT16;
@@ -4843,6 +4845,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
    case GL_RG8:
       return MESA_FORMAT_R8G8_UNORM;
    case GL_RG16:
+      if (_mesa_is_gles(ctx))
+         return MESA_FORMAT_NONE;
       return MESA_FORMAT_R16G16_UNORM;
    case GL_RG16F:
       return MESA_FORMAT_RG_FLOAT16;
@@ -4864,6 +4868,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
    case GL_R8:
       return MESA_FORMAT_R_UNORM8;
    case GL_R16:
+      if (_mesa_is_gles(ctx))
+         return MESA_FORMAT_NONE;
       return MESA_FORMAT_R_UNORM16;
    case GL_R16F:
       return MESA_FORMAT_R_FLOAT16;
@@ -4941,8 +4947,8 @@ _mesa_texture_buffer_range(struct gl_context *ctx,
    /* NOTE: ARB_texture_buffer_object has interactions with
     * the compatibility profile that are not implemented.
     */
-   if (!(ctx->API == API_OPENGL_CORE &&
-         ctx->Extensions.ARB_texture_buffer_object)) {
+   if (!_mesa_has_ARB_texture_buffer_object(ctx) &&
+       !_mesa_has_OES_texture_buffer(ctx)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(ARB_texture_buffer_object is not"
                   " implemented for the compatibility profile)", caller);
diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index d8407f04340..c9502bda236 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -204,8 +204,8 @@ _mesa_get_current_tex_object(struct gl_context *ctx, GLenum target)
       case GL_PROXY_TEXTURE_2D_ARRAY_EXT:
          return arrayTex ? ctx->Texture.ProxyTex[TEXTURE_2D_ARRAY_INDEX] : NULL;
       case GL_TEXTURE_BUFFER:
-         return ctx->API == API_OPENGL_CORE &&
-                ctx->Extensions.ARB_texture_buffer_object ?
+         return (_mesa_has_ARB_texture_buffer_object(ctx) ||
+                 _mesa_has_OES_texture_buffer(ctx)) ?
                 texUnit->CurrentTex[TEXTURE_BUFFER_INDEX] : NULL;
       case GL_TEXTURE_EXTERNAL_OES:
          return _mesa_is_gles(ctx) && ctx->Extensions.OES_EGL_image_external
@@ -1574,8 +1574,8 @@ _mesa_tex_target_to_index(const struct gl_context *ctx, GLenum target)
          || _mesa_is_gles3(ctx)
          ? TEXTURE_2D_ARRAY_INDEX : -1;
    case GL_TEXTURE_BUFFER:
-      return ctx->API == API_OPENGL_CORE &&
-             ctx->Extensions.ARB_texture_buffer_object ?
+      return (_mesa_has_ARB_texture_buffer_object(ctx) ||
+              _mesa_has_OES_texture_buffer(ctx)) ?
              TEXTURE_BUFFER_INDEX : -1;
    case GL_TEXTURE_EXTERNAL_OES:
       return _mesa_is_gles(ctx) && ctx->Extensions.OES_EGL_image_external
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index 8a3e02f0552..ba83f8fda9a 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -1223,6 +1223,26 @@ _mesa_legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target
    case GL_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
       return ctx->Extensions.ARB_texture_multisample;
+   case GL_TEXTURE_BUFFER:
+      /* GetTexLevelParameter accepts GL_TEXTURE_BUFFER in GL 3.1+ contexts,
+       * but not in earlier versions that expose ARB_texture_buffer_object.
+       *
+       * From the ARB_texture_buffer_object spec:
+       * "(7) Do buffer textures support texture parameters (TexParameter) or
+       *      queries (GetTexParameter, GetTexLevelParameter, GetTexImage)?
+       *
+       *    RESOLVED:  No. [...] Note that the spec edits above don't add
+       *    explicit error language for any of these cases.  That is because
+       *    each of the functions enumerate the set of valid <target>
+       *    parameters.  Not editing the spec to allow TEXTURE_BUFFER_ARB in
+       *    these cases means that target is not legal, and an INVALID_ENUM
+       *    error should be generated."
+       *
+       * From the OpenGL 3.1 spec:
+       * "target may also be TEXTURE_BUFFER, indicating the texture buffer."
+       */
+      return (ctx->API == API_OPENGL_CORE && ctx->Version >= 31) ||
+         _mesa_has_OES_texture_buffer(ctx);
    }
 
    if (!_mesa_is_desktop_gl(ctx))
@@ -1247,25 +1267,6 @@ _mesa_legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target
    case GL_PROXY_TEXTURE_1D_ARRAY_EXT:
    case GL_PROXY_TEXTURE_2D_ARRAY_EXT:
       return ctx->Extensions.EXT_texture_array;
-   case GL_TEXTURE_BUFFER:
-      /* GetTexLevelParameter accepts GL_TEXTURE_BUFFER in GL 3.1+ contexts,
-       * but not in earlier versions that expose ARB_texture_buffer_object.
-       *
-       * From the ARB_texture_buffer_object spec:
-       * "(7) Do buffer textures support texture parameters (TexParameter) or
-       *      queries (GetTexParameter, GetTexLevelParameter, GetTexImage)?
-       *
-       *    RESOLVED:  No. [...] Note that the spec edits above don't add
-       *    explicit error language for any of these cases.  That is because
-       *    each of the functions enumerate the set of valid <target>
-       *    parameters.  Not editing the spec to allow TEXTURE_BUFFER_ARB in
-       *    these cases means that target is not legal, and an INVALID_ENUM
-       *    error should be generated."
-       *
-       * From the OpenGL 3.1 spec:
-       * "target may also be TEXTURE_BUFFER, indicating the texture buffer."
-       */
-      return ctx->API == API_OPENGL_CORE && ctx->Version >= 31;
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE:
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE_ARRAY:
       return ctx->Extensions.ARB_texture_multisample;

From 731870fbe31d00fd9d734c4a6ebdecb944181eb8 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 10:08:50 -0700
Subject: [PATCH 093/238] nir/Makefile: Fix alphabetization

Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/compiler/Makefile.sources     | 6 +++---
 src/compiler/nir/Makefile.sources | 6 +++---
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 1f8517282ef..0aee2006902 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -178,10 +178,10 @@ NIR_FILES = \
 	nir/nir_from_ssa.c \
 	nir/nir_gs_count_vertices.c \
 	nir/nir_inline_functions.c \
-	nir/nir_intrinsics.c \
-	nir/nir_intrinsics.h \
 	nir/nir_instr_set.c \
 	nir/nir_instr_set.h \
+	nir/nir_intrinsics.c \
+	nir/nir_intrinsics.h \
 	nir/nir_liveness.c \
 	nir/nir_lower_alu_to_scalar.c \
 	nir/nir_lower_atomics.c \
@@ -220,8 +220,8 @@ NIR_FILES = \
 	nir/nir_phi_builder.c \
 	nir/nir_phi_builder.h \
 	nir/nir_print.c \
-	nir/nir_repair_ssa.c \
 	nir/nir_remove_dead_variables.c \
+	nir/nir_repair_ssa.c \
 	nir/nir_search.c \
 	nir/nir_search.h \
 	nir/nir_split_var_copies.c \
diff --git a/src/compiler/nir/Makefile.sources b/src/compiler/nir/Makefile.sources
index 00576f062c5..34743024551 100644
--- a/src/compiler/nir/Makefile.sources
+++ b/src/compiler/nir/Makefile.sources
@@ -21,10 +21,10 @@ NIR_FILES = \
 	nir_from_ssa.c \
 	nir_gs_count_vertices.c \
 	nir_inline_functions.c \
-	nir_intrinsics.c \
-	nir_intrinsics.h \
 	nir_instr_set.c \
 	nir_instr_set.h \
+	nir_intrinsics.c \
+	nir_intrinsics.h \
 	nir_liveness.c \
 	nir_lower_alu_to_scalar.c \
 	nir_lower_atomics.c \
@@ -63,8 +63,8 @@ NIR_FILES = \
 	nir_phi_builder.c \
 	nir_phi_builder.h \
 	nir_print.c \
-	nir_repair_ssa.c \
 	nir_remove_dead_variables.c \
+	nir_repair_ssa.c \
 	nir_search.c \
 	nir_search.h \
 	nir_split_var_copies.c \

From da422663a6cacefcfae6be39154ab7598072cafa Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 10:18:35 -0700
Subject: [PATCH 094/238] nir: Add a variable_foreach_safe helper

Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/compiler/nir/nir.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 37d2907a82b..6bd871dc43a 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -334,6 +334,9 @@ typedef struct nir_variable {
 #define nir_foreach_variable(var, var_list) \
    foreach_list_typed(nir_variable, var, node, var_list)
 
+#define nir_foreach_variable_safe(var, var_list) \
+   foreach_list_typed_safe(nir_variable, var, node, var_list)
+
 static inline bool
 nir_variable_is_global(const nir_variable *var)
 {

From 77e2ac1da731f10d823ed9f477cc6898039dcec4 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 10:34:17 -0700
Subject: [PATCH 095/238] nir/builder: Add a helper for building fdot
 instructions

Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/compiler/nir/nir_builder.h | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index b245f48c96d..4df79f58a7e 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -305,6 +305,23 @@ nir_swizzle(nir_builder *build, nir_ssa_def *src, unsigned swiz[4],
                      nir_imov_alu(build, alu_src, num_components);
 }
 
+/* Selects the right fdot given the number of components in each source. */
+static inline nir_ssa_def *
+nir_fdot(nir_builder *build, nir_ssa_def *src0, nir_ssa_def *src1)
+{
+   assert(src0->num_components == src1->num_components);
+   switch (src0->num_components) {
+   case 1: return nir_fmul(build, src0, src1);
+   case 2: return nir_fdot2(build, src0, src1);
+   case 3: return nir_fdot3(build, src0, src1);
+   case 4: return nir_fdot4(build, src0, src1);
+   default:
+      unreachable("bad component size");
+   }
+
+   return NULL;
+}
+
 static inline nir_ssa_def *
 nir_channel(nir_builder *b, nir_ssa_def *def, unsigned c)
 {

From 6a2479d61804e6cfc7389e1185139d0c8be758e3 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 10:35:03 -0700
Subject: [PATCH 096/238] nir/builder: Add a helper for storing to variable
 derefs

Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/compiler/nir/nir_builder.h | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index 4df79f58a7e..e9c409b8d12 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -398,6 +398,22 @@ nir_store_var(nir_builder *build, nir_variable *var, nir_ssa_def *value,
    nir_builder_instr_insert(build, &store->instr);
 }
 
+static inline void
+nir_store_deref_var(nir_builder *build, nir_deref_var *deref,
+                    nir_ssa_def *value, unsigned writemask)
+{
+   const unsigned num_components =
+      glsl_get_vector_elements(nir_deref_tail(&deref->deref)->type);
+
+   nir_intrinsic_instr *store =
+      nir_intrinsic_instr_create(build->shader, nir_intrinsic_store_var);
+   store->num_components = num_components;
+   store->const_index[0] = writemask & ((1 << num_components) - 1);
+   store->variables[0] = nir_deref_as_var(nir_copy_deref(store, &deref->deref));
+   store->src[0] = nir_src_for_ssa(value);
+   nir_builder_instr_insert(build, &store->instr);
+}
+
 static inline void
 nir_copy_deref_var(nir_builder *build, nir_deref_var *dest, nir_deref_var *src)
 {

From 1be4c61c957d656d0d874b82779b4016a8bb03fd Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 10:43:46 -0700
Subject: [PATCH 097/238] nir/builder: Add a helper for creating undefs

Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/compiler/nir/nir_builder.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index e9c409b8d12..3dc7c25ec28 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -74,6 +74,20 @@ nir_builder_cf_insert(nir_builder *build, nir_cf_node *cf)
    nir_cf_node_insert(build->cursor, cf);
 }
 
+/* Creates an undef SSA value and inserts it before the body of build->impl. */
+static inline nir_ssa_def *
+nir_ssa_undef(nir_builder *build, unsigned num_components, unsigned bit_size)
+{
+   nir_ssa_undef_instr *undef =
+      nir_ssa_undef_instr_create(build->shader, num_components);
+   if (!undef) /* must be checked before undef is dereferenced below */
+      return NULL;
+
+   undef->def.bit_size = bit_size;
+   nir_instr_insert(nir_before_cf_list(&build->impl->body), &undef->instr);
+   return &undef->def;
+}
+
 static inline nir_ssa_def *
 nir_build_imm(nir_builder *build, unsigned num_components, nir_const_value value)
 {

From 49be812be60e7fab949bcd352583649a1dbf06b4 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 11:10:30 -0700
Subject: [PATCH 098/238] nir/sweep: Sweep function parameters

They are no longer in the list of local variables so we need to explicitly
sweep them.

Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/compiler/nir/nir_sweep.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/compiler/nir/nir_sweep.c b/src/compiler/nir/nir_sweep.c
index 5c62154ec7f..b22f0f56569 100644
--- a/src/compiler/nir/nir_sweep.c
+++ b/src/compiler/nir/nir_sweep.c
@@ -119,6 +119,8 @@ sweep_impl(nir_shader *nir, nir_function_impl *impl)
    ralloc_steal(nir, impl);
 
    ralloc_steal(nir, impl->params);
+   for (unsigned i = 0; i < impl->num_params; i++)
+      ralloc_steal(nir, impl->params[i]);
    ralloc_steal(nir, impl->return_var);
    steal_list(nir, nir_variable, &impl->locals);
    steal_list(nir, nir_register, &impl->registers);

From 38de85f9a5f3daae65ebe715f29fe2783e4ea146 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 14:07:41 -0700
Subject: [PATCH 099/238] nir: Add a helper for getting the unique function in
 a shader

Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/compiler/nir/nir.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 6bd871dc43a..cab304814a2 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1761,6 +1761,17 @@ typedef struct nir_shader {
    gl_shader_stage stage;
 } nir_shader;
 
+static inline nir_function *
+nir_shader_get_entrypoint(nir_shader *shader)
+{
+   assert(exec_list_length(&shader->functions) == 1);
+   struct exec_node *func_node = exec_list_get_head(&shader->functions);
+   nir_function *func = exec_node_data(nir_function, func_node, node);
+   assert(func->return_type == glsl_void_type());
+   assert(func->num_params == 0);
+   return func;
+}
+
 #define nir_foreach_function(shader, func) \
    foreach_list_typed(nir_function, func, node, &(shader)->functions)
 

From 31a5bec93fd6a39b0bc124965e828315aff105d2 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 14:17:18 -0700
Subject: [PATCH 100/238] nir/lower_out_to_temp: Steal the output's constant
 initializer

Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/compiler/nir/nir_lower_outputs_to_temporaries.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/compiler/nir/nir_lower_outputs_to_temporaries.c b/src/compiler/nir/nir_lower_outputs_to_temporaries.c
index 71b06b81fcc..80c9af45f2f 100644
--- a/src/compiler/nir/nir_lower_outputs_to_temporaries.c
+++ b/src/compiler/nir/nir_lower_outputs_to_temporaries.c
@@ -97,6 +97,9 @@ nir_lower_outputs_to_temporaries(nir_shader *shader)
       /* Reparent the name to the new variable */
       ralloc_steal(output, output->name);
 
+      /* Reparent the constant initializer (if any) */
+      ralloc_steal(output, output->constant_initializer);
+
       /* Give the output a new name with @out-temp appended */
       temp->name = ralloc_asprintf(var, "%s@out-temp", output->name);
       temp->data.mode = nir_var_global;

From be98c475284823abdd9a6102c1c65ed548d9afd2 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 14:11:19 -0700
Subject: [PATCH 101/238] nir/lower_out_to_temp: Add an "entrypoint" parameter

Previously, the pass assumed that the entrypoint would be whatever function
happened to have the name "main".  We really shouldn't trust in the
function names.

Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/compiler/nir/glsl_to_nir.cpp                    | 2 +-
 src/compiler/nir/nir.h                              | 3 ++-
 src/compiler/nir/nir_lower_outputs_to_temporaries.c | 4 ++--
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp
index f6e1a17a916..1fac481ec9f 100644
--- a/src/compiler/nir/glsl_to_nir.cpp
+++ b/src/compiler/nir/glsl_to_nir.cpp
@@ -143,7 +143,7 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
    v2.run(sh->ir);
    visit_exec_list(sh->ir, &v1);
 
-   nir_lower_outputs_to_temporaries(shader);
+   nir_lower_outputs_to_temporaries(shader, nir_shader_get_entrypoint(shader));
 
    shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name);
    if (shader_prog->Label)
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index cab304814a2..d5c6a056164 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -2174,7 +2174,8 @@ bool nir_lower_indirect_derefs(nir_shader *shader, uint32_t mode_mask);
 
 bool nir_lower_locals_to_regs(nir_shader *shader);
 
-void nir_lower_outputs_to_temporaries(nir_shader *shader);
+void nir_lower_outputs_to_temporaries(nir_shader *shader,
+                                      nir_function *entrypoint);
 
 void nir_assign_var_locations(struct exec_list *var_list,
                               unsigned *size,
diff --git a/src/compiler/nir/nir_lower_outputs_to_temporaries.c b/src/compiler/nir/nir_lower_outputs_to_temporaries.c
index 80c9af45f2f..00ac09114cf 100644
--- a/src/compiler/nir/nir_lower_outputs_to_temporaries.c
+++ b/src/compiler/nir/nir_lower_outputs_to_temporaries.c
@@ -74,7 +74,7 @@ emit_output_copies_block(nir_block *block, void *state)
 }
 
 void
-nir_lower_outputs_to_temporaries(nir_shader *shader)
+nir_lower_outputs_to_temporaries(nir_shader *shader, nir_function *entrypoint)
 {
    struct lower_outputs_state state;
 
@@ -117,7 +117,7 @@ nir_lower_outputs_to_temporaries(nir_shader *shader)
           * before each EmitVertex call.
           */
          nir_foreach_block(function->impl, emit_output_copies_block, &state);
-      } else if (strcmp(function->name, "main") == 0) {
+      } else if (function == entrypoint) {
          /* For all other shader types, we need to do the copies right before
           * the jumps to the end block.
           */

From 35e2e96b307bcd6dd839a11e2bd98fa22bd4d50a Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 14:16:47 -0700
Subject: [PATCH 102/238] nir: Add a helper for getting the current block from
 a cursor

Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/compiler/nir/nir.h | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index d5c6a056164..c19ae5948bd 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1881,6 +1881,17 @@ typedef struct {
    };
 } nir_cursor;
 
+static inline nir_block *
+nir_cursor_current_block(nir_cursor cursor)
+{
+   if (cursor.option == nir_cursor_before_instr ||
+       cursor.option == nir_cursor_after_instr) {
+      return cursor.instr->block;
+   } else {
+      return cursor.block;
+   }
+}
+
 bool nir_cursors_equal(nir_cursor a, nir_cursor b);
 
 static inline nir_cursor

From 8568d02498d12ebde6a6245056eebfbfe18aaf8f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Thu, 25 Feb 2016 11:11:54 +0100
Subject: [PATCH 103/238] glsl: add is_lhs bool on ast_expression

Useful to know whether an expression is the recipient of an assignment
or not. This will be used to (for example) raise "use of uninitialized
variable" warnings without getting a false positive when a variable
is first assigned.

By default the value is false, and it is set to true in
the following cases:
 * On the lhs subexpression of an assignment.
 * At ast_array_index, on the array itself.
 * While handling a method call on an array, to avoid the warning
   when calling array.length.
 * When computing the cached test expression at test_to_hir, to
   avoid a duplicate warning on the test expression of a switch.

A set_is_lhs setter is added because in some cases (like ast_field_selection)
the value needs to be propagated down the expression tree. To avoid doing the
propagation when it is not needed, it is skipped if no
primary_expression.identifier is available.

v2: use a new bool on ast_expression, instead of a new parameter
    on ast_expression::hir (Timothy Arceri)

v3: fix style and some typos on comments, initialize is_lhs default value
    on constructor, to avoid a c++11 feature (Ian Romanick)

v4: some tweaks on comments (Timothy Arceri)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94129

Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
---
 src/compiler/glsl/ast.h                  |  6 +++++
 src/compiler/glsl/ast_function.cpp       |  4 +++
 src/compiler/glsl/ast_to_hir.cpp         | 33 ++++++++++++++++++++++++
 src/compiler/glsl/glsl_parser_extras.cpp |  1 +
 4 files changed, 44 insertions(+)

diff --git a/src/compiler/glsl/ast.h b/src/compiler/glsl/ast.h
index 727aa432631..9f46340e6e2 100644
--- a/src/compiler/glsl/ast.h
+++ b/src/compiler/glsl/ast.h
@@ -214,6 +214,7 @@ public:
       subexpressions[2] = NULL;
       primary_expression.identifier = identifier;
       this->non_lvalue_description = NULL;
+      this->is_lhs = false;
    }
 
    static const char *operator_string(enum ast_operators op);
@@ -263,6 +264,11 @@ public:
     * This pointer may be \c NULL.
     */
    const char *non_lvalue_description;
+
+   void set_is_lhs(bool new_value);
+
+private:
+   bool is_lhs;
 };
 
 class ast_expression_bin : public ast_expression {
diff --git a/src/compiler/glsl/ast_function.cpp b/src/compiler/glsl/ast_function.cpp
index 1a440203cfc..db68d5dfa48 100644
--- a/src/compiler/glsl/ast_function.cpp
+++ b/src/compiler/glsl/ast_function.cpp
@@ -1727,6 +1727,10 @@ ast_function_expression::handle_method(exec_list *instructions,
    const char *method;
    method = field->primary_expression.identifier;
 
+   /* This prevents "uninitialized variable" warnings from being raised when
+    * calling array.length.
+    */
+   field->subexpressions[0]->set_is_lhs(true);
    op = field->subexpressions[0]->hir(instructions, state);
    if (strcmp(method, "length") == 0) {
       if (!this->expressions.is_empty()) {
diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 35def8e3ae0..e162203c1c8 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -1248,6 +1248,24 @@ ast_expression::hir_no_rvalue(exec_list *instructions,
    do_hir(instructions, state, false);
 }
 
+void
+ast_expression::set_is_lhs(bool new_value)
+{
+   /* is_lhs is tracked only to print "variable used uninitialized" warnings;
+    * if we lack an identifier we can just skip it.
+    */
+   if (this->primary_expression.identifier == NULL)
+      return;
+
+   this->is_lhs = new_value;
+
+   /* We need to go through the subexpressions tree to cover cases like
+    * ast_field_selection
+    */
+   if (this->subexpressions[0] != NULL)
+      this->subexpressions[0]->set_is_lhs(new_value);
+}
+
 ir_rvalue *
 ast_expression::do_hir(exec_list *instructions,
                        struct _mesa_glsl_parse_state *state,
@@ -1323,6 +1341,7 @@ ast_expression::do_hir(exec_list *instructions,
       break;
 
    case ast_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
 
@@ -1592,6 +1611,7 @@ ast_expression::do_hir(exec_list *instructions,
    case ast_div_assign:
    case ast_add_assign:
    case ast_sub_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
 
@@ -1618,6 +1638,7 @@ ast_expression::do_hir(exec_list *instructions,
    }
 
    case ast_mod_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
 
@@ -1640,6 +1661,7 @@ ast_expression::do_hir(exec_list *instructions,
 
    case ast_ls_assign:
    case ast_rs_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
       type = shift_result_type(op[0]->type, op[1]->type, this->oper, state,
@@ -1658,6 +1680,7 @@ ast_expression::do_hir(exec_list *instructions,
    case ast_and_assign:
    case ast_xor_assign:
    case ast_or_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
       type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc);
@@ -1839,6 +1862,11 @@ ast_expression::do_hir(exec_list *instructions,
    case ast_array_index: {
       YYLTYPE index_loc = subexpressions[1]->get_location();
 
+      /* Detecting whether an array is used uninitialized is beyond what we get
+       * from ir_value.data.assigned. Setting is_lhs to true suppresses
+       * the uninitialized warning when using an array.
+       */
+      subexpressions[0]->set_is_lhs(true);
       op[0] = subexpressions[0]->hir(instructions, state);
       op[1] = subexpressions[1]->hir(instructions, state);
 
@@ -5746,6 +5774,11 @@ ast_switch_statement::test_to_hir(exec_list *instructions,
 {
    void *ctx = state;
 
+   /* set to true to avoid a duplicate "use of uninitialized variable" warning
+    * on the switch test case. The first one would be already raised when
+    * getting the test_expression at ast_switch_statement::hir
+    */
+   test_expression->set_is_lhs(true);
    /* Cache value of test expression. */
    ir_rvalue *const test_val =
       test_expression->hir(instructions,
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index 1d9bfd6aaba..ea9639b728d 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -1206,6 +1206,7 @@ ast_expression::ast_expression(int oper,
    this->subexpressions[1] = ex1;
    this->subexpressions[2] = ex2;
    this->non_lvalue_description = NULL;
+   this->is_lhs = false;
 }
 
 

From dcd41ca87a06199184eb8ada654aec985185189c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Tue, 23 Feb 2016 11:48:52 +0100
Subject: [PATCH 104/238] glsl: raise warning when using uninitialized
 variables

v2:
 * Take into account out varyings too (Timothy Arceri)
 * Fix style (Timothy Arceri)
 * Use a new ast_expression variable, instead of an
   ast_expression::hir new parameter (Timothy Arceri)

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=94129

Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
---
 src/compiler/glsl/ast_to_hir.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index e162203c1c8..29a4642af2c 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -1901,6 +1901,13 @@ ast_expression::do_hir(exec_list *instructions,
       if (var != NULL) {
          var->data.used = true;
          result = new(ctx) ir_dereference_variable(var);
+
+         if ((var->data.mode == ir_var_auto || var->data.mode == ir_var_shader_out)
+             && !this->is_lhs
+             && result->variable_referenced()->data.assigned != true) {
+            _mesa_glsl_warning(&loc, state, "`%s' used uninitialized",
+                               this->primary_expression.identifier);
+         }
       } else {
          _mesa_glsl_error(& loc, state, "`%s' undeclared",
                           this->primary_expression.identifier);

From ea0f62e45eab39a8957f933325a1e16ace5c9e8c Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 24 Mar 2016 13:57:58 -0400
Subject: [PATCH 105/238] glapi/glx: Sync some additional error checking from
 xserver

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Signed-off-by: Adam Jackson <ajax@redhat.com>
---
 src/mapi/glapi/gen/glX_proto_recv.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/mapi/glapi/gen/glX_proto_recv.py b/src/mapi/glapi/gen/glX_proto_recv.py
index 1cfa8c5c142..cf5b59c124a 100644
--- a/src/mapi/glapi/gen/glX_proto_recv.py
+++ b/src/mapi/glapi/gen/glX_proto_recv.py
@@ -437,6 +437,10 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
                 print '        %s %s = __glXGetAnswerBuffer(cl, %s%s, answerBuffer, sizeof(answerBuffer), %u);' % (param.type_string(), param.name, param.counter, size_scale, type_size)
                 answer_string = param.name
                 answer_count = param.counter
+                print ''
+                print '        if (%s == NULL) return BadAlloc;' % (param.name)
+                print '        __glXClearErrorOccured();'
+                print ''
             elif c >= 1:
                 print '        %s %s[%u];' % (answer_type, param.name, c)
                 answer_string = param.name

From 5e1aec6db073416ae6ddf0eeaaea4964b489af8e Mon Sep 17 00:00:00 2001
From: Adam Jackson <ajax@redhat.com>
Date: Thu, 24 Mar 2016 13:57:58 -0400
Subject: [PATCH 106/238] glapi/glx: Mark the indirect swapped dispatch
 functions _X_COLD

A modest size savings:

   text	   data	    bss	    dec	    hex	filename
 264143	  15608	    232	 279983	  445af libglx.so.before
 254303	  15608	    232	 270143	  41f3f libglx.so.after

Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Signed-off-by: Adam Jackson <ajax@redhat.com>
---
 src/mapi/glapi/gen/glX_proto_recv.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/mapi/glapi/gen/glX_proto_recv.py b/src/mapi/glapi/gen/glX_proto_recv.py
index cf5b59c124a..afee3882254 100644
--- a/src/mapi/glapi/gen/glX_proto_recv.py
+++ b/src/mapi/glapi/gen/glX_proto_recv.py
@@ -55,15 +55,15 @@ class PrintGlxDispatch_h(gl_XML.gl_print_base):
             if not func.ignore and not func.vectorequiv:
                 if func.glx_rop:
                     print 'extern _X_HIDDEN void __glXDisp_%s(GLbyte * pc);' % (func.name)
-                    print 'extern _X_HIDDEN void __glXDispSwap_%s(GLbyte * pc);' % (func.name)
+                    print 'extern _X_HIDDEN _X_COLD void __glXDispSwap_%s(GLbyte * pc);' % (func.name)
                 elif func.glx_sop or func.glx_vendorpriv:
                     print 'extern _X_HIDDEN int __glXDisp_%s(struct __GLXclientStateRec *, GLbyte *);' % (func.name)
-                    print 'extern _X_HIDDEN int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (func.name)
+                    print 'extern _X_HIDDEN _X_COLD int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (func.name)
 
                     if func.glx_sop and func.glx_vendorpriv:
                         n = func.glx_vendorpriv_names[0]
                         print 'extern _X_HIDDEN int __glXDisp_%s(struct __GLXclientStateRec *, GLbyte *);' % (n)
-                        print 'extern _X_HIDDEN int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (n)
+                        print 'extern _X_HIDDEN _X_COLD int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (n)
 
         return
 

From 9286cbdd1e2b0ef32db61f0fb4915ea2948ad4cd Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 28 Mar 2016 20:59:13 -0400
Subject: [PATCH 107/238] st/mesa: enable OES_texture_buffer when all
 components available
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

OES_texture_buffer combines bits from a number of desktop extensions.
When they're all available, turn it on.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/mesa/state_tracker/st_extensions.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 44d93e30b4d..36a12010c23 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -1016,6 +1016,12 @@ void st_init_extensions(struct pipe_screen *screen,
                              PIPE_BIND_SAMPLER_VIEW);
    }
 
+   extensions->OES_texture_buffer =
+      extensions->ARB_texture_buffer_object &&
+      extensions->ARB_texture_buffer_range &&
+      extensions->ARB_texture_buffer_object_rgb32 &&
+      extensions->ARB_shader_image_load_store;
+
    /* Unpacking a varying in the fragment shader costs 1 texture indirection.
     * If the number of available texture indirections is very limited, then we
     * prefer to disable varying packing rather than run the risk of varying

From 7eb5e5b8b4ae51be367050df48d3a9398694d3bc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Thu, 14 Jan 2016 13:38:10 +0100
Subject: [PATCH 108/238] radeonsi: ignore PIPE_BIND_LINEAR in
 si_is_format_supported v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Linear layout should work for all formats that are neither compressed nor
depth/stencil.

v2: restrict it a bit more

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com>
---
 src/gallium/drivers/radeonsi/si_state.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index a2b0da90ec9..10d691a92f1 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2046,6 +2046,11 @@ boolean si_is_format_supported(struct pipe_screen *screen,
 	if (usage & PIPE_BIND_TRANSFER_WRITE)
 		retval |= PIPE_BIND_TRANSFER_WRITE;
 
+	if ((usage & PIPE_BIND_LINEAR) &&
+	    !util_format_is_compressed(format) &&
+	    !(usage & PIPE_BIND_DEPTH_STENCIL))
+		retval |= PIPE_BIND_LINEAR;
+
 	return retval == usage;
 }
 

From d180de35320eafa3df3d76f0e82b332656530126 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Thu, 14 Jan 2016 13:40:25 +0100
Subject: [PATCH 109/238] st/vdpau: use linear layout for output surfaces
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Works around a bug in radeonsi; tiling is not very beneficial in this
use case anyway.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Leo Liu <leo.liu@amd.com>
---
 src/gallium/state_trackers/vdpau/output.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/state_trackers/vdpau/output.c b/src/gallium/state_trackers/vdpau/output.c
index 3248f76808d..95f15cb1264 100644
--- a/src/gallium/state_trackers/vdpau/output.c
+++ b/src/gallium/state_trackers/vdpau/output.c
@@ -79,7 +79,8 @@ vlVdpOutputSurfaceCreate(VdpDevice device,
    res_tmpl.height0 = height;
    res_tmpl.depth0 = 1;
    res_tmpl.array_size = 1;
-   res_tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
+   res_tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET |
+                   PIPE_BIND_LINEAR;
    res_tmpl.usage = PIPE_USAGE_DEFAULT;
 
    pipe_mutex_lock(dev->mutex);

From faba96bc60bfcf28678781f20ee0a42eb1950018 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Tue, 12 Jan 2016 16:07:58 +0100
Subject: [PATCH 110/238] st/vdpau: add new interop interface
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Use DMA-buf for the VDPAU interop interface instead of using
internal structures.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Leo Liu <leo.liu@amd.com>
---
 .../include/state_tracker/vdpau_dmabuf.h      | 94 +++++++++++++++++++
 .../include/state_tracker/vdpau_interop.h     |  7 +-
 2 files changed, 100 insertions(+), 1 deletion(-)
 create mode 100644 src/gallium/include/state_tracker/vdpau_dmabuf.h

diff --git a/src/gallium/include/state_tracker/vdpau_dmabuf.h b/src/gallium/include/state_tracker/vdpau_dmabuf.h
new file mode 100644
index 00000000000..886c3445d81
--- /dev/null
+++ b/src/gallium/include/state_tracker/vdpau_dmabuf.h
@@ -0,0 +1,94 @@
+/**************************************************************************
+ *
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Authors:
+ *      Christian König <christian.koenig@amd.com>
+ *
+ */
+
+#ifndef _VDPAU_DMABUF_H_
+#define _VDPAU_DMABUF_H_
+
+#include <vdpau/vdpau.h>
+
+/* driver specific functions for NV_vdpau_interop */
+#ifndef VDP_FUNC_ID_BASE_DRIVER
+#define VDP_FUNC_ID_BASE_DRIVER 0x2000
+#endif
+
+/* New DMA-buf based implementation */
+#define VDP_FUNC_ID_VIDEO_SURFACE_DMA_BUF (VDP_FUNC_ID_BASE_DRIVER + 2)
+#define VDP_FUNC_ID_OUTPUT_SURFACE_DMA_BUF (VDP_FUNC_ID_BASE_DRIVER + 3)
+
+/* Define some more internal RGBA formats for more
+ * robust handling of Video Surfaces
+ */
+#define VDP_RGBA_FORMAT_R8          (-1)
+#define VDP_RGBA_FORMAT_R8G8        (-2)
+
+struct VdpSurfaceDMABufDesc {
+   /* DMA-buf file descriptor */
+   uint32_t handle;
+   /* Width in pixel */
+   uint32_t width;
+   /* Height in pixel */
+   uint32_t height;
+   /* Offset in bytes */
+   uint32_t offset;
+   /* Stride in bytes */
+   uint32_t stride;
+   /* VDP_RGBA_FORMAT_* as defined in the VDPAU spec and above. */
+   uint32_t format;
+};
+
+/**
+ * \brief Video surface planes
+ */
+typedef uint32_t VdpVideoSurfacePlane;
+
+/** \hideinitializer \brief Luma top field */
+#define VDP_VIDEO_SURFACE_PLANE_LUMA_TOP      ((VdpVideoSurfacePlane)0)
+/** \hideinitializer \brief Luma bottom field */
+#define VDP_VIDEO_SURFACE_PLANE_LUMA_BOTTOM   ((VdpVideoSurfacePlane)1)
+/** \hideinitializer \brief Chroma top field */
+#define VDP_VIDEO_SURFACE_PLANE_CHROMA_TOP    ((VdpVideoSurfacePlane)2)
+/** \hideinitializer \brief Chroma bottom field */
+#define VDP_VIDEO_SURFACE_PLANE_CHROMA_BOTTOM ((VdpVideoSurfacePlane)3)
+
+typedef VdpStatus VdpVideoSurfaceDMABuf(
+   VdpVideoSurface               surface,
+   VdpVideoSurfacePlane          plane,
+   struct VdpSurfaceDMABufDesc * result
+);
+
+typedef VdpStatus VdpOutputSurfaceDMABuf(
+   VdpVideoSurface               surface,
+   struct VdpSurfaceDMABufDesc * result
+);
+
+#endif /* _VDPAU_DMABUF_H_ */
diff --git a/src/gallium/include/state_tracker/vdpau_interop.h b/src/gallium/include/state_tracker/vdpau_interop.h
index 3ca7c9d4aa6..04d455a370a 100644
--- a/src/gallium/include/state_tracker/vdpau_interop.h
+++ b/src/gallium/include/state_tracker/vdpau_interop.h
@@ -35,8 +35,13 @@
 #define _VDPAU_INTEROP_H_
 
 /* driver specific functions for NV_vdpau_interop */
-
+#ifndef VDP_FUNC_ID_BASE_DRIVER
 #define VDP_FUNC_ID_BASE_DRIVER 0x2000
+#endif
+
+/* Older implementation relying on passing pipe_video_buffer and
+ * pipe_resources around. Deprecated and shouldn't be used for new things.
+ */
 #define VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM (VDP_FUNC_ID_BASE_DRIVER + 0)
 #define VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM (VDP_FUNC_ID_BASE_DRIVER + 1)
 

From 0042aa508e19bb920d0ab385894cd3e03b9eafde Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Wed, 13 Jan 2016 16:42:44 +0100
Subject: [PATCH 111/238] st/vdpau: move FormatRGBAToPipe into the interop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We are going to need that in the Mesa state tracker as well.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Leo Liu <leo.liu@amd.com>
---
 .../include/state_tracker/vdpau_funcs.h       | 65 +++++++++++++++++++
 src/gallium/state_trackers/vdpau/bitmap.c     |  2 +-
 src/gallium/state_trackers/vdpau/output.c     |  2 +-
 src/gallium/state_trackers/vdpau/query.c      | 10 +--
 .../state_trackers/vdpau/vdpau_private.h      | 22 +------
 5 files changed, 73 insertions(+), 28 deletions(-)
 create mode 100644 src/gallium/include/state_tracker/vdpau_funcs.h

diff --git a/src/gallium/include/state_tracker/vdpau_funcs.h b/src/gallium/include/state_tracker/vdpau_funcs.h
new file mode 100644
index 00000000000..66e3c23ede4
--- /dev/null
+++ b/src/gallium/include/state_tracker/vdpau_funcs.h
@@ -0,0 +1,65 @@
+/**************************************************************************
+ *
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Authors:
+ *      Christian König <christian.koenig@amd.com>
+ *
+ */
+
+#ifndef _VDPAU_FUNCS_H_
+#define _VDPAU_FUNCS_H_
+
+#include "vdpau_dmabuf.h"
+
+/* Used for implementing NV_vdpau_interop */
+static inline enum pipe_format
+VdpFormatRGBAToPipe(uint32_t vdpau_format)
+{
+   switch (vdpau_format) {
+   case VDP_RGBA_FORMAT_R8:
+      return PIPE_FORMAT_R8_UNORM;
+   case VDP_RGBA_FORMAT_R8G8:
+      return PIPE_FORMAT_R8G8_UNORM;
+   case VDP_RGBA_FORMAT_A8:
+      return PIPE_FORMAT_A8_UNORM;
+   case VDP_RGBA_FORMAT_B10G10R10A2:
+      return PIPE_FORMAT_B10G10R10A2_UNORM;
+   case VDP_RGBA_FORMAT_B8G8R8A8:
+      return PIPE_FORMAT_B8G8R8A8_UNORM;
+   case VDP_RGBA_FORMAT_R10G10B10A2:
+      return PIPE_FORMAT_R10G10B10A2_UNORM;
+   case VDP_RGBA_FORMAT_R8G8B8A8:
+      return PIPE_FORMAT_R8G8B8A8_UNORM;
+   default:
+      assert(0);
+   }
+
+   return PIPE_FORMAT_NONE;
+}
+
+#endif /* _VDPAU_FUNCS_H_ */
diff --git a/src/gallium/state_trackers/vdpau/bitmap.c b/src/gallium/state_trackers/vdpau/bitmap.c
index 97a428727a5..35c8820433d 100644
--- a/src/gallium/state_trackers/vdpau/bitmap.c
+++ b/src/gallium/state_trackers/vdpau/bitmap.c
@@ -71,7 +71,7 @@ vlVdpBitmapSurfaceCreate(VdpDevice device,
 
    memset(&res_tmpl, 0, sizeof(res_tmpl));
    res_tmpl.target = PIPE_TEXTURE_2D;
-   res_tmpl.format = FormatRGBAToPipe(rgba_format);
+   res_tmpl.format = VdpFormatRGBAToPipe(rgba_format);
    res_tmpl.width0 = width;
    res_tmpl.height0 = height;
    res_tmpl.depth0 = 1;
diff --git a/src/gallium/state_trackers/vdpau/output.c b/src/gallium/state_trackers/vdpau/output.c
index 95f15cb1264..738e7c74f6b 100644
--- a/src/gallium/state_trackers/vdpau/output.c
+++ b/src/gallium/state_trackers/vdpau/output.c
@@ -74,7 +74,7 @@ vlVdpOutputSurfaceCreate(VdpDevice device,
    memset(&res_tmpl, 0, sizeof(res_tmpl));
 
    res_tmpl.target = PIPE_TEXTURE_2D;
-   res_tmpl.format = FormatRGBAToPipe(rgba_format);
+   res_tmpl.format = VdpFormatRGBAToPipe(rgba_format);
    res_tmpl.width0 = width;
    res_tmpl.height0 = height;
    res_tmpl.depth0 = 1;
diff --git a/src/gallium/state_trackers/vdpau/query.c b/src/gallium/state_trackers/vdpau/query.c
index d41e6d950a7..a279ad3d020 100644
--- a/src/gallium/state_trackers/vdpau/query.c
+++ b/src/gallium/state_trackers/vdpau/query.c
@@ -224,7 +224,7 @@ vlVdpOutputSurfaceQueryCapabilities(VdpDevice device, VdpRGBAFormat surface_rgba
    if (!pscreen)
       return VDP_STATUS_RESOURCES;
 
-   format = FormatRGBAToPipe(surface_rgba_format);
+   format = VdpFormatRGBAToPipe(surface_rgba_format);
    if (format == PIPE_FORMAT_NONE || format == PIPE_FORMAT_A8_UNORM)
       return VDP_STATUS_INVALID_RGBA_FORMAT;
 
@@ -276,7 +276,7 @@ vlVdpOutputSurfaceQueryGetPutBitsNativeCapabilities(VdpDevice device, VdpRGBAFor
    if (!pscreen)
       return VDP_STATUS_ERROR;
 
-   format = FormatRGBAToPipe(surface_rgba_format);
+   format = VdpFormatRGBAToPipe(surface_rgba_format);
    if (format == PIPE_FORMAT_NONE || format == PIPE_FORMAT_A8_UNORM)
       return VDP_STATUS_INVALID_RGBA_FORMAT;
 
@@ -317,7 +317,7 @@ vlVdpOutputSurfaceQueryPutBitsIndexedCapabilities(VdpDevice device,
    if (!pscreen)
       return VDP_STATUS_ERROR;
 
-   rgba_format = FormatRGBAToPipe(surface_rgba_format);
+   rgba_format = VdpFormatRGBAToPipe(surface_rgba_format);
    if (rgba_format == PIPE_FORMAT_NONE || rgba_format == PIPE_FORMAT_A8_UNORM)
       return VDP_STATUS_INVALID_RGBA_FORMAT;
 
@@ -376,7 +376,7 @@ vlVdpOutputSurfaceQueryPutBitsYCbCrCapabilities(VdpDevice device, VdpRGBAFormat
    if (!pscreen)
       return VDP_STATUS_ERROR;
 
-   rgba_format = FormatRGBAToPipe(surface_rgba_format);
+   rgba_format = VdpFormatRGBAToPipe(surface_rgba_format);
    if (rgba_format == PIPE_FORMAT_NONE || rgba_format == PIPE_FORMAT_A8_UNORM)
       return VDP_STATUS_INVALID_RGBA_FORMAT;
 
@@ -424,7 +424,7 @@ vlVdpBitmapSurfaceQueryCapabilities(VdpDevice device, VdpRGBAFormat surface_rgba
    if (!pscreen)
       return VDP_STATUS_RESOURCES;
 
-   format = FormatRGBAToPipe(surface_rgba_format);
+   format = VdpFormatRGBAToPipe(surface_rgba_format);
    if (format == PIPE_FORMAT_NONE)
       return VDP_STATUS_INVALID_RGBA_FORMAT;
 
diff --git a/src/gallium/state_trackers/vdpau/vdpau_private.h b/src/gallium/state_trackers/vdpau/vdpau_private.h
index 27ac44cd9c1..0d9c1e9f3d0 100644
--- a/src/gallium/state_trackers/vdpau/vdpau_private.h
+++ b/src/gallium/state_trackers/vdpau/vdpau_private.h
@@ -37,6 +37,7 @@
 #include "pipe/p_video_codec.h"
 
 #include "state_tracker/vdpau_interop.h"
+#include "state_tracker/vdpau_funcs.h"
 
 #include "util/u_debug.h"
 #include "util/u_rect.h"
@@ -161,27 +162,6 @@ PipeToFormatYCBCR(enum pipe_format p_format)
    return -1;
 }
 
-static inline enum pipe_format
-FormatRGBAToPipe(VdpRGBAFormat vdpau_format)
-{
-   switch (vdpau_format) {
-      case VDP_RGBA_FORMAT_A8:
-         return PIPE_FORMAT_A8_UNORM;
-      case VDP_RGBA_FORMAT_B10G10R10A2:
-         return PIPE_FORMAT_B10G10R10A2_UNORM;
-      case VDP_RGBA_FORMAT_B8G8R8A8:
-         return PIPE_FORMAT_B8G8R8A8_UNORM;
-      case VDP_RGBA_FORMAT_R10G10B10A2:
-         return PIPE_FORMAT_R10G10B10A2_UNORM;
-      case VDP_RGBA_FORMAT_R8G8B8A8:
-         return PIPE_FORMAT_R8G8B8A8_UNORM;
-      default:
-         assert(0);
-   }
-
-   return PIPE_FORMAT_NONE;
-}
-
 static inline VdpRGBAFormat
 PipeToFormatRGBA(enum pipe_format p_format)
 {

From bdeb22b7b6204cf7a0eaab123118e2522a9abcd7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Thu, 14 Jan 2016 16:45:29 +0100
Subject: [PATCH 112/238] st/vdpau: implement the new DMA-buf based interop v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

That should allow us to get away from passing internal structures around.

v2: rebased

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Leo Liu <leo.liu@amd.com>
---
 src/gallium/state_trackers/vdpau/ftab.c       |  6 +-
 src/gallium/state_trackers/vdpau/output.c     | 41 ++++++++++-
 src/gallium/state_trackers/vdpau/surface.c    | 69 +++++++++++++++++++
 .../state_trackers/vdpau/vdpau_private.h      |  3 +
 4 files changed, 116 insertions(+), 3 deletions(-)

diff --git a/src/gallium/state_trackers/vdpau/ftab.c b/src/gallium/state_trackers/vdpau/ftab.c
index add465983e5..901a444f1c7 100644
--- a/src/gallium/state_trackers/vdpau/ftab.c
+++ b/src/gallium/state_trackers/vdpau/ftab.c
@@ -107,10 +107,12 @@ static void* ftab_winsys[1] =
    &vlVdpPresentationQueueTargetCreateX11  /* VDP_FUNC_ID_PRESENTATION_QUEUE_TARGET_CREATE_X11 */
 };
 
-static void* ftab_driver[2] =
+static void* ftab_driver[4] =
 {
    &vlVdpVideoSurfaceGallium, /* VDP_FUNC_ID_SURFACE_GALLIUM */
-   &vlVdpOutputSurfaceGallium /* VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM */
+   &vlVdpOutputSurfaceGallium, /* VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM */
+   &vlVdpVideoSurfaceDMABuf, /* VDP_FUNC_ID_VIDEO_SURFACE_DMA_BUF */
+   &vlVdpOutputSurfaceDMABuf /* VDP_FUNC_ID_OUTPUT_SURFACE_DMA_BUF */
 };
 
 boolean vlGetFuncFTAB(VdpFuncId function_id, void **func)
diff --git a/src/gallium/state_trackers/vdpau/output.c b/src/gallium/state_trackers/vdpau/output.c
index 738e7c74f6b..c644cc8ba85 100644
--- a/src/gallium/state_trackers/vdpau/output.c
+++ b/src/gallium/state_trackers/vdpau/output.c
@@ -36,6 +36,8 @@
 
 #include "vl/vl_csc.h"
 
+#include "state_tracker/drm_driver.h"
+
 #include "vdpau_private.h"
 
 /**
@@ -80,7 +82,7 @@ vlVdpOutputSurfaceCreate(VdpDevice device,
    res_tmpl.depth0 = 1;
    res_tmpl.array_size = 1;
    res_tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET |
-                   PIPE_BIND_LINEAR;
+                   PIPE_BIND_LINEAR | PIPE_BIND_SHARED;
    res_tmpl.usage = PIPE_USAGE_DEFAULT;
 
    pipe_mutex_lock(dev->mutex);
@@ -764,3 +766,40 @@ struct pipe_resource *vlVdpOutputSurfaceGallium(VdpOutputSurface surface)
 
    return vlsurface->surface->texture;
 }
+
+VdpStatus vlVdpOutputSurfaceDMABuf(VdpOutputSurface surface,
+                                   struct VdpSurfaceDMABufDesc *result)
+{
+   vlVdpOutputSurface *vlsurface = vlGetDataHTAB(surface);
+   struct pipe_screen *pscreen;
+   struct winsys_handle whandle;
+
+   if (!result)
+      return VDP_STATUS_INVALID_POINTER;
+
+   memset(result, 0, sizeof(*result));
+   result->handle = -1;
+
+   if (!vlsurface || !vlsurface->surface)
+      return VDP_STATUS_INVALID_HANDLE;
+
+   pipe_mutex_lock(vlsurface->device->mutex);
+   vlVdpResolveDelayedRendering(vlsurface->device, NULL, NULL);
+   vlsurface->device->context->flush(vlsurface->device->context, NULL, 0);
+   pipe_mutex_unlock(vlsurface->device->mutex);
+
+   memset(&whandle, 0, sizeof(struct winsys_handle));
+   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   pscreen = vlsurface->surface->texture->screen;
+   if (!pscreen->resource_get_handle(pscreen, vlsurface->surface->texture, &whandle,
+				     PIPE_HANDLE_USAGE_READ_WRITE))
+      return VDP_STATUS_NO_IMPLEMENTATION;
+
+   result->handle = whandle.handle;
+   result->width = vlsurface->surface->width;
+   result->height = vlsurface->surface->height;
+   result->offset = whandle.offset;
+   result->stride = whandle.stride;
+   result->format = PipeToFormatRGBA(vlsurface->surface->format);
+   return VDP_STATUS_OK;
+}
diff --git a/src/gallium/state_trackers/vdpau/surface.c b/src/gallium/state_trackers/vdpau/surface.c
index ffcedc12de6..0550141b597 100644
--- a/src/gallium/state_trackers/vdpau/surface.c
+++ b/src/gallium/state_trackers/vdpau/surface.c
@@ -37,6 +37,8 @@
 #include "util/u_video.h"
 #include "vl/vl_defines.h"
 
+#include "state_tracker/drm_driver.h"
+
 #include "vdpau_private.h"
 
 enum getbits_conversion {
@@ -412,3 +414,70 @@ struct pipe_video_buffer *vlVdpVideoSurfaceGallium(VdpVideoSurface surface)
 
    return p_surf->video_buffer;
 }
+
+/* Fill *result with a DMA-buf description of one plane of a video surface. */
+VdpStatus vlVdpVideoSurfaceDMABuf(VdpVideoSurface surface,
+                                  VdpVideoSurfacePlane plane,
+                                  struct VdpSurfaceDMABufDesc *result)
+{
+   vlVdpSurface *p_surf = vlGetDataHTAB(surface);
+   struct pipe_screen *pscreen;
+   struct winsys_handle whandle;
+   struct pipe_surface *surf;
+
+   /* Reject a NULL result before the memset below dereferences it. */
+   if (!result)
+      return VDP_STATUS_INVALID_POINTER;
+
+   memset(result, 0, sizeof(*result));
+   result->handle = -1;
+
+   if (!p_surf)
+      return VDP_STATUS_INVALID_HANDLE;
+
+   if (plane > 3)
+      return VDP_STATUS_INVALID_VALUE;
+
+   pipe_mutex_lock(p_surf->device->mutex);
+   if (p_surf->video_buffer == NULL) {
+      struct pipe_context *pipe = p_surf->device->context;
+
+      /* try to create a video buffer if we don't already have one */
+      p_surf->video_buffer = pipe->create_video_buffer(pipe, &p_surf->templat);
+   }
+
+   /* Interop requires an interlaced NV12 buffer; reject anything else */
+   if (p_surf->video_buffer == NULL || !p_surf->video_buffer->interlaced ||
+       p_surf->video_buffer->buffer_format != PIPE_FORMAT_NV12) {
+      pipe_mutex_unlock(p_surf->device->mutex);
+      return VDP_STATUS_NO_IMPLEMENTATION;
+   }
+
+   surf = p_surf->video_buffer->get_surfaces(p_surf->video_buffer)[plane];
+   pipe_mutex_unlock(p_surf->device->mutex);
+
+   if (!surf)
+      return VDP_STATUS_RESOURCES;
+
+   memset(&whandle, 0, sizeof(struct winsys_handle));
+   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   whandle.layer = surf->u.tex.first_layer;
+
+   pscreen = surf->texture->screen;
+   if (!pscreen->resource_get_handle(pscreen, surf->texture, &whandle,
+				     PIPE_HANDLE_USAGE_READ_WRITE))
+      return VDP_STATUS_NO_IMPLEMENTATION;
+
+   result->handle = whandle.handle;
+   result->width = surf->width;
+   result->height = surf->height;
+   result->offset = whandle.offset;
+   result->stride = whandle.stride;
+
+   if (surf->format == PIPE_FORMAT_R8_UNORM)
+      result->format = VDP_RGBA_FORMAT_R8;
+   else
+      result->format = VDP_RGBA_FORMAT_R8G8;
+
+   return VDP_STATUS_OK;
+}
diff --git a/src/gallium/state_trackers/vdpau/vdpau_private.h b/src/gallium/state_trackers/vdpau/vdpau_private.h
index 0d9c1e9f3d0..3b6647e9975 100644
--- a/src/gallium/state_trackers/vdpau/vdpau_private.h
+++ b/src/gallium/state_trackers/vdpau/vdpau_private.h
@@ -37,6 +37,7 @@
 #include "pipe/p_video_codec.h"
 
 #include "state_tracker/vdpau_interop.h"
+#include "state_tracker/vdpau_dmabuf.h"
 #include "state_tracker/vdpau_funcs.h"
 
 #include "util/u_debug.h"
@@ -522,6 +523,8 @@ VdpPresentationQueueTargetCreateX11 vlVdpPresentationQueueTargetCreateX11;
 /* interop to mesa state tracker */
 VdpVideoSurfaceGallium vlVdpVideoSurfaceGallium;
 VdpOutputSurfaceGallium vlVdpOutputSurfaceGallium;
+VdpVideoSurfaceDMABuf vlVdpVideoSurfaceDMABuf;
+VdpOutputSurfaceDMABuf vlVdpOutputSurfaceDMABuf;
 
 #define VDPAU_OUT   0
 #define VDPAU_ERR   1

From cc68dc2b5e3c603580f70d682b0772d179b28ce3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Thu, 14 Jan 2016 16:46:57 +0100
Subject: [PATCH 113/238] st/mesa: implement new DMA-buf based VDPAU interop v2
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avoid using internal structures from another API.

v2: rebased and moved includes so they don't cause problems when VDPAU isn't installed.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Marek Olšák <marek.olsak@amd.com> (v1)
Reviewed-by: Leo Liu <leo.liu@amd.com>
---
 src/mesa/state_tracker/st_vdpau.c | 179 ++++++++++++++++++++++--------
 1 file changed, 131 insertions(+), 48 deletions(-)

diff --git a/src/mesa/state_tracker/st_vdpau.c b/src/mesa/state_tracker/st_vdpau.c
index 71dd15bc4fe..b9abebfc7bf 100644
--- a/src/mesa/state_tracker/st_vdpau.c
+++ b/src/mesa/state_tracker/st_vdpau.c
@@ -39,8 +39,6 @@
 #include "pipe/p_state.h"
 #include "pipe/p_video_codec.h"
 
-#include "state_tracker/vdpau_interop.h"
-
 #include "util/u_inlines.h"
 
 #include "st_vdpau.h"
@@ -51,70 +49,155 @@
 
 #ifdef HAVE_ST_VDPAU
 
+#include "state_tracker/vdpau_interop.h"
+#include "state_tracker/vdpau_dmabuf.h"
+#include "state_tracker/vdpau_funcs.h"
+#include "state_tracker/drm_driver.h"
+
+static struct pipe_resource *
+st_vdpau_video_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface,
+                               GLuint index)
+{
+   int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
+   uint32_t device = (uintptr_t)ctx->vdpDevice;
+   struct pipe_sampler_view *sv;
+   VdpVideoSurfaceGallium *f;
+
+   struct pipe_video_buffer *buffer;
+   struct pipe_sampler_view **samplers;
+
+   getProcAddr = (void *)ctx->vdpGetProcAddress;
+   if (getProcAddr(device, VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM, (void**)&f))
+      return NULL;
+
+   buffer = f((uintptr_t)vdpSurface);
+   if (!buffer)
+      return NULL;
+
+   samplers = buffer->get_sampler_view_planes(buffer);
+   if (!samplers)
+      return NULL;
+
+   sv = samplers[index >> 1];
+   if (!sv)
+      return NULL;
+
+   return sv->texture;
+}
+
+static struct pipe_resource *
+st_vdpau_output_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface)
+{
+   int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
+   uint32_t device = (uintptr_t)ctx->vdpDevice;
+   VdpOutputSurfaceGallium *f;
+
+   getProcAddr = (void *)ctx->vdpGetProcAddress;
+   if (getProcAddr(device, VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM, (void**)&f))
+      return NULL;
+
+   return f((uintptr_t)vdpSurface);
+}
+
+static struct pipe_resource *
+st_vdpau_resource_from_description(struct gl_context *ctx,
+                                   const struct VdpSurfaceDMABufDesc *desc)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_resource templ, *res;
+   struct winsys_handle whandle;
+
+   if (desc->handle == -1)
+      return NULL;
+
+   memset(&templ, 0, sizeof(templ));
+   templ.target = PIPE_TEXTURE_2D;
+   templ.last_level = 0;
+   templ.depth0 = 1;
+   templ.array_size = 1;
+   templ.width0 = desc->width;
+   templ.height0 = desc->height;
+   templ.format = VdpFormatRGBAToPipe(desc->format);
+   templ.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
+   templ.usage = PIPE_USAGE_DEFAULT;
+
+   memset(&whandle, 0, sizeof(whandle));
+   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   whandle.handle = desc->handle;
+   whandle.offset = desc->offset;
+   whandle.stride = desc->stride;
+
+   res = st->pipe->screen->resource_from_handle(st->pipe->screen, &templ, &whandle,
+						PIPE_HANDLE_USAGE_READ_WRITE);
+   close(desc->handle);
+
+   return res;
+}
+
+static struct pipe_resource *
+st_vdpau_output_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface)
+{
+   int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
+   uint32_t device = (uintptr_t)ctx->vdpDevice;
+
+   struct VdpSurfaceDMABufDesc desc;
+   VdpOutputSurfaceDMABuf *f;
+
+   getProcAddr = (void *)ctx->vdpGetProcAddress;
+   if (getProcAddr(device, VDP_FUNC_ID_OUTPUT_SURFACE_DMA_BUF, (void**)&f))
+      return NULL;
+
+   if (f((uintptr_t)vdpSurface, &desc) != VDP_STATUS_OK)
+      return NULL;
+
+   return st_vdpau_resource_from_description(ctx, &desc);
+}
+
+static struct pipe_resource *
+st_vdpau_video_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface,
+                               GLuint index)
+{
+   int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
+   uint32_t device = (uintptr_t)ctx->vdpDevice;
+
+   struct VdpSurfaceDMABufDesc desc;
+   VdpVideoSurfaceDMABuf *f;
+
+   getProcAddr = (void *)ctx->vdpGetProcAddress;
+   if (getProcAddr(device, VDP_FUNC_ID_VIDEO_SURFACE_DMA_BUF, (void**)&f))
+      return NULL;
+
+   if (f((uintptr_t)vdpSurface, index, &desc) != VDP_STATUS_OK)
+      return NULL;
+
+   return st_vdpau_resource_from_description(ctx, &desc);
+}
+
 static void
 st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access,
                      GLboolean output, struct gl_texture_object *texObj,
                      struct gl_texture_image *texImage,
                      const GLvoid *vdpSurface, GLuint index)
 {
-   int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
-   uint32_t device = (uintptr_t)ctx->vdpDevice;
-
    struct st_context *st = st_context(ctx);
    struct st_texture_object *stObj = st_texture_object(texObj);
    struct st_texture_image *stImage = st_texture_image(texImage);
- 
+
    struct pipe_resource *res;
    struct pipe_sampler_view templ, **sampler_view;
    mesa_format texFormat;
 
-   getProcAddr = (void *)ctx->vdpGetProcAddress;
    if (output) {
-      VdpOutputSurfaceGallium *f;
-      
-      if (getProcAddr(device, VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM, (void**)&f)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
-         return;
-      }
+      res = st_vdpau_output_surface_dma_buf(ctx, vdpSurface);
 
-      res = f((uintptr_t)vdpSurface);
-
-      if (!res) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
-         return;
-      }
+      if (!res)
+         res = st_vdpau_output_surface_gallium(ctx, vdpSurface);
 
    } else {
-      struct pipe_sampler_view *sv;
-      VdpVideoSurfaceGallium *f;
+      res = st_vdpau_video_surface_dma_buf(ctx, vdpSurface, index);
 
-      struct pipe_video_buffer *buffer;
-      struct pipe_sampler_view **samplers;
-
-      if (getProcAddr(device, VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM, (void**)&f)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
-         return;
-      }
-
-      buffer = f((uintptr_t)vdpSurface);
-      if (!buffer) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
-         return;
-      }
-
-      samplers = buffer->get_sampler_view_planes(buffer);
-      if (!samplers) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
-         return;
-      }
-
-      sv = samplers[index >> 1];
-      if (!sv) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
-         return;
-      }
-
-      res = sv->texture;
+      if (!res)
+         res = st_vdpau_video_surface_gallium(ctx, vdpSurface, index);
    }
 
    if (!res) {

From 9d57c84994abe45133382cf72ae617570bfe89da Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 29 Mar 2016 19:27:49 +0200
Subject: [PATCH 114/238] nvc0/ir: move load/store lowering pass to
 handleLDST()

Having all this code in a big switch is not really a good practice.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 114 +++++++++---------
 .../nouveau/codegen/nv50_ir_lowering_nvc0.h   |   1 +
 2 files changed, 61 insertions(+), 54 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 68a30ecb8d7..850147b62e9 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1691,6 +1691,65 @@ NVC0LoweringPass::handleWRSV(Instruction *i)
    return true;
 }
 
+void
+NVC0LoweringPass::handleLDST(Instruction *i)
+{
+   if (i->src(0).getFile() == FILE_SHADER_INPUT) {
+      if (prog->getType() == Program::TYPE_COMPUTE) {
+         i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
+         i->getSrc(0)->reg.fileIndex = 0;
+      } else
+      if (prog->getType() == Program::TYPE_GEOMETRY &&
+          i->src(0).isIndirect(0)) {
+         // XXX: this assumes vec4 units
+         Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                                 i->getIndirect(0, 0), bld.mkImm(4));
+         i->setIndirect(0, 0, ptr);
+         i->op = OP_VFETCH;
+      } else {
+         i->op = OP_VFETCH;
+         assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
+      }
+   } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
+      if (i->src(0).isIndirect(1)) {
+         Value *ptr;
+         if (i->src(0).isIndirect(0))
+            ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
+                             i->getIndirect(0, 1), bld.mkImm(0x1010),
+                             i->getIndirect(0, 0));
+         else
+            ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                             i->getIndirect(0, 1), bld.mkImm(16));
+         i->setIndirect(0, 1, NULL);
+         i->setIndirect(0, 0, ptr);
+         i->subOp = NV50_IR_SUBOP_LDC_IS;
+      }
+   } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
+      assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
+      i->op = OP_VFETCH;
+   } else if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
+      Value *ind = i->getIndirect(0, 1);
+      Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
+      // XXX come up with a way not to do this for EVERY little access but
+      // rather to batch these up somehow. Unfortunately we've lost the
+      // information about the field width by the time we get here.
+      Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
+      Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
+      Value *pred = new_LValue(func, FILE_PREDICATE);
+      if (i->src(0).isIndirect(0)) {
+         bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
+         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
+      }
+      i->setIndirect(0, 1, NULL);
+      i->setIndirect(0, 0, ptr);
+      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
+      i->setPredicate(CC_NOT_P, pred);
+      if (i->defExists(0)) {
+         bld.mkMov(i->getDef(0), bld.mkImm(0));
+      }
+   }
+}
+
 void
 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
 {
@@ -2016,60 +2075,7 @@ NVC0LoweringPass::visit(Instruction *i)
       return handleWRSV(i);
    case OP_STORE:
    case OP_LOAD:
-      if (i->src(0).getFile() == FILE_SHADER_INPUT) {
-         if (prog->getType() == Program::TYPE_COMPUTE) {
-            i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
-            i->getSrc(0)->reg.fileIndex = 0;
-         } else
-         if (prog->getType() == Program::TYPE_GEOMETRY &&
-             i->src(0).isIndirect(0)) {
-            // XXX: this assumes vec4 units
-            Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
-                                    i->getIndirect(0, 0), bld.mkImm(4));
-            i->setIndirect(0, 0, ptr);
-            i->op = OP_VFETCH;
-         } else {
-            i->op = OP_VFETCH;
-            assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
-         }
-      } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
-         if (i->src(0).isIndirect(1)) {
-            Value *ptr;
-            if (i->src(0).isIndirect(0))
-               ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
-                                i->getIndirect(0, 1), bld.mkImm(0x1010),
-                                i->getIndirect(0, 0));
-            else
-               ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
-                                i->getIndirect(0, 1), bld.mkImm(16));
-            i->setIndirect(0, 1, NULL);
-            i->setIndirect(0, 0, ptr);
-            i->subOp = NV50_IR_SUBOP_LDC_IS;
-         }
-      } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
-         assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
-         i->op = OP_VFETCH;
-      } else if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
-         Value *ind = i->getIndirect(0, 1);
-         Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
-         // XXX come up with a way not to do this for EVERY little access but
-         // rather to batch these up somehow. Unfortunately we've lost the
-         // information about the field width by the time we get here.
-         Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
-         Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
-         Value *pred = new_LValue(func, FILE_PREDICATE);
-         if (i->src(0).isIndirect(0)) {
-            bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
-            bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
-         }
-         i->setIndirect(0, 1, NULL);
-         i->setIndirect(0, 0, ptr);
-         bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
-         i->setPredicate(CC_NOT_P, pred);
-         if (i->defExists(0)) {
-            bld.mkMov(i->getDef(0), bld.mkImm(0));
-         }
-      }
+      handleLDST(i);
       break;
    case OP_ATOM:
    {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index d2cb23f45d5..be81d29eb0a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -106,6 +106,7 @@ protected:
    bool handleCasExch(Instruction *, bool needCctl);
    void handleSurfaceOpNVE4(TexInstruction *);
    void handleSharedATOM(Instruction *);
+   void handleLDST(Instruction *);
 
    void checkPredicate(Instruction *);
 

From 96e0894106b25eec2bab4545566d832846e99034 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 29 Mar 2016 13:34:36 -0600
Subject: [PATCH 115/238] svga: avoid freeing non-malloced memory

svga_shader_expand() will fall back to using non-malloced memory for
emit.buf if malloc fails. We should check if the memory is malloced
before freeing it in the error path of svga_tgsi_vgpu9_translate.

Original patch by Thomas Hindoe Paaboel Andersen <phomes@gmail.com>.
Remove trivial svga_destroy_shader_emitter() function, by BrianP.

Signed-off-by: Brian Paul <brianp@vmware.com>
---
 src/gallium/drivers/svga/svga_tgsi.c | 12 ++----------
 1 file changed, 2 insertions(+), 10 deletions(-)

diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c
index c62d4d671ef..7396ad08e27 100644
--- a/src/gallium/drivers/svga/svga_tgsi.c
+++ b/src/gallium/drivers/svga/svga_tgsi.c
@@ -50,15 +50,6 @@
  */
 static char err_buf[128];
 
-#if 0
-static void
-svga_destroy_shader_emitter(struct svga_shader_emitter *emit)
-{
-   if (emit->buf != err_buf)
-      FREE(emit->buf);
-}
-#endif
-
 
 static boolean
 svga_shader_expand(struct svga_shader_emitter *emit)
@@ -265,6 +256,7 @@ svga_tgsi_vgpu9_translate(struct svga_context *svga,
 
  fail:
    FREE(variant);
-   FREE(emit.buf);
+   if (emit.buf != err_buf)
+      FREE(emit.buf);
    return NULL;
 }

From 2d8df0306b45163eede94421948e9b1ae05e47bd Mon Sep 17 00:00:00 2001
From: Charmaine Lee <charmainel@vmware.com>
Date: Tue, 29 Mar 2016 13:34:36 -0600
Subject: [PATCH 116/238] svga: emit sampler declarations in the helper
 function for non vgpu10

With commit dc9ecf58c0c5c8a97cd41362e78c2fcd9f6e3b80,
we are now getting the sampler target from the sampler view
declaration. But since a sampler view declaration can be defined
after a sampler declaration, we need to emit the
sampler declarations in the pre-helpers function, otherwise,
the sampler target might not have been defined yet for the sampler declaration.

Fixes viewperf maya-03 and various gl trace regressions in hwv11.

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 .../drivers/svga/svga_tgsi_decl_sm30.c        | 19 ++++++++++++++++---
 src/gallium/drivers/svga/svga_tgsi_emit.h     |  4 ++++
 src/gallium/drivers/svga/svga_tgsi_insn.c     |  3 +++
 3 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
index 204b814a964..418f898e0e3 100644
--- a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
+++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
@@ -535,7 +535,6 @@ svga_tgsi_sampler_type(const struct svga_shader_emitter *emit, int idx)
 
 static boolean
 ps30_sampler( struct svga_shader_emitter *emit,
-              struct tgsi_declaration_semantic semantic,
               unsigned idx )
 {
    SVGA3DOpDclArgs dcl;
@@ -553,6 +552,17 @@ ps30_sampler( struct svga_shader_emitter *emit,
            svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values)));
 }
 
+boolean
+svga_shader_emit_samplers_decl( struct svga_shader_emitter *emit )
+{
+   unsigned i;
+
+   for (i = 0; i < emit->num_samplers; i++) {
+      if (!ps30_sampler(emit, i))
+         return FALSE;
+   }
+   return TRUE;
+}
 
 boolean
 svga_translate_decl_sm30( struct svga_shader_emitter *emit,
@@ -563,12 +573,15 @@ svga_translate_decl_sm30( struct svga_shader_emitter *emit,
    unsigned idx;
 
    for( idx = first; idx <= last; idx++ ) {
-      boolean ok;
+      boolean ok = TRUE;
 
       switch (decl->Declaration.File) {
       case TGSI_FILE_SAMPLER:
          assert (emit->unit == PIPE_SHADER_FRAGMENT);
-         ok = ps30_sampler( emit, decl->Semantic, idx );
+         /* just keep track of the number of samplers here.
+          * Will emit the declaration in the helpers function.
+          */
+         emit->num_samplers = MAX2(emit->num_samplers, decl->Range.Last + 1);
          break;
 
       case TGSI_FILE_INPUT:
diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h
index 7a593ba6e9d..114c9563e2b 100644
--- a/src/gallium/drivers/svga/svga_tgsi_emit.h
+++ b/src/gallium/drivers/svga/svga_tgsi_emit.h
@@ -137,6 +137,7 @@ struct svga_shader_emitter
 
    unsigned pstipple_sampler_unit;
 
+   int num_samplers;
    uint8_t sampler_target[PIPE_MAX_SAMPLERS];
 };
 
@@ -156,6 +157,9 @@ boolean
 svga_shader_emit_instructions(struct svga_shader_emitter *emit,
                               const struct tgsi_token *tokens);
 
+boolean
+svga_shader_emit_samplers_decl(struct svga_shader_emitter *emit);
+
 boolean
 svga_translate_decl_sm30(struct svga_shader_emitter *emit,
                          const struct tgsi_full_declaration *decl);
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 3188c411863..bedda2ecf71 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -3797,6 +3797,9 @@ svga_shader_emit_helpers(struct svga_shader_emitter *emit)
    }
 
    if (emit->unit == PIPE_SHADER_FRAGMENT) {
+      if (!svga_shader_emit_samplers_decl( emit ))
+         return FALSE;
+
       if (!emit_ps_preamble( emit ))
          return FALSE;
 

From 7087e0ab27da5eaad10a18ddba3234960a2b5524 Mon Sep 17 00:00:00 2001
From: Rovanion Luckey <rovanion.luckey@gmail.com>
Date: Tue, 29 Mar 2016 13:43:00 -0600
Subject: [PATCH 117/238] gallium: Format code in pb_buffer_fenced.c according
 to style guide.

This is a tiny housekeeping patch which does the following:

  * Replaced tabs with three spaces.
  * Formatted oneline and multiline code comments. Some doxygen
    comments weren't marked as such and some code comments were marked
    as doxygen comments.
  * Spaces between if- and while-statements and their parentheses.

All changes follow the Mesa coding style guidelines.

Reviewed-by: Brian Paul <brianp@vmware.com>
---
 .../auxiliary/pipebuffer/pb_buffer_fenced.c   | 228 ++++++++----------
 1 file changed, 98 insertions(+), 130 deletions(-)

diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index 2678268e923..fbbe8d11eb0 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -108,14 +108,14 @@ struct fenced_manager
  */
 struct fenced_buffer
 {
-   /*
+   /**
     * Immutable members.
     */
 
    struct pb_buffer base;
    struct fenced_manager *mgr;
 
-   /*
+   /**
     * Following members are mutable and protected by fenced_manager::mutex.
     */
 
@@ -205,7 +205,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
 
    curr = fenced_mgr->unfenced.next;
    next = curr->next;
-   while(curr != &fenced_mgr->unfenced) {
+   while (curr != &fenced_mgr->unfenced) {
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
       assert(!fenced_buf->fence);
       debug_printf("%10p %7u %8u %7s\n",
@@ -219,7 +219,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
 
    curr = fenced_mgr->fenced.next;
    next = curr->next;
-   while(curr != &fenced_mgr->fenced) {
+   while (curr != &fenced_mgr->fenced) {
       int signaled;
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
       assert(fenced_buf->buffer);
@@ -340,7 +340,7 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
    assert(pipe_is_referenced(&fenced_buf->base.reference));
    assert(fenced_buf->fence);
 
-   if(fenced_buf->fence) {
+   if (fenced_buf->fence) {
       struct pipe_fence_handle *fence = NULL;
       int finished;
       boolean proceed;
@@ -355,8 +355,7 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
 
       assert(pipe_is_referenced(&fenced_buf->base.reference));
 
-      /*
-       * Only proceed if the fence object didn't change in the meanwhile.
+      /* Only proceed if the fence object didn't change in the meanwhile.
        * Otherwise assume the work has been already carried out by another
        * thread that re-aquired the lock before us.
        */
@@ -364,14 +363,9 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
 
       ops->fence_reference(ops, &fence, NULL);
 
-      if(proceed && finished == 0) {
-         /*
-          * Remove from the fenced list
-          */
-
-         boolean destroyed;
-
-         destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
+      if (proceed && finished == 0) {
+         /* Remove from the fenced list. */
+         boolean destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
 
          /* TODO: remove consequents buffers with the same fence? */
 
@@ -405,36 +399,33 @@ fenced_manager_check_signalled_locked(struct fenced_manager *fenced_mgr,
 
    curr = fenced_mgr->fenced.next;
    next = curr->next;
-   while(curr != &fenced_mgr->fenced) {
+   while (curr != &fenced_mgr->fenced) {
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
 
-      if(fenced_buf->fence != prev_fence) {
-	 int signaled;
+      if (fenced_buf->fence != prev_fence) {
+         int signaled;
 
-	 if (wait) {
-	    signaled = ops->fence_finish(ops, fenced_buf->fence, 0);
+         if (wait) {
+            signaled = ops->fence_finish(ops, fenced_buf->fence, 0);
 
-	    /*
-	     * Don't return just now. Instead preemptively check if the
-	     * following buffers' fences already expired, without further waits.
-	     */
-	    wait = FALSE;
-	 }
-	 else {
-	    signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
-	 }
-
-	 if (signaled != 0) {
-	    return ret;
+            /* Don't return just now. Instead preemptively check if the
+             * following buffers' fences already expired, without further waits.
+             */
+            wait = FALSE;
+         } else {
+            signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
          }
 
-	 prev_fence = fenced_buf->fence;
-      }
-      else {
+         if (signaled != 0) {
+            return ret;
+         }
+
+         prev_fence = fenced_buf->fence;
+      } else {
          /* This buffer's fence object is identical to the previous buffer's
           * fence object, so no need to check the fence again.
           */
-	 assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0);
+         assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0);
       }
 
       fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
@@ -462,22 +453,21 @@ fenced_manager_free_gpu_storage_locked(struct fenced_manager *fenced_mgr)
 
    curr = fenced_mgr->unfenced.next;
    next = curr->next;
-   while(curr != &fenced_mgr->unfenced) {
+   while (curr != &fenced_mgr->unfenced) {
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
 
-      /*
-       * We can only move storage if the buffer is not mapped and not
+      /* We can only move storage if the buffer is not mapped and not
        * validated.
        */
-      if(fenced_buf->buffer &&
+      if (fenced_buf->buffer &&
          !fenced_buf->mapcount &&
          !fenced_buf->vl) {
          enum pipe_error ret;
 
          ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf);
-         if(ret == PIPE_OK) {
+         if (ret == PIPE_OK) {
             ret = fenced_buffer_copy_storage_to_cpu_locked(fenced_buf);
-            if(ret == PIPE_OK) {
+            if (ret == PIPE_OK) {
                fenced_buffer_destroy_gpu_storage_locked(fenced_buf);
                return TRUE;
             }
@@ -499,7 +489,7 @@ fenced_manager_free_gpu_storage_locked(struct fenced_manager *fenced_mgr)
 static void
 fenced_buffer_destroy_cpu_storage_locked(struct fenced_buffer *fenced_buf)
 {
-   if(fenced_buf->data) {
+   if (fenced_buf->data) {
       align_free(fenced_buf->data);
       fenced_buf->data = NULL;
       assert(fenced_buf->mgr->cpu_total_size >= fenced_buf->size);
@@ -516,14 +506,14 @@ fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr,
                                         struct fenced_buffer *fenced_buf)
 {
    assert(!fenced_buf->data);
-   if(fenced_buf->data)
+   if (fenced_buf->data)
       return PIPE_OK;
 
    if (fenced_mgr->cpu_total_size + fenced_buf->size > fenced_mgr->max_cpu_total_size)
       return PIPE_ERROR_OUT_OF_MEMORY;
 
    fenced_buf->data = align_malloc(fenced_buf->size, fenced_buf->desc.alignment);
-   if(!fenced_buf->data)
+   if (!fenced_buf->data)
       return PIPE_ERROR_OUT_OF_MEMORY;
 
    fenced_mgr->cpu_total_size += fenced_buf->size;
@@ -538,7 +528,7 @@ fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr,
 static void
 fenced_buffer_destroy_gpu_storage_locked(struct fenced_buffer *fenced_buf)
 {
-   if(fenced_buf->buffer) {
+   if (fenced_buf->buffer) {
       pb_reference(&fenced_buf->buffer, NULL);
    }
 }
@@ -575,41 +565,37 @@ fenced_buffer_create_gpu_storage_locked(struct fenced_manager *fenced_mgr,
 {
    assert(!fenced_buf->buffer);
 
-   /*
-    * Check for signaled buffers before trying to allocate.
-    */
+   /* Check for signaled buffers before trying to allocate. */
    fenced_manager_check_signalled_locked(fenced_mgr, FALSE);
 
    fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
 
-   /*
-    * Keep trying while there is some sort of progress:
+   /* Keep trying while there is some sort of progress:
     * - fences are expiring,
     * - or buffers are being being swapped out from GPU memory into CPU memory.
     */
-   while(!fenced_buf->buffer &&
+   while (!fenced_buf->buffer &&
          (fenced_manager_check_signalled_locked(fenced_mgr, FALSE) ||
           fenced_manager_free_gpu_storage_locked(fenced_mgr))) {
       fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
    }
 
-   if(!fenced_buf->buffer && wait) {
-      /*
-       * Same as before, but this time around, wait to free buffers if
+   if (!fenced_buf->buffer && wait) {
+      /* Same as before, but this time around, wait to free buffers if
        * necessary.
        */
-      while(!fenced_buf->buffer &&
+      while (!fenced_buf->buffer &&
             (fenced_manager_check_signalled_locked(fenced_mgr, TRUE) ||
              fenced_manager_free_gpu_storage_locked(fenced_mgr))) {
          fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
       }
    }
 
-   if(!fenced_buf->buffer) {
-      if(0)
+   if (!fenced_buf->buffer) {
+      if (0)
          fenced_manager_dump_locked(fenced_mgr);
 
-      /* give up */
+      /* Give up. */
       return PIPE_ERROR_OUT_OF_MEMORY;
    }
 
@@ -686,18 +672,16 @@ fenced_buffer_map(struct pb_buffer *buf,
 
    assert(!(flags & PB_USAGE_GPU_READ_WRITE));
 
-   /*
-    * Serialize writes.
-    */
-   while((fenced_buf->flags & PB_USAGE_GPU_WRITE) ||
+   /* Serialize writes. */
+   while ((fenced_buf->flags & PB_USAGE_GPU_WRITE) ||
          ((fenced_buf->flags & PB_USAGE_GPU_READ) &&
           (flags & PB_USAGE_CPU_WRITE))) {
 
-      /* 
-       * Don't wait for the GPU to finish accessing it, if blocking is forbidden.
+      /* Don't wait for the GPU to finish accessing it,
+       * if blocking is forbidden.
        */
-      if((flags & PB_USAGE_DONTBLOCK) &&
-          ops->fence_signalled(ops, fenced_buf->fence, 0) != 0) {
+      if ((flags & PB_USAGE_DONTBLOCK) &&
+         ops->fence_signalled(ops, fenced_buf->fence, 0) != 0) {
          goto done;
       }
 
@@ -705,17 +689,15 @@ fenced_buffer_map(struct pb_buffer *buf,
          break;
       }
 
-      /*
-       * Wait for the GPU to finish accessing. This will release and re-acquire
+      /* Wait for the GPU to finish accessing. This will release and re-acquire
        * the mutex, so all copies of mutable state must be discarded.
        */
       fenced_buffer_finish_locked(fenced_mgr, fenced_buf);
    }
 
-   if(fenced_buf->buffer) {
+   if (fenced_buf->buffer) {
       map = pb_map(fenced_buf->buffer, flags, flush_ctx);
-   }
-   else {
+   } else {
       assert(fenced_buf->data);
       map = fenced_buf->data;
    }
@@ -725,7 +707,7 @@ fenced_buffer_map(struct pb_buffer *buf,
       fenced_buf->flags |= flags & PB_USAGE_CPU_READ_WRITE;
    }
 
-done:
+ done:
    pipe_mutex_unlock(fenced_mgr->mutex);
 
    return map;
@@ -741,12 +723,12 @@ fenced_buffer_unmap(struct pb_buffer *buf)
    pipe_mutex_lock(fenced_mgr->mutex);
 
    assert(fenced_buf->mapcount);
-   if(fenced_buf->mapcount) {
+   if (fenced_buf->mapcount) {
       if (fenced_buf->buffer)
          pb_unmap(fenced_buf->buffer);
       --fenced_buf->mapcount;
-      if(!fenced_buf->mapcount)
-	 fenced_buf->flags &= ~PB_USAGE_CPU_READ_WRITE;
+      if (!fenced_buf->mapcount)
+         fenced_buf->flags &= ~PB_USAGE_CPU_READ_WRITE;
    }
 
    pipe_mutex_unlock(fenced_mgr->mutex);
@@ -765,7 +747,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
    pipe_mutex_lock(fenced_mgr->mutex);
 
    if (!vl) {
-      /* invalidate */
+      /* Invalidate. */
       fenced_buf->vl = NULL;
       fenced_buf->validation_flags = 0;
       ret = PIPE_OK;
@@ -776,40 +758,37 @@ fenced_buffer_validate(struct pb_buffer *buf,
    assert(!(flags & ~PB_USAGE_GPU_READ_WRITE));
    flags &= PB_USAGE_GPU_READ_WRITE;
 
-   /* Buffer cannot be validated in two different lists */
-   if(fenced_buf->vl && fenced_buf->vl != vl) {
+   /* Buffer cannot be validated in two different lists. */
+   if (fenced_buf->vl && fenced_buf->vl != vl) {
       ret = PIPE_ERROR_RETRY;
       goto done;
    }
 
-   if(fenced_buf->vl == vl &&
+   if (fenced_buf->vl == vl &&
       (fenced_buf->validation_flags & flags) == flags) {
-      /* Nothing to do -- buffer already validated */
+      /* Nothing to do -- buffer already validated. */
       ret = PIPE_OK;
       goto done;
    }
 
-   /*
-    * Create and update GPU storage.
-    */
-   if(!fenced_buf->buffer) {
+   /* Create and update GPU storage. */
+   if (!fenced_buf->buffer) {
       assert(!fenced_buf->mapcount);
 
       ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE);
-      if(ret != PIPE_OK) {
+      if (ret != PIPE_OK) {
          goto done;
       }
 
       ret = fenced_buffer_copy_storage_to_gpu_locked(fenced_buf);
-      if(ret != PIPE_OK) {
+      if (ret != PIPE_OK) {
          fenced_buffer_destroy_gpu_storage_locked(fenced_buf);
          goto done;
       }
 
-      if(fenced_buf->mapcount) {
+      if (fenced_buf->mapcount) {
          debug_printf("warning: validating a buffer while it is still mapped\n");
-      }
-      else {
+      } else {
          fenced_buffer_destroy_cpu_storage_locked(fenced_buf);
       }
    }
@@ -821,7 +800,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
    fenced_buf->vl = vl;
    fenced_buf->validation_flags |= flags;
 
-done:
+ done:
    pipe_mutex_unlock(fenced_mgr->mutex);
 
    return ret;
@@ -841,13 +820,12 @@ fenced_buffer_fence(struct pb_buffer *buf,
    assert(pipe_is_referenced(&fenced_buf->base.reference));
    assert(fenced_buf->buffer);
 
-   if(fence != fenced_buf->fence) {
+   if (fence != fenced_buf->fence) {
       assert(fenced_buf->vl);
       assert(fenced_buf->validation_flags);
 
       if (fenced_buf->fence) {
-         boolean destroyed;
-         destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
+         boolean destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
          assert(!destroyed);
       }
       if (fence) {
@@ -876,16 +854,15 @@ fenced_buffer_get_base_buffer(struct pb_buffer *buf,
 
    pipe_mutex_lock(fenced_mgr->mutex);
 
-   /*
-    * This should only be called when the buffer is validated. Typically
+   /* This should only be called when the buffer is validated. Typically
     * when processing relocations.
     */
    assert(fenced_buf->vl);
    assert(fenced_buf->buffer);
 
-   if(fenced_buf->buffer)
+   if (fenced_buf->buffer) {
       pb_get_base_buffer(fenced_buf->buffer, base_buf, offset);
-   else {
+   } else {
       *base_buf = buf;
       *offset = 0;
    }
@@ -896,12 +873,12 @@ fenced_buffer_get_base_buffer(struct pb_buffer *buf,
 
 static const struct pb_vtbl
 fenced_buffer_vtbl = {
-      fenced_buffer_destroy,
-      fenced_buffer_map,
-      fenced_buffer_unmap,
-      fenced_buffer_validate,
-      fenced_buffer_fence,
-      fenced_buffer_get_base_buffer
+   fenced_buffer_destroy,
+   fenced_buffer_map,
+   fenced_buffer_unmap,
+   fenced_buffer_validate,
+   fenced_buffer_fence,
+   fenced_buffer_get_base_buffer
 };
 
 
@@ -917,12 +894,11 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
    struct fenced_buffer *fenced_buf;
    enum pipe_error ret;
 
-   /*
-    * Don't stall the GPU, waste time evicting buffers, or waste memory
+   /* Don't stall the GPU, waste time evicting buffers, or waste memory
     * trying to create a buffer that will most likely never fit into the
     * graphics aperture.
     */
-   if(size > fenced_mgr->max_buffer_size) {
+   if (size > fenced_mgr->max_buffer_size) {
       goto no_buffer;
    }
 
@@ -942,29 +918,21 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
 
    pipe_mutex_lock(fenced_mgr->mutex);
 
-   /*
-    * Try to create GPU storage without stalling,
-    */
+   /* Try to create GPU storage without stalling. */
    ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, FALSE);
 
-   /*
-    * Attempt to use CPU memory to avoid stalling the GPU.
-    */
-   if(ret != PIPE_OK) {
+   /* Attempt to use CPU memory to avoid stalling the GPU. */
+   if (ret != PIPE_OK) {
       ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf);
    }
 
-   /*
-    * Create GPU storage, waiting for some to be available.
-    */
-   if(ret != PIPE_OK) {
+   /* Create GPU storage, waiting for some to be available. */
+   if (ret != PIPE_OK) {
       ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE);
    }
 
-   /*
-    * Give up.
-    */
-   if(ret != PIPE_OK) {
+   /* Give up. */
+   if (ret != PIPE_OK) {
       goto no_storage;
    }
 
@@ -976,10 +944,10 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
 
    return &fenced_buf->base;
 
-no_storage:
+ no_storage:
    pipe_mutex_unlock(fenced_mgr->mutex);
    FREE(fenced_buf);
-no_buffer:
+ no_buffer:
    return NULL;
 }
 
@@ -990,12 +958,12 @@ fenced_bufmgr_flush(struct pb_manager *mgr)
    struct fenced_manager *fenced_mgr = fenced_manager(mgr);
 
    pipe_mutex_lock(fenced_mgr->mutex);
-   while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
+   while (fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
       ;
    pipe_mutex_unlock(fenced_mgr->mutex);
 
    assert(fenced_mgr->provider->flush);
-   if(fenced_mgr->provider->flush)
+   if (fenced_mgr->provider->flush)
       fenced_mgr->provider->flush(fenced_mgr->provider);
 }
 
@@ -1007,25 +975,25 @@ fenced_bufmgr_destroy(struct pb_manager *mgr)
 
    pipe_mutex_lock(fenced_mgr->mutex);
 
-   /* Wait on outstanding fences */
+   /* Wait on outstanding fences. */
    while (fenced_mgr->num_fenced) {
       pipe_mutex_unlock(fenced_mgr->mutex);
 #if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS)
       sched_yield();
 #endif
       pipe_mutex_lock(fenced_mgr->mutex);
-      while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
+      while (fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
          ;
    }
 
 #ifdef DEBUG
-   /*assert(!fenced_mgr->num_unfenced);*/
+   /* assert(!fenced_mgr->num_unfenced); */
 #endif
 
    pipe_mutex_unlock(fenced_mgr->mutex);
    pipe_mutex_destroy(fenced_mgr->mutex);
 
-   if(fenced_mgr->provider)
+   if (fenced_mgr->provider)
       fenced_mgr->provider->destroy(fenced_mgr->provider);
 
    fenced_mgr->ops->destroy(fenced_mgr->ops);

From f8c69fbb5491c8790dd3bcf991f06151d15d92b9 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 29 Mar 2016 15:35:07 -0700
Subject: [PATCH 118/238] Revert "i965: Set address rounding bits for
 GL_NEAREST filtering as well."

This reverts commit 60d6a8989ab44cf47accee6bc692ba6fb98f6a9f.

It's pretty sketchy, and apparently regressed a bunch of dEQP tests
on Sandybridge.
---
 src/mesa/drivers/dri/i965/brw_sampler_state.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index 7bd21f7aaf0..3bd22c7559f 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -436,17 +436,14 @@ brw_update_sampler_state(struct brw_context *brw,
       }
    }
 
-   /* Set address rounding bits.  The conditions are empirically
-    * derived in order to pass test cases.
-    */
-   bool round_nearest = brw->gen >= 6 && target != GL_TEXTURE_3D;
+   /* Set address rounding bits if not using nearest filtering. */
    unsigned address_rounding = 0;
-   if (min_filter != BRW_MAPFILTER_NEAREST || round_nearest) {
+   if (min_filter != BRW_MAPFILTER_NEAREST) {
       address_rounding |= BRW_ADDRESS_ROUNDING_ENABLE_U_MIN |
                           BRW_ADDRESS_ROUNDING_ENABLE_V_MIN |
                           BRW_ADDRESS_ROUNDING_ENABLE_R_MIN;
    }
-   if (mag_filter != BRW_MAPFILTER_NEAREST || round_nearest) {
+   if (mag_filter != BRW_MAPFILTER_NEAREST) {
       address_rounding |= BRW_ADDRESS_ROUNDING_ENABLE_U_MAG |
                           BRW_ADDRESS_ROUNDING_ENABLE_V_MAG |
                           BRW_ADDRESS_ROUNDING_ENABLE_R_MAG;

From d4a5a61d445e683c20de00c1febe847b4c2db910 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Mon, 28 Mar 2016 20:07:13 -0700
Subject: [PATCH 119/238] i965: Don't use CUBE wrap modes for integer formats
 on IVB/BYT.

There is no linear filtering for integer formats, so we should always
be using CLAMP_TO_EDGE mode.

Fixes 46 dEQP cases on Ivybridge (which were likely broken by commit
0faf26e6a0a34c3544644852802484f2404cc83e).

This workaround doesn't appear to be necessary on any other hardware;
I haven't found any documentation mentioning errata in this area.

v2: Only apply on Ivybridge/Baytrail to avoid regressing GLES3.1 tests.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net> [v1]
---
 src/mesa/drivers/dri/i965/brw_sampler_state.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index 3bd22c7559f..1dc7d71929c 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -459,8 +459,12 @@ brw_update_sampler_state(struct brw_context *brw,
        target == GL_TEXTURE_CUBE_MAP_ARRAY) {
       /* Cube maps must use the same wrap mode for all three coordinate
        * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
+       *
+       * Ivybridge and Baytrail seem to have problems with CUBE mode and
+       * integer formats.  Fall back to CLAMP for now.
        */
-      if (tex_cube_map_seamless || sampler->CubeMapSeamless) {
+      if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
+          !(brw->gen == 7 && !brw->is_haswell && is_integer_format)) {
 	 wrap_s = BRW_TEXCOORDMODE_CUBE;
 	 wrap_t = BRW_TEXCOORDMODE_CUBE;
 	 wrap_r = BRW_TEXCOORDMODE_CUBE;

From 105fe527840ed5fc55e78c4561915fbe1fded76c Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 24 Mar 2016 11:24:33 -0600
Subject: [PATCH 120/238] mesa: new _mesa_prepare_mipmap_levels() function for
 mipmap generation

Simplifies the loops in generate_mipmap_uncompressed() and
generate_mipmap_compressed().  Will be used in the state tracker too.
Could probably be used in the meta code.  If so, some additional
clean-ups can be done after that.

v2: use unsigned types instead of GLuint, per Ian

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 src/mesa/main/mipmap.c | 88 +++++++++++++++++++++++++++---------------
 src/mesa/main/mipmap.h |  5 +++
 2 files changed, 62 insertions(+), 31 deletions(-)

diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 5a02780b960..cb9afdef2af 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -1872,6 +1872,62 @@ _mesa_prepare_mipmap_level(struct gl_context *ctx,
 }
 
 
+/**
+ * Prepare all mipmap levels beyond 'baseLevel' for mipmap generation.
+ * When finished, all the gl_texture_image structures for the smaller
+ * mipmap levels will be consistent with the base level (in terms of
+ * dimensions, format, etc).
+ *
+ * Returns without doing anything if there is no image at 'baseLevel'.
+ *
+ * \param ctx        GL context, forwarded to _mesa_prepare_mipmap_level()
+ * \param texObj     texture whose levels are prepared
+ * \param baseLevel  level that supplies the reference size and format
+ * \param maxLevel   last level to prepare (inclusive)
+ */
+void
+_mesa_prepare_mipmap_levels(struct gl_context *ctx,
+                            struct gl_texture_object *texObj,
+                            unsigned baseLevel, unsigned maxLevel)
+{
+   const struct gl_texture_image *baseImage =
+      _mesa_select_tex_image(texObj, texObj->Target, baseLevel);
+
+   /* Without a base image there is nothing to size the other levels from. */
+   if (!baseImage)
+      return;
+
+   const GLint border = 0;
+   GLint width = baseImage->Width;
+   GLint height = baseImage->Height;
+   GLint depth = baseImage->Depth;
+   const GLenum intFormat = baseImage->InternalFormat;
+   const mesa_format texFormat = baseImage->TexFormat;
+   GLint newWidth, newHeight, newDepth;
+
+   /* Prepare baseLevel + 1, baseLevel + 2, ... */
+   for (unsigned level = baseLevel + 1; level <= maxLevel; level++) {
+      if (!_mesa_next_mipmap_level_size(texObj->Target, border,
+                                        width, height, depth,
+                                        &newWidth, &newHeight, &newDepth)) {
+         /* all done */
+         break;
+      }
+
+      if (!_mesa_prepare_mipmap_level(ctx, texObj, level,
+                                      newWidth, newHeight, newDepth,
+                                      border, intFormat, texFormat)) {
+         break;
+      }
+
+      /* The level just prepared is the source size for the next one. */
+      width = newWidth;
+      height = newHeight;
+      depth = newDepth;
+   }
+}
+
+
 static void
 generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target,
 			     struct gl_texture_object *texObj,
@@ -1892,7 +1935,6 @@ generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target,
       GLint dstWidth, dstHeight, dstDepth;
       GLint border;
       GLint slice;
-      GLboolean nextLevel;
       GLubyte **srcMaps, **dstMaps;
       GLboolean success = GL_TRUE;
 
@@ -1904,22 +1946,14 @@ generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target,
       srcDepth = srcImage->Depth;
       border = srcImage->Border;
 
-      nextLevel = _mesa_next_mipmap_level_size(target, border,
-                                         srcWidth, srcHeight, srcDepth,
-                                         &dstWidth, &dstHeight, &dstDepth);
-      if (!nextLevel)
-         return;
-
-      if (!_mesa_prepare_mipmap_level(ctx, texObj, level + 1,
-                                      dstWidth, dstHeight, dstDepth,
-                                      border, srcImage->InternalFormat,
-                                      srcImage->TexFormat)) {
-         return;
-      }
-
       /* get dest gl_texture_image */
       dstImage = _mesa_select_tex_image(texObj, target, level + 1);
-      assert(dstImage);
+      if (!dstImage) {
+         break;
+      }
+      dstWidth = dstImage->Width;
+      dstHeight = dstImage->Height;
+      dstDepth = dstImage->Depth;
 
       if (target == GL_TEXTURE_1D_ARRAY) {
 	 srcDepth = srcHeight;
@@ -2087,7 +2121,6 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target,
       GLint srcWidth, srcHeight, srcDepth;
       GLint dstWidth, dstHeight, dstDepth;
       GLint border;
-      GLboolean nextLevel;
       GLuint temp_dst_row_stride, temp_dst_img_stride; /* in bytes */
       GLint i;
 
@@ -2099,23 +2132,14 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target,
       srcDepth = srcImage->Depth;
       border = srcImage->Border;
 
-      nextLevel = _mesa_next_mipmap_level_size(target, border,
-                                         srcWidth, srcHeight, srcDepth,
-                                         &dstWidth, &dstHeight, &dstDepth);
-      if (!nextLevel)
-	 goto end;
-
-      if (!_mesa_prepare_mipmap_level(ctx, texObj, level + 1,
-                                      dstWidth, dstHeight, dstDepth,
-                                      border, srcImage->InternalFormat,
-                                      srcImage->TexFormat)) {
-         /* all done */
-         goto end;
-      }
-
       /* get dest gl_texture_image */
       dstImage = _mesa_select_tex_image(texObj, target, level + 1);
-      assert(dstImage);
+      if (!dstImage) {
+         break;
+      }
+      dstWidth = dstImage->Width;
+      dstHeight = dstImage->Height;
+      dstDepth = dstImage->Depth;
 
       /* Compute dst image strides and alloc memory on first iteration */
       temp_dst_row_stride = _mesa_format_row_stride(temp_format, dstWidth);
@@ -2194,6 +2218,8 @@ _mesa_generate_mipmap(struct gl_context *ctx, GLenum target,
 
    maxLevel = MIN2(maxLevel, texObj->MaxLevel);
 
+   _mesa_prepare_mipmap_levels(ctx, texObj, texObj->BaseLevel, maxLevel);
+
    if (_mesa_is_format_compressed(srcImage->TexFormat)) {
       generate_mipmap_compressed(ctx, target, texObj, srcImage, maxLevel);
    } else {
diff --git a/src/mesa/main/mipmap.h b/src/mesa/main/mipmap.h
index c0366d329a2..33913e88417 100644
--- a/src/mesa/main/mipmap.h
+++ b/src/mesa/main/mipmap.h
@@ -47,6 +47,11 @@ _mesa_prepare_mipmap_level(struct gl_context *ctx,
                            GLsizei width, GLsizei height, GLsizei depth,
                            GLsizei border, GLenum intFormat, mesa_format format);
 
+void
+_mesa_prepare_mipmap_levels(struct gl_context *ctx,
+                            struct gl_texture_object *texObj,
+                            unsigned baseLevel, unsigned maxLevel);
+
 extern void
 _mesa_generate_mipmap(struct gl_context *ctx, GLenum target,
                       struct gl_texture_object *texObj);

From d8d029f22bcb49c26ddb11d0bb968d24f2a5fb7e Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 24 Mar 2016 14:09:24 -0600
Subject: [PATCH 121/238] st/mesa: simplify st_generate_mipmap()

The whole st_generate_mipmap() function was overly complicated.  Now
we just call the new _mesa_prepare_mipmap_levels() function to prepare
the texture mipmap memory, then call the generate function which fills
in the texture images.

This fixes a failed assertion in llvmpipe/softpipe which is hit with the
new piglit generatemipmap-base-change test.  Also fixes some device errors
(format mismatches) with the VMware svga driver.

v2: fix a comment typo, per Sinclair

Reviewed-by: Sinclair Yeh <syeh@vmware.com>
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/mesa/state_tracker/st_gen_mipmap.c | 96 ++++++--------------------
 1 file changed, 21 insertions(+), 75 deletions(-)

diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index c4b3492b0d3..a14bbfabaa3 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -82,7 +82,6 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
    const uint baseLevel = texObj->BaseLevel;
    enum pipe_format format;
    uint lastLevel, first_layer, last_layer;
-   uint dstLevel;
 
    if (!pt)
       return;
@@ -103,42 +102,33 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
    stObj->lastLevel = lastLevel;
 
    if (!texObj->Immutable) {
-      if (pt->last_level < lastLevel) {
-         /* The current gallium texture doesn't have space for all the
-         * mipmap levels we need to generate.  So allocate a new texture.
-         */
-         struct pipe_resource *oldTex = stObj->pt;
+      const GLboolean genSave = texObj->GenerateMipmap;
 
-         /* create new texture with space for more levels */
-         stObj->pt = st_texture_create(st,
-                                       oldTex->target,
-                                       oldTex->format,
-                                       lastLevel,
-                                       oldTex->width0,
-                                       oldTex->height0,
-                                       oldTex->depth0,
-                                       oldTex->array_size,
-                                       0,
-                                       oldTex->bind);
+      /* Temporarily set GenerateMipmap to true so that allocate_full_mipmap()
+       * makes the right decision about full mipmap allocation.
+       */
+      texObj->GenerateMipmap = GL_TRUE;
 
-         /* This will copy the old texture's base image into the new texture
-         * which we just allocated.
-         */
-         st_finalize_texture(ctx, st->pipe, texObj);
+      _mesa_prepare_mipmap_levels(ctx, texObj, baseLevel, lastLevel);
 
-         /* release the old tex (will likely be freed too) */
-         pipe_resource_reference(&oldTex, NULL);
-         st_texture_release_all_sampler_views(st, stObj);
-      }
-      else {
-         /* Make sure that the base texture image data is present in the
-         * texture buffer.
-         */
-         st_finalize_texture(ctx, st->pipe, texObj);
-      }
+      texObj->GenerateMipmap = genSave;
+
+      /* At this point, memory for all the texture levels has been
+       * allocated.  However, the base level image may be in one resource
+       * while the subsequent/smaller levels may be in another resource.
+       * Finalizing the texture will copy the base images from the former
+       * resource to the latter.
+       *
+       * After this, we'll have all mipmap levels in one resource.
+       */
+      st_finalize_texture(ctx, st->pipe, texObj);
    }
 
    pt = stObj->pt;
+   if (!pt) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "mipmap generation");
+      return;
+   }
 
    assert(pt->last_level >= lastLevel);
 
@@ -169,48 +159,4 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
          _mesa_generate_mipmap(ctx, target, texObj);
       }
    }
-
-   /* Fill in the Mesa gl_texture_image fields */
-   for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
-      const uint srcLevel = dstLevel - 1;
-      const struct gl_texture_image *srcImage
-         = _mesa_get_tex_image(ctx, texObj, target, srcLevel);
-      struct gl_texture_image *dstImage;
-      struct st_texture_image *stImage;
-      uint border = srcImage->Border;
-      uint dstWidth, dstHeight, dstDepth;
-
-      dstWidth = u_minify(pt->width0, dstLevel);
-      if (texObj->Target == GL_TEXTURE_1D_ARRAY) {
-         dstHeight = pt->array_size;
-      }
-      else {
-         dstHeight = u_minify(pt->height0, dstLevel);
-      }
-      if (texObj->Target == GL_TEXTURE_2D_ARRAY ||
-          texObj->Target == GL_TEXTURE_CUBE_MAP_ARRAY) {
-         dstDepth = pt->array_size;
-      }
-      else {
-         dstDepth = u_minify(pt->depth0, dstLevel);
-      }
-
-      dstImage = _mesa_get_tex_image(ctx, texObj, target, dstLevel);
-      if (!dstImage) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "generating mipmaps");
-         return;
-      }
-
-      /* Free old image data */
-      ctx->Driver.FreeTextureImageBuffer(ctx, dstImage);
-
-      /* initialize new image */
-      _mesa_init_teximage_fields(ctx, dstImage, dstWidth, dstHeight,
-                                 dstDepth, border, srcImage->InternalFormat,
-                                 srcImage->TexFormat);
-
-      stImage = st_texture_image(dstImage);
-
-      pipe_resource_reference(&stImage->pt, pt);
-   }
 }

From 75b713455c77931e5e90c30e65ac99d74dfb17f0 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Sat, 26 Mar 2016 11:35:00 -0600
Subject: [PATCH 122/238] xlib: add support for GLX_ARB_create_context

This adds the glXCreateContextAttribsARB() function for the xlib/swrast
driver.  This allows more piglit tests to run with this driver.

For example, without this patch we get:
$ bin/fbo-generatemipmap-1d -auto
piglit: error: waffle_config_choose failed due to WAFFLE_ERROR_UNSUPPORTED_
ON_PLATFORM: GLX_ARB_create_context is required in order to request an OpenGL
version not equal to the default value 1.0
piglit: error: Failed to create waffle_config for OpenGL 2.0 Compatibility Context
piglit: info: Failed to create any GL context
PIGLIT: {"result": "skip" }

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Acked-by: Roland Scheidegger <sroland@vmware.com>
---
 src/mesa/drivers/x11/fakeglx.c | 52 ++++++++++++++++++++++++++++++++++
 src/mesa/drivers/x11/glxapi.c  | 20 +++++++++++++
 src/mesa/drivers/x11/glxapi.h  |  5 ++++
 3 files changed, 77 insertions(+)

diff --git a/src/mesa/drivers/x11/fakeglx.c b/src/mesa/drivers/x11/fakeglx.c
index 9286f718d00..80b71765e6c 100644
--- a/src/mesa/drivers/x11/fakeglx.c
+++ b/src/mesa/drivers/x11/fakeglx.c
@@ -74,6 +74,7 @@
    "GLX_MESA_copy_sub_buffer " \
    "GLX_MESA_pixmap_colormap " \
    "GLX_MESA_release_buffers " \
+   "GLX_ARB_create_context " \
    "GLX_ARB_get_proc_address " \
    "GLX_EXT_texture_from_pixmap " \
    "GLX_EXT_visual_info " \
@@ -2831,6 +2832,67 @@ Fake_glXReleaseTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer)
 }
 
 
+/**
+ * Implements glXCreateContextAttribsARB() (GLX_ARB_create_context).
+ *
+ * \param config         used as the XMesaVisual for the new context
+ * \param share_context  passed to XMesaCreateContext() as the share context
+ * \param direct         not used here
+ * \param attrib_list    zero-terminated <name, value> pairs; NULL is treated
+ *                       as an empty list
+ * \return the context from XMesaCreateContext(), or 0 if an attribute is
+ *         not recognized or a version above 2.1 is requested
+ */
+static GLXContext
+Fake_glXCreateContextAttribs(Display *dpy, GLXFBConfig config,
+                             GLXContext share_context, Bool direct,
+                             const int *attrib_list)
+{
+   XMesaContext xmCtx;
+   XMesaVisual xmvis = (XMesaVisual) config;
+   int i;
+   int major = 0, minor = 0, ctxFlags = 0, profileFlags = 0;
+
+   for (i = 0; attrib_list && attrib_list[i]; i += 2) {
+      switch (attrib_list[i]) {
+      case GLX_CONTEXT_MAJOR_VERSION_ARB:
+         major = attrib_list[i + 1];
+         break;
+      case GLX_CONTEXT_MINOR_VERSION_ARB:
+         minor = attrib_list[i + 1];
+         break;
+      case GLX_CONTEXT_FLAGS_ARB:
+         ctxFlags = attrib_list[i + 1];
+         break;
+      case GLX_CONTEXT_PROFILE_MASK_ARB:
+         profileFlags = attrib_list[i + 1];
+         break;
+      default:
+         fprintf(stderr, "Bad attribute in glXCreateContextAttribs()\n");
+         return 0;
+      }
+   }
+
+   if (major * 10 + minor > 21) {
+      /* swrast only supports GL 2.1 and earlier */
+      return 0;
+   }
+
+   /* These are ignored for now.  We'd have to enhance XMesaCreateContext
+    * to take these flags and the version, at least.
+    */
+   (void) ctxFlags;
+   (void) profileFlags;
+
+   /* deallocate unused windows/buffers */
+   XMesaGarbageCollect(dpy);
+
+   xmCtx = XMesaCreateContext(xmvis, (XMesaContext) share_context);
+
+   return (GLXContext) xmCtx;
+}
+
+
 /* silence warning */
 extern struct _glxapi_table *_mesa_GetGLXDispatchTable(void);
 
@@ -2990,5 +3041,6 @@ _mesa_GetGLXDispatchTable(void)
    glx.BindTexImageEXT = Fake_glXBindTexImageEXT;
    glx.ReleaseTexImageEXT = Fake_glXReleaseTexImageEXT;
 
+   glx.CreateContextAttribs = Fake_glXCreateContextAttribs;
    return &glx;
 }
diff --git a/src/mesa/drivers/x11/glxapi.c b/src/mesa/drivers/x11/glxapi.c
index a870e94ed4a..cc1bb2ab4b3 100644
--- a/src/mesa/drivers/x11/glxapi.c
+++ b/src/mesa/drivers/x11/glxapi.c
@@ -1319,6 +1319,9 @@ static struct name_address_pair GLX_functions[] = {
    { "glXBindTexImageEXT", (__GLXextFuncPtr) glXBindTexImageEXT },
    { "glXReleaseTexImageEXT", (__GLXextFuncPtr) glXReleaseTexImageEXT },
 
+   /*** GLX_ARB_create_context ***/
+   { "glXCreateContextAttribsARB", (__GLXextFuncPtr) glXCreateContextAttribsARB },
+
    { NULL, NULL }   /* end of list */
 };
 
@@ -1370,3 +1373,20 @@ void PUBLIC
 {
    return glXGetProcAddressARB(procName);
 }
+
+
+/**
+ * Added in GLX_ARB_create_context.
+ */
+GLXContext PUBLIC
+glXCreateContextAttribsARB(Display *dpy, GLXFBConfig config,
+                           GLXContext share_context, Bool direct,
+                           const int *attrib_list)
+{
+   struct _glxapi_table *t;
+   GET_DISPATCH(dpy, t);
+   if (!t)
+      return 0;
+   return (t->CreateContextAttribs)(dpy, config, share_context, direct,
+                                    attrib_list);
+}
diff --git a/src/mesa/drivers/x11/glxapi.h b/src/mesa/drivers/x11/glxapi.h
index bd6e97053e6..aff38f7531d 100644
--- a/src/mesa/drivers/x11/glxapi.h
+++ b/src/mesa/drivers/x11/glxapi.h
@@ -201,6 +201,11 @@ struct _glxapi_table {
    void (*BindTexImageEXT)(Display *dpy, GLXDrawable drawable, int buffer,
                            const int *attrib_list);
    void (*ReleaseTexImageEXT)(Display *dpy, GLXDrawable drawable, int buffer);
+
+   /*** GLX_ARB_create_context ***/
+   GLXContext (*CreateContextAttribs)(Display *dpy, GLXFBConfig config,
+                                      GLXContext share_context, Bool direct,
+                                      const int *attrib_list);
 };
 
 

From 6775268b61b1943c85f5a1a30ce330d34003e328 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 29 Mar 2016 11:32:24 -0600
Subject: [PATCH 123/238] gallium/docs: s/gven/given/

---
 src/gallium/docs/source/tgsi.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 6366f7e802d..3ac6ba3c25a 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2095,7 +2095,7 @@ after lookup.
 .. opcode:: SAMPLE
 
   Using provided address, sample data from the specified texture using the
-  filtering mode identified by the gven sampler. The source data may come from
+  filtering mode identified by the given sampler. The source data may come from
   any resource type other than buffers.
 
   Syntax: ``SAMPLE dst, address, sampler_view, sampler``

From 86e1768c13d67945f4a9549820e711b70ff2aba7 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Sat, 26 Mar 2016 11:46:53 -0600
Subject: [PATCH 124/238] tgsi: collect texture sampler target info in
 tgsi_scan_shader()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Texture sample instructions specify a sampler unit and texture target
such as "1D", "2D", "CUBE", etc.  Sampler view declarations also specify
the sampler unit and texture target.

This patch checks that the texture instructions agree with the declarations
and collects the texture target type for each sampler unit.

v2: only compare instruction's texture target to the sampler view declaration
target if the instruction is a TEX instruction, not a SAMPLE instruction.

Reviewed-by: José Fonseca <jfonseca@vmware.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 40 ++++++++++++++++++++++++--
 src/gallium/auxiliary/tgsi/tgsi_scan.h |  1 +
 2 files changed, 38 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index d32c3a14344..6d4b00d8879 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -181,11 +181,33 @@ scan_instruction(struct tgsi_shader_info *info,
          info->indirect_files_read |= (1 << src->Register.File);
       }
 
-      /* MSAA samplers */
+      /* Texture samplers */
       if (src->Register.File == TGSI_FILE_SAMPLER) {
-         assert(fullinst->Instruction.Texture);
-         assert(src->Register.Index < Elements(info->is_msaa_sampler));
+         const unsigned index = src->Register.Index;
+         const unsigned target = fullinst->Texture.Texture;
 
+         assert(fullinst->Instruction.Texture);
+         assert(index < Elements(info->is_msaa_sampler));
+         assert(index < PIPE_MAX_SAMPLERS);
+         assert(target < TGSI_TEXTURE_UNKNOWN);
+
+         if (tgsi_get_opcode_info(fullinst->Instruction.Opcode)->is_tex) {
+            /* for texture instructions, check that the texture instruction
+             * target matches the previous sampler view declaration (if there
+             * was one.)
+             */
+            if (info->sampler_targets[index] == TGSI_TEXTURE_UNKNOWN) {
+               /* probably no sampler view declaration */
+               info->sampler_targets[index] = target;
+            } else {
+               /* Make sure the texture instruction's sampler/target info
+                * agrees with the sampler view declaration.
+                */
+               assert(info->sampler_targets[index] == target);
+            }
+         }
+
+         /* MSAA samplers */
          if (fullinst->Instruction.Texture &&
              (fullinst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
               fullinst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA)) {
@@ -431,6 +453,16 @@ scan_declaration(struct tgsi_shader_info *info,
          }
       } else if (file == TGSI_FILE_SAMPLER) {
          info->samplers_declared |= 1 << reg;
+      } else if (file == TGSI_FILE_SAMPLER_VIEW) {
+         unsigned target = fulldecl->SamplerView.Resource;
+         assert(target < TGSI_TEXTURE_UNKNOWN);
+         if (info->sampler_targets[reg] == TGSI_TEXTURE_UNKNOWN) {
+            /* Save sampler target for this sampler index */
+            info->sampler_targets[reg] = target;
+         } else {
+            /* if previously declared, make sure targets agree */
+            assert(info->sampler_targets[reg] == target);
+         }
       } else if (file == TGSI_FILE_IMAGE) {
          if (fulldecl->Image.Resource == TGSI_TEXTURE_BUFFER)
             info->images_buffers |= 1 << reg;
@@ -493,6 +525,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
    for (i = 0; i < Elements(info->const_file_max); i++)
       info->const_file_max[i] = -1;
    info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = 1;
+   for (i = 0; i < Elements(info->sampler_targets); i++)
+      info->sampler_targets[i] = TGSI_TEXTURE_UNKNOWN;
 
    /**
     ** Setup to begin parsing input shader
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 76d8925119e..31adce7a603 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -65,6 +65,7 @@ struct tgsi_shader_info
    int file_max[TGSI_FILE_COUNT];  /**< highest index of declared registers */
    int const_file_max[PIPE_MAX_CONSTANT_BUFFERS];
    unsigned samplers_declared; /**< bitmask of declared samplers */
+   ubyte sampler_targets[PIPE_MAX_SHADER_SAMPLER_VIEWS];  /**< TGSI_TEXTURE_x values */
 
    ubyte input_array_first[PIPE_MAX_SHADER_INPUTS];
    ubyte input_array_last[PIPE_MAX_SHADER_INPUTS];

From 5c85c3be26566711a3bdf27df4f9fb07b126882d Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 29 Mar 2016 11:43:02 -0600
Subject: [PATCH 125/238] tgsi: simplify tgsi_shader_info::is_msaa_sampler
 checking
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We assert that fullinst->Instruction.Texture != 0 above so no need to
check it in the conditional.  We also have the fullinst->Texture.Texture
value in a local variable, so use it.

Reviewed-by: José Fonseca <jfonseca@vmware.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 6d4b00d8879..c71c7770bfb 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -208,9 +208,8 @@ scan_instruction(struct tgsi_shader_info *info,
          }
 
          /* MSAA samplers */
-         if (fullinst->Instruction.Texture &&
-             (fullinst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
-              fullinst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA)) {
+         if (target == TGSI_TEXTURE_2D_MSAA ||
+             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
             info->is_msaa_sampler[src->Register.Index] = TRUE;
          }
       }

From bab0752a805214645af92aec7ca692f723640c36 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 29 Mar 2016 12:54:10 -0600
Subject: [PATCH 126/238] docs: add HTTP link for Mesa downloads

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=92628
Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 docs/download.html | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/docs/download.html b/docs/download.html
index 856c2a53ca2..e90c1bbbf23 100644
--- a/docs/download.html
+++ b/docs/download.html
@@ -18,7 +18,9 @@
 
 <p>
 Primary Mesa download site:
-<a href="ftp://ftp.freedesktop.org/pub/mesa/">freedesktop.org</a> (FTP)
+<a href="ftp://ftp.freedesktop.org/pub/mesa/">ftp.freedesktop.org</a> (FTP)
+or <a href="https://mesa.freedesktop.org/archive/">mesa.freedesktop.org</a>
+(HTTP).
 </p>
 
 <p>

From ed39de90f1cd209b10baeed8ae98b1f56127c8de Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Tue, 29 Mar 2016 17:44:00 -0600
Subject: [PATCH 127/238] meta: use _mesa_prepare_mipmap_levels()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The prepare_mipmap_level() wrapper for _mesa_prepare_mipmap_level() is
not needed.  It only served to undo the GL_TEXTURE_1D_ARRAY height/depth
change that was made before the call to prepare_mipmap_level().

Said another way, regardless of how the meta code manipulates the height/
depth dims for GL_TEXTURE_1D_ARRAY, the gl_texture_image dimensions are
correctly set up by _mesa_prepare_mipmap_levels().

Tested by plugging _mesa_meta_GenerateMipmap() into the swrast driver
and testing with piglit.

v2 (idr): Early out of the mipmap generation loop when dstImage is NULL.
This can occur for immutable textures that have a limited range of
levels or in the presence of memory allocation failures.  Fixes
arb_texture_view-mipgen on Intel platforms.

Reviewed-by: José Fonseca <jfonseca@vmware.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Tested-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
---
 .../drivers/common/meta_generate_mipmap.c     | 32 +++++--------------
 1 file changed, 8 insertions(+), 24 deletions(-)

diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c
index d4b75390ebf..b81e179e2cd 100644
--- a/src/mesa/drivers/common/meta_generate_mipmap.c
+++ b/src/mesa/drivers/common/meta_generate_mipmap.c
@@ -137,21 +137,6 @@ _mesa_meta_glsl_generate_mipmap_cleanup(struct gl_context *ctx,
    _mesa_meta_blit_shader_table_cleanup(ctx, &mipmap->shaders);
 }
 
-static GLboolean
-prepare_mipmap_level(struct gl_context *ctx,
-                     struct gl_texture_object *texObj, GLuint level,
-                     GLsizei width, GLsizei height, GLsizei depth,
-                     GLenum intFormat, mesa_format format)
-{
-   if (texObj->Target == GL_TEXTURE_1D_ARRAY) {
-      /* Work around Mesa expecting the number of array slices in "height". */
-      height = depth;
-      depth = 1;
-   }
-
-   return _mesa_prepare_mipmap_level(ctx, texObj, level, width, height, depth,
-                                     0, intFormat, format);
-}
 
 /**
  * Called via ctx->Driver.GenerateMipmap()
@@ -270,6 +255,8 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
    /* texture is already locked, unlock now */
    _mesa_unlock_texture(ctx, texObj);
 
+   _mesa_prepare_mipmap_levels(ctx, texObj, baseLevel, maxLevel);
+
    for (dstLevel = baseLevel + 1; dstLevel <= maxLevel; dstLevel++) {
       const struct gl_texture_image *srcImage;
       struct gl_texture_image *dstImage;
@@ -309,17 +296,14 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
       _mesa_texture_parameteriv(ctx, texObj, GL_TEXTURE_MAX_LEVEL,
                                 (GLint *) &dstLevel, false);
 
-      if (!prepare_mipmap_level(ctx, texObj, dstLevel,
-                                dstWidth, dstHeight, dstDepth,
-                                srcImage->InternalFormat,
-                                srcImage->TexFormat)) {
-         /* All done.  We either ran out of memory or we would go beyond the
-          * last valid level of an immutable texture if we continued.
-          */
-         break;
-      }
       dstImage = _mesa_select_tex_image(texObj, faceTarget, dstLevel);
 
+      /* All done.  We either ran out of memory or we would go beyond the last
+       * valid level of an immutable texture if we continued.
+       */
+      if (dstImage == NULL)
+         break;
+
       /* limit minification to src level */
       _mesa_texture_parameteriv(ctx, texObj, GL_TEXTURE_MAX_LEVEL,
                                 (GLint *) &srcLevel, false);

From 513384d7e8db294d54a910f40c90492c211abc54 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Mon, 28 Mar 2016 17:27:27 -0600
Subject: [PATCH 128/238] mesa: make _mesa_prepare_mipmap_level() static
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

No longer called from any other file.

Reviewed-by: José Fonseca <jfonseca@vmware.com>
Reviewed-by: Roland Scheidegger <sroland@vmware.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Tested-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/mipmap.c | 16 ++++++++--------
 src/mesa/main/mipmap.h |  7 -------
 2 files changed, 8 insertions(+), 15 deletions(-)

diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index cb9afdef2af..5ff53f4265c 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -1810,11 +1810,11 @@ _mesa_next_mipmap_level_size(GLenum target, GLint border,
  * for mipmap generation.  If not, (re) allocate it.
  * \return GL_TRUE if successful, GL_FALSE if mipmap generation should stop
  */
-GLboolean
-_mesa_prepare_mipmap_level(struct gl_context *ctx,
-                           struct gl_texture_object *texObj, GLuint level,
-                           GLsizei width, GLsizei height, GLsizei depth,
-                           GLsizei border, GLenum intFormat, mesa_format format)
+static GLboolean
+prepare_mipmap_level(struct gl_context *ctx,
+                     struct gl_texture_object *texObj, GLuint level,
+                     GLsizei width, GLsizei height, GLsizei depth,
+                     GLsizei border, GLenum intFormat, mesa_format format)
 {
    const GLuint numFaces = _mesa_num_tex_faces(texObj->Target);
    GLuint face;
@@ -1902,9 +1902,9 @@ _mesa_prepare_mipmap_levels(struct gl_context *ctx,
          break;
       }
 
-      if (!_mesa_prepare_mipmap_level(ctx, texObj, level,
-                                      newWidth, newHeight, newDepth,
-                                      border, intFormat, texFormat)) {
+      if (!prepare_mipmap_level(ctx, texObj, level,
+                                newWidth, newHeight, newDepth,
+                                border, intFormat, texFormat)) {
          break;
       }
 
diff --git a/src/mesa/main/mipmap.h b/src/mesa/main/mipmap.h
index 33913e88417..d11c7fada37 100644
--- a/src/mesa/main/mipmap.h
+++ b/src/mesa/main/mipmap.h
@@ -40,13 +40,6 @@ _mesa_generate_mipmap_level(GLenum target,
                             GLubyte **dstData,
                             GLint dstRowStride);
 
-
-extern GLboolean
-_mesa_prepare_mipmap_level(struct gl_context *ctx,
-                           struct gl_texture_object *texObj, GLuint level,
-                           GLsizei width, GLsizei height, GLsizei depth,
-                           GLsizei border, GLenum intFormat, mesa_format format);
-
 void
 _mesa_prepare_mipmap_levels(struct gl_context *ctx,
                             struct gl_texture_object *texObj,

From 553e37aa337783d468f218291f6de6a74e49289b Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 29 Mar 2016 15:31:05 -0400
Subject: [PATCH 129/238] mesa: allow mutable buffer textures to back GL ES
 images

Since there is no way to create immutable texture buffers in GL ES,
mutable buffer textures are allowed to back images. See issue 7 of the
GL_OES_texture_buffer specification.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
---
 src/mesa/main/shaderimage.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c
index fd5934f939f..90643c4ed6d 100644
--- a/src/mesa/main/shaderimage.c
+++ b/src/mesa/main/shaderimage.c
@@ -583,8 +583,13 @@ _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level,
        *
        * "An INVALID_OPERATION error is generated if texture is not the name
        *  of an immutable texture object."
+       *
+       * However note that issue 7 of the GL_OES_texture_buffer spec
+       * recognizes that there is no way to create immutable buffer textures,
+       * so those are excluded from this requirement.
        */
-      if (_mesa_is_gles(ctx) && !t->Immutable) {
+      if (_mesa_is_gles(ctx) && !t->Immutable &&
+          t->Target != GL_TEXTURE_BUFFER) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glBindImageTexture(!immutable)");
          return;

From 2d3b8aefda1df66ef43c11c66e95ecb9a19c9137 Mon Sep 17 00:00:00 2001
From: Roland Scheidegger <sroland@vmware.com>
Date: Wed, 30 Mar 2016 04:25:45 +0200
Subject: [PATCH 130/238] tgsi: (trivial) only verify target for is_tex
 instructions

The d3d10 state tracker does not encode a (valid) target (only the offsets
are really used from the texture bits), since that information always comes
from the sview dcl, and not the instruction (note the meaning of target
is actually slightly different between gl and d3d10 in any case, because
the d3d10 target never includes the shadow bit).
Also move the msaa sampler identification into the is_tex block.  It would
need to be set on the sview, not the sampler, so while this does not fix it,
it at least makes it obvious that it won't work with sample instructions.
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index c71c7770bfb..76a6fef8b44 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -184,14 +184,14 @@ scan_instruction(struct tgsi_shader_info *info,
       /* Texture samplers */
       if (src->Register.File == TGSI_FILE_SAMPLER) {
          const unsigned index = src->Register.Index;
-         const unsigned target = fullinst->Texture.Texture;
 
          assert(fullinst->Instruction.Texture);
          assert(index < Elements(info->is_msaa_sampler));
          assert(index < PIPE_MAX_SAMPLERS);
-         assert(target < TGSI_TEXTURE_UNKNOWN);
 
          if (tgsi_get_opcode_info(fullinst->Instruction.Opcode)->is_tex) {
+            const unsigned target = fullinst->Texture.Texture;
+            assert(target < TGSI_TEXTURE_UNKNOWN);
             /* for texture instructions, check that the texture instruction
              * target matches the previous sampler view declaration (if there
              * was one.)
@@ -205,12 +205,11 @@ scan_instruction(struct tgsi_shader_info *info,
                 */
                assert(info->sampler_targets[index] == target);
             }
-         }
-
-         /* MSAA samplers */
-         if (target == TGSI_TEXTURE_2D_MSAA ||
-             target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
-            info->is_msaa_sampler[src->Register.Index] = TRUE;
+            /* MSAA samplers */
+            if (target == TGSI_TEXTURE_2D_MSAA ||
+                target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
+               info->is_msaa_sampler[src->Register.Index] = TRUE;
+            }
          }
       }
 

From 6773128bbf8703663ed1a4d6c1c3308b3c002a35 Mon Sep 17 00:00:00 2001
From: Lars Hamre <chemecse@gmail.com>
Date: Mon, 28 Mar 2016 20:42:14 -0400
Subject: [PATCH 131/238] glsl: invalidate float suffixes for GLSL 1.10 and
 GLSL ES 1.00

Float suffixes are not allowed in GLSL 1.10 nor GLSL ES 1.00.

Fixes the following piglit tests:
tests/spec/glsl-1.10/compiler/literals/invalid-float-suffix-capital-f.vert
tests/spec/glsl-1.10/compiler/literals/invalid-float-suffix-f.vert

v2: modify error message
v3: parse the float instead of returning an ERROR_TOK
v4: (by Ken) Change to is_version(120, 300) to avoid breaking ES3
    shaders; update commit message accordingly.

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=81585
Signed-off-by: Lars Hamre <chemecse@gmail.com>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/compiler/glsl/glsl_lexer.ll | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/compiler/glsl/glsl_lexer.ll b/src/compiler/glsl/glsl_lexer.ll
index 883c58f0da9..5492045f7c3 100644
--- a/src/compiler/glsl/glsl_lexer.ll
+++ b/src/compiler/glsl/glsl_lexer.ll
@@ -472,6 +472,13 @@ layout		{
 \.[0-9]+([eE][+-]?[0-9]+)?[fF]?		|
 [0-9]+\.([eE][+-]?[0-9]+)?[fF]?		|
 [0-9]+[eE][+-]?[0-9]+[fF]?		{
+			    struct _mesa_glsl_parse_state *state = yyextra;
+			    char suffix = yytext[strlen(yytext) - 1];
+			    if (!state->is_version(120, 300) &&
+			        (suffix == 'f' || suffix == 'F')) {
+			        _mesa_glsl_error(yylloc, state,
+			                         "Float suffixes are invalid in GLSL 1.10");
+			    }
 			    yylval->real = _mesa_strtof(yytext, NULL);
 			    return FLOATCONSTANT;
 			}

From bb37886f75f48727e5d3f5e2715c4a2f418ac1a1 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Tue, 29 Mar 2016 23:11:07 +0200
Subject: [PATCH 132/238] glsl: add missing types for buffer images

Type of GLSL_SAMPLER_DIM_BUF can be sampler or image.

Spotted while trying to run dEQP tests related to
ARB_shader_image_load_store.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Tested-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/compiler/glsl/ast_to_hir.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 29a4642af2c..fcc542ab5e0 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -2353,11 +2353,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type)
             return names[type_idx];
          }
          case GLSL_SAMPLER_DIM_BUF: {
-            assert(type->base_type == GLSL_TYPE_SAMPLER);
-            static const char *const names[4] = {
-              "samplerBuffer", NULL, NULL, NULL
+            static const char *const names[8] = {
+              "samplerBuffer", NULL, NULL, NULL,
+              "imageBuffer", NULL, NULL, NULL
             };
-            return names[type_idx];
+            return names[offset + type_idx];
          }
          case GLSL_SAMPLER_DIM_EXTERNAL: {
             assert(type->base_type == GLSL_TYPE_SAMPLER);
@@ -2415,11 +2415,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type)
             return names[type_idx];
          }
          case GLSL_SAMPLER_DIM_BUF: {
-            assert(type->base_type == GLSL_TYPE_SAMPLER);
-            static const char *const names[4] = {
-              "isamplerBuffer", NULL, NULL, NULL
+            static const char *const names[8] = {
+              "isamplerBuffer", NULL, NULL, NULL,
+              "iimageBuffer", NULL, NULL, NULL
             };
-            return names[type_idx];
+            return names[offset + type_idx];
          }
          default:
             unreachable("Unsupported isampler/iimage dimensionality");
@@ -2470,11 +2470,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type)
             return names[type_idx];
          }
          case GLSL_SAMPLER_DIM_BUF: {
-            assert(type->base_type == GLSL_TYPE_SAMPLER);
-            static const char *const names[4] = {
-              "usamplerBuffer", NULL, NULL, NULL
+            static const char *const names[8] = {
+              "usamplerBuffer", NULL, NULL, NULL,
+              "uimageBuffer", NULL, NULL, NULL
             };
-            return names[type_idx];
+            return names[offset + type_idx];
          }
          default:
             unreachable("Unsupported usampler/uimage dimensionality");

From 4541a785020aa6b9c6472d0fc4fb0fe8cdcec40f Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 30 Mar 2016 11:22:07 -0600
Subject: [PATCH 133/238] docs: remove docs/COPYING which contains GPL license

There hasn't been GPL code in Mesa for a long time now.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 docs/COPYING | 490 ---------------------------------------------------
 1 file changed, 490 deletions(-)
 delete mode 100644 docs/COPYING

diff --git a/docs/COPYING b/docs/COPYING
deleted file mode 100644
index b88946cc66b..00000000000
--- a/docs/COPYING
+++ /dev/null
@@ -1,490 +0,0 @@
-
-Some parts of Mesa are copyrighted under the GNU LGPL.  See the
-Mesa/docs/COPYRIGHT file for details.
-
-The following is the standard GNU copyright file.
-----------------------------------------------------------------------
-
-
-		  GNU LIBRARY GENERAL PUBLIC LICENSE
-		       Version 2, June 1991
-
- Copyright (C) 1991 Free Software Foundation, Inc.
-                    675 Mass Ave, Cambridge, MA 02139, USA
- Everyone is permitted to copy and distribute verbatim copies
- of this license document, but changing it is not allowed.
-
-[This is the first released version of the library GPL.  It is
- numbered 2 because it goes with version 2 of the ordinary GPL.]
-
-			    Preamble
-
-  The licenses for most software are designed to take away your
-freedom to share and change it.  By contrast, the GNU General Public
-Licenses are intended to guarantee your freedom to share and change
-free software--to make sure the software is free for all its users.
-
-  This license, the Library General Public License, applies to some
-specially designated Free Software Foundation software, and to any
-other libraries whose authors decide to use it.  You can use it for
-your libraries, too.
-
-  When we speak of free software, we are referring to freedom, not
-price.  Our General Public Licenses are designed to make sure that you
-have the freedom to distribute copies of free software (and charge for
-this service if you wish), that you receive source code or can get it
-if you want it, that you can change the software or use pieces of it
-in new free programs; and that you know you can do these things.
-
-  To protect your rights, we need to make restrictions that forbid
-anyone to deny you these rights or to ask you to surrender the rights.
-These restrictions translate to certain responsibilities for you if
-you distribute copies of the library, or if you modify it.
-
-  For example, if you distribute copies of the library, whether gratis
-or for a fee, you must give the recipients all the rights that we gave
-you.  You must make sure that they, too, receive or can get the source
-code.  If you link a program with the library, you must provide
-complete object files to the recipients so that they can relink them
-with the library, after making changes to the library and recompiling
-it.  And you must show them these terms so they know their rights.
-
-  Our method of protecting your rights has two steps: (1) copyright
-the library, and (2) offer you this license which gives you legal
-permission to copy, distribute and/or modify the library.
-
-  Also, for each distributor's protection, we want to make certain
-that everyone understands that there is no warranty for this free
-library.  If the library is modified by someone else and passed on, we
-want its recipients to know that what they have is not the original
-version, so that any problems introduced by others will not reflect on
-the original authors' reputations.
-
-  Finally, any free program is threatened constantly by software
-patents.  We wish to avoid the danger that companies distributing free
-software will individually obtain patent licenses, thus in effect
-transforming the program into proprietary software.  To prevent this,
-we have made it clear that any patent must be licensed for everyone's
-free use or not licensed at all.
-
-  Most GNU software, including some libraries, is covered by the ordinary
-GNU General Public License, which was designed for utility programs.  This
-license, the GNU Library General Public License, applies to certain
-designated libraries.  This license is quite different from the ordinary
-one; be sure to read it in full, and don't assume that anything in it is
-the same as in the ordinary license.
-
-  The reason we have a separate public license for some libraries is that
-they blur the distinction we usually make between modifying or adding to a
-program and simply using it.  Linking a program with a library, without
-changing the library, is in some sense simply using the library, and is
-analogous to running a utility program or application program.  However, in
-a textual and legal sense, the linked executable is a combined work, a
-derivative of the original library, and the ordinary General Public License
-treats it as such.
-
-  Because of this blurred distinction, using the ordinary General
-Public License for libraries did not effectively promote software
-sharing, because most developers did not use the libraries.  We
-concluded that weaker conditions might promote sharing better.
-
-  However, unrestricted linking of non-free programs would deprive the
-users of those programs of all benefit from the free status of the
-libraries themselves.  This Library General Public License is intended to
-permit developers of non-free programs to use free libraries, while
-preserving your freedom as a user of such programs to change the free
-libraries that are incorporated in them.  (We have not seen how to achieve
-this as regards changes in header files, but we have achieved it as regards
-changes in the actual functions of the Library.)  The hope is that this
-will lead to faster development of free libraries.
-
-  The precise terms and conditions for copying, distribution and
-modification follow.  Pay close attention to the difference between a
-"work based on the library" and a "work that uses the library".  The
-former contains code derived from the library, while the latter only
-works together with the library.
-
-  Note that it is possible for a library to be covered by the ordinary
-General Public License rather than by this special one.
-
-		  GNU LIBRARY GENERAL PUBLIC LICENSE
-   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
-
-  0. This License Agreement applies to any software library which
-contains a notice placed by the copyright holder or other authorized
-party saying it may be distributed under the terms of this Library
-General Public License (also called "this License").  Each licensee is
-addressed as "you".
-
-  A "library" means a collection of software functions and/or data
-prepared so as to be conveniently linked with application programs
-(which use some of those functions and data) to form executables.
-
-  The "Library", below, refers to any such software library or work
-which has been distributed under these terms.  A "work based on the
-Library" means either the Library or any derivative work under
-copyright law: that is to say, a work containing the Library or a
-portion of it, either verbatim or with modifications and/or translated
-straightforwardly into another language.  (Hereinafter, translation is
-included without limitation in the term "modification".)
-
-  "Source code" for a work means the preferred form of the work for
-making modifications to it.  For a library, complete source code means
-all the source code for all modules it contains, plus any associated
-interface definition files, plus the scripts used to control compilation
-and installation of the library.
-
-  Activities other than copying, distribution and modification are not
-covered by this License; they are outside its scope.  The act of
-running a program using the Library is not restricted, and output from
-such a program is covered only if its contents constitute a work based
-on the Library (independent of the use of the Library in a tool for
-writing it).  Whether that is true depends on what the Library does
-and what the program that uses the Library does.
-  
-  1. You may copy and distribute verbatim copies of the Library's
-complete source code as you receive it, in any medium, provided that
-you conspicuously and appropriately publish on each copy an
-appropriate copyright notice and disclaimer of warranty; keep intact
-all the notices that refer to this License and to the absence of any
-warranty; and distribute a copy of this License along with the
-Library.
-
-  You may charge a fee for the physical act of transferring a copy,
-and you may at your option offer warranty protection in exchange for a
-fee.
-
-  2. You may modify your copy or copies of the Library or any portion
-of it, thus forming a work based on the Library, and copy and
-distribute such modifications or work under the terms of Section 1
-above, provided that you also meet all of these conditions:
-
-    a) The modified work must itself be a software library.
-
-    b) You must cause the files modified to carry prominent notices
-    stating that you changed the files and the date of any change.
-
-    c) You must cause the whole of the work to be licensed at no
-    charge to all third parties under the terms of this License.
-
-    d) If a facility in the modified Library refers to a function or a
-    table of data to be supplied by an application program that uses
-    the facility, other than as an argument passed when the facility
-    is invoked, then you must make a good faith effort to ensure that,
-    in the event an application does not supply such function or
-    table, the facility still operates, and performs whatever part of
-    its purpose remains meaningful.
-
-    (For example, a function in a library to compute square roots has
-    a purpose that is entirely well-defined independent of the
-    application.  Therefore, Subsection 2d requires that any
-    application-supplied function or table used by this function must
-    be optional: if the application does not supply it, the square
-    root function must still compute square roots.)
-
-These requirements apply to the modified work as a whole.  If
-identifiable sections of that work are not derived from the Library,
-and can be reasonably considered independent and separate works in
-themselves, then this License, and its terms, do not apply to those
-sections when you distribute them as separate works.  But when you
-distribute the same sections as part of a whole which is a work based
-on the Library, the distribution of the whole must be on the terms of
-this License, whose permissions for other licensees extend to the
-entire whole, and thus to each and every part regardless of who wrote
-it.
-
-Thus, it is not the intent of this section to claim rights or contest
-your rights to work written entirely by you; rather, the intent is to
-exercise the right to control the distribution of derivative or
-collective works based on the Library.
-
-In addition, mere aggregation of another work not based on the Library
-with the Library (or with a work based on the Library) on a volume of
-a storage or distribution medium does not bring the other work under
-the scope of this License.
-
-  3. You may opt to apply the terms of the ordinary GNU General Public
-License instead of this License to a given copy of the Library.  To do
-this, you must alter all the notices that refer to this License, so
-that they refer to the ordinary GNU General Public License, version 2,
-instead of to this License.  (If a newer version than version 2 of the
-ordinary GNU General Public License has appeared, then you can specify
-that version instead if you wish.)  Do not make any other change in
-these notices.
-
-  Once this change is made in a given copy, it is irreversible for
-that copy, so the ordinary GNU General Public License applies to all
-subsequent copies and derivative works made from that copy.
-
-  This option is useful when you wish to copy part of the code of
-the Library into a program that is not a library.
-
-  4. You may copy and distribute the Library (or a portion or
-derivative of it, under Section 2) in object code or executable form
-under the terms of Sections 1 and 2 above provided that you accompany
-it with the complete corresponding machine-readable source code, which
-must be distributed under the terms of Sections 1 and 2 above on a
-medium customarily used for software interchange.
-
-  If distribution of object code is made by offering access to copy
-from a designated place, then offering equivalent access to copy the
-source code from the same place satisfies the requirement to
-distribute the source code, even though third parties are not
-compelled to copy the source along with the object code.
-
-  5. A program that contains no derivative of any portion of the
-Library, but is designed to work with the Library by being compiled or
-linked with it, is called a "work that uses the Library".  Such a
-work, in isolation, is not a derivative work of the Library, and
-therefore falls outside the scope of this License.
-
-  However, linking a "work that uses the Library" with the Library
-creates an executable that is a derivative of the Library (because it
-contains portions of the Library), rather than a "work that uses the
-library".  The executable is therefore covered by this License.
-Section 6 states terms for distribution of such executables.
-
-  When a "work that uses the Library" uses material from a header file
-that is part of the Library, the object code for the work may be a
-derivative work of the Library even though the source code is not.
-Whether this is true is especially significant if the work can be
-linked without the Library, or if the work is itself a library.  The
-threshold for this to be true is not precisely defined by law.
-
-  If such an object file uses only numerical parameters, data
-structure layouts and accessors, and small macros and small inline
-functions (ten lines or less in length), then the use of the object
-file is unrestricted, regardless of whether it is legally a derivative
-work.  (Executables containing this object code plus portions of the
-Library will still fall under Section 6.)
-
-  Otherwise, if the work is a derivative of the Library, you may
-distribute the object code for the work under the terms of Section 6.
-Any executables containing that work also fall under Section 6,
-whether or not they are linked directly with the Library itself.
-
-  6. As an exception to the Sections above, you may also compile or
-link a "work that uses the Library" with the Library to produce a
-work containing portions of the Library, and distribute that work
-under terms of your choice, provided that the terms permit
-modification of the work for the customer's own use and reverse
-engineering for debugging such modifications.
-
-  You must give prominent notice with each copy of the work that the
-Library is used in it and that the Library and its use are covered by
-this License.  You must supply a copy of this License.  If the work
-during execution displays copyright notices, you must include the
-copyright notice for the Library among them, as well as a reference
-directing the user to the copy of this License.  Also, you must do one
-of these things:
-
-    a) Accompany the work with the complete corresponding
-    machine-readable source code for the Library including whatever
-    changes were used in the work (which must be distributed under
-    Sections 1 and 2 above); and, if the work is an executable linked
-    with the Library, with the complete machine-readable "work that
-    uses the Library", as object code and/or source code, so that the
-    user can modify the Library and then relink to produce a modified
-    executable containing the modified Library.  (It is understood
-    that the user who changes the contents of definitions files in the
-    Library will not necessarily be able to recompile the application
-    to use the modified definitions.)
-
-    b) Accompany the work with a written offer, valid for at
-    least three years, to give the same user the materials
-    specified in Subsection 6a, above, for a charge no more
-    than the cost of performing this distribution.
-
-    c) If distribution of the work is made by offering access to copy
-    from a designated place, offer equivalent access to copy the above
-    specified materials from the same place.
-
-    d) Verify that the user has already received a copy of these
-    materials or that you have already sent this user a copy.
-
-  For an executable, the required form of the "work that uses the
-Library" must include any data and utility programs needed for
-reproducing the executable from it.  However, as a special exception,
-the source code distributed need not include anything that is normally
-distributed (in either source or binary form) with the major
-components (compiler, kernel, and so on) of the operating system on
-which the executable runs, unless that component itself accompanies
-the executable.
-
-  It may happen that this requirement contradicts the license
-restrictions of other proprietary libraries that do not normally
-accompany the operating system.  Such a contradiction means you cannot
-use both them and the Library together in an executable that you
-distribute.
-
-  7. You may place library facilities that are a work based on the
-Library side-by-side in a single library together with other library
-facilities not covered by this License, and distribute such a combined
-library, provided that the separate distribution of the work based on
-the Library and of the other library facilities is otherwise
-permitted, and provided that you do these two things:
-
-    a) Accompany the combined library with a copy of the same work
-    based on the Library, uncombined with any other library
-    facilities.  This must be distributed under the terms of the
-    Sections above.
-
-    b) Give prominent notice with the combined library of the fact
-    that part of it is a work based on the Library, and explaining
-    where to find the accompanying uncombined form of the same work.
-
-  8. You may not copy, modify, sublicense, link with, or distribute
-the Library except as expressly provided under this License.  Any
-attempt otherwise to copy, modify, sublicense, link with, or
-distribute the Library is void, and will automatically terminate your
-rights under this License.  However, parties who have received copies,
-or rights, from you under this License will not have their licenses
-terminated so long as such parties remain in full compliance.
-
-  9. You are not required to accept this License, since you have not
-signed it.  However, nothing else grants you permission to modify or
-distribute the Library or its derivative works.  These actions are
-prohibited by law if you do not accept this License.  Therefore, by
-modifying or distributing the Library (or any work based on the
-Library), you indicate your acceptance of this License to do so, and
-all its terms and conditions for copying, distributing or modifying
-the Library or works based on it.
-
-  10. Each time you redistribute the Library (or any work based on the
-Library), the recipient automatically receives a license from the
-original licensor to copy, distribute, link with or modify the Library
-subject to these terms and conditions.  You may not impose any further
-restrictions on the recipients' exercise of the rights granted herein.
-You are not responsible for enforcing compliance by third parties to
-this License.
-
-  11. If, as a consequence of a court judgment or allegation of patent
-infringement or for any other reason (not limited to patent issues),
-conditions are imposed on you (whether by court order, agreement or
-otherwise) that contradict the conditions of this License, they do not
-excuse you from the conditions of this License.  If you cannot
-distribute so as to satisfy simultaneously your obligations under this
-License and any other pertinent obligations, then as a consequence you
-may not distribute the Library at all.  For example, if a patent
-license would not permit royalty-free redistribution of the Library by
-all those who receive copies directly or indirectly through you, then
-the only way you could satisfy both it and this License would be to
-refrain entirely from distribution of the Library.
-
-If any portion of this section is held invalid or unenforceable under any
-particular circumstance, the balance of the section is intended to apply,
-and the section as a whole is intended to apply in other circumstances.
-
-It is not the purpose of this section to induce you to infringe any
-patents or other property right claims or to contest validity of any
-such claims; this section has the sole purpose of protecting the
-integrity of the free software distribution system which is
-implemented by public license practices.  Many people have made
-generous contributions to the wide range of software distributed
-through that system in reliance on consistent application of that
-system; it is up to the author/donor to decide if he or she is willing
-to distribute software through any other system and a licensee cannot
-impose that choice.
-
-This section is intended to make thoroughly clear what is believed to
-be a consequence of the rest of this License.
-
-  12. If the distribution and/or use of the Library is restricted in
-certain countries either by patents or by copyrighted interfaces, the
-original copyright holder who places the Library under this License may add
-an explicit geographical distribution limitation excluding those countries,
-so that distribution is permitted only in or among countries not thus
-excluded.  In such case, this License incorporates the limitation as if
-written in the body of this License.
-
-  13. The Free Software Foundation may publish revised and/or new
-versions of the Library General Public License from time to time.
-Such new versions will be similar in spirit to the present version,
-but may differ in detail to address new problems or concerns.
-
-Each version is given a distinguishing version number.  If the Library
-specifies a version number of this License which applies to it and
-"any later version", you have the option of following the terms and
-conditions either of that version or of any later version published by
-the Free Software Foundation.  If the Library does not specify a
-license version number, you may choose any version ever published by
-the Free Software Foundation.
-
-  14. If you wish to incorporate parts of the Library into other free
-programs whose distribution conditions are incompatible with these,
-write to the author to ask for permission.  For software which is
-copyrighted by the Free Software Foundation, write to the Free
-Software Foundation; we sometimes make exceptions for this.  Our
-decision will be guided by the two goals of preserving the free status
-of all derivatives of our free software and of promoting the sharing
-and reuse of software generally.
-
-			    NO WARRANTY
-
-  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
-WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
-EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
-OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
-KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
-PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
-LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
-THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
-
-  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
-WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
-AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
-FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
-CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
-LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
-RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
-FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
-SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
-DAMAGES.
-
-		     END OF TERMS AND CONDITIONS
-
-     Appendix: How to Apply These Terms to Your New Libraries
-
-  If you develop a new library, and you want it to be of the greatest
-possible use to the public, we recommend making it free software that
-everyone can redistribute and change.  You can do so by permitting
-redistribution under these terms (or, alternatively, under the terms of the
-ordinary General Public License).
-
-  To apply these terms, attach the following notices to the library.  It is
-safest to attach them to the start of each source file to most effectively
-convey the exclusion of warranty; and each file should have at least the
-"copyright" line and a pointer to where the full notice is found.
-
-    <one line to give the library's name and a brief idea of what it does.>
-    Copyright (C) <year>  <name of author>
-
-    This library is free software; you can redistribute it and/or
-    modify it under the terms of the GNU Library General Public
-    License as published by the Free Software Foundation; either
-    version 2 of the License, or (at your option) any later version.
-
-    This library is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-    Library General Public License for more details.
-
-    You should have received a copy of the GNU Library General Public
-    License along with this library; if not, write to the Free
-    Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
-
-Also add information on how to contact you by electronic and paper mail.
-
-You should also get your employer (if you work as a programmer) or your
-school, if any, to sign a "copyright disclaimer" for the library, if
-necessary.  Here is a sample; alter the names:
-
-  Yoyodyne, Inc., hereby disclaims all copyright interest in the
-  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
-
-  <signature of Ty Coon>, 1 April 1990
-  Ty Coon, President of Vice
-
-That's all there is to it!
-

From 9a73f5728e9b834c51128e34317854702281bf3e Mon Sep 17 00:00:00 2001
From: Thomas Hindoe Paaboel Andersen <phomes@gmail.com>
Date: Wed, 30 Mar 2016 08:13:24 +0200
Subject: [PATCH 134/238] st/vdpau: correct null check
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The null check of result was the wrong way around. Also, move memset
and dereference of result after the null check.

Reviewed-by: Christian König <christian.koenig@amd.com>
---
 src/gallium/state_trackers/vdpau/surface.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/gallium/state_trackers/vdpau/surface.c b/src/gallium/state_trackers/vdpau/surface.c
index 0550141b597..d418d56a26a 100644
--- a/src/gallium/state_trackers/vdpau/surface.c
+++ b/src/gallium/state_trackers/vdpau/surface.c
@@ -426,18 +426,18 @@ VdpStatus vlVdpVideoSurfaceDMABuf(VdpVideoSurface surface,
 
    struct pipe_surface *surf;
 
-   memset(result, 0, sizeof(*result));
-   result->handle = -1;
-
    if (!p_surf)
       return VDP_STATUS_INVALID_HANDLE;
 
    if (plane > 3)
       return VDP_STATUS_INVALID_VALUE;
 
-   if (result)
+   if (!result)
       return VDP_STATUS_INVALID_POINTER;
 
+   memset(result, 0, sizeof(*result));
+   result->handle = -1;
+
    pipe_mutex_lock(p_surf->device->mutex);
    if (p_surf->video_buffer == NULL) {
       struct pipe_context *pipe = p_surf->device->context;

From 1faca438bdbf11d85a6158d41ea91cab40fc2033 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Christian=20K=C3=B6nig?= <christian.koenig@amd.com>
Date: Wed, 30 Mar 2016 15:38:29 +0200
Subject: [PATCH 135/238] r600: ignore PIPE_BIND_LINEAR in
 *_is_format_supported
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Similar to radeonsi, linear layout should work for all formats that are
neither compressed nor depth/stencil. Fixes issues with VDPAU on r600.

Signed-off-by: Christian König <christian.koenig@amd.com>
Tested-by: Dieter Nützel <Dieter@nuetzel-hh.de>
---
 src/gallium/drivers/r600/evergreen_state.c | 5 +++++
 src/gallium/drivers/r600/r600_state.c      | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 83313cb28cf..65952676987 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -299,6 +299,11 @@ boolean evergreen_is_format_supported(struct pipe_screen *screen,
 	if (usage & PIPE_BIND_TRANSFER_WRITE)
 		retval |= PIPE_BIND_TRANSFER_WRITE;
 
+	if ((usage & PIPE_BIND_LINEAR) &&
+	    !util_format_is_compressed(format) &&
+	    !(usage & PIPE_BIND_DEPTH_STENCIL))
+		retval |= PIPE_BIND_LINEAR;
+
 	return retval == usage;
 }
 
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index f9026197b26..3189a1360b1 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -239,6 +239,11 @@ boolean r600_is_format_supported(struct pipe_screen *screen,
 	if (usage & PIPE_BIND_TRANSFER_WRITE)
 		retval |= PIPE_BIND_TRANSFER_WRITE;
 
+	if ((usage & PIPE_BIND_LINEAR) &&
+	    !util_format_is_compressed(format) &&
+	    !(usage & PIPE_BIND_DEPTH_STENCIL))
+		retval |= PIPE_BIND_LINEAR;
+
 	return retval == usage;
 }
 

From a74fc3fe8ada87e1fedeea86f2d93f736a1217bc Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Tue, 24 Nov 2015 17:17:29 -0800
Subject: [PATCH 136/238] i965: Don't inline intel_batchbuffer_require_space().

It's called by the inline intel_batchbuffer_begin() function which
itself is used in BEGIN_BATCH. So in a sequence of code emitting multiple
packets, we have inlined this ~200-byte function multiple times. Making
it an out-of-line function presumably improved icache usage.

Improves performance of Gl32Batch7 by 3.39898% +/- 0.358674% (n=155) on
Ivybridge.

Reviewed-by: Abdiel Janulgue <abdiel.janulgue@linux.intel.com>
---
 src/mesa/drivers/dri/i965/intel_batchbuffer.c | 26 +++++++++++++++++
 src/mesa/drivers/dri/i965/intel_batchbuffer.h | 28 ++-----------------
 2 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index f77807472fd..e41f927819e 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -106,6 +106,32 @@ intel_batchbuffer_free(struct brw_context *brw)
    drm_intel_bo_unreference(brw->batch.bo);
 }
 
+void
+intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
+                                enum brw_gpu_ring ring)
+{
+   /* If we're switching rings, implicitly flush the batch. */
+   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
+       brw->gen >= 6) {
+      intel_batchbuffer_flush(brw);
+   }
+
+#ifdef DEBUG
+   assert(sz < BATCH_SZ - BATCH_RESERVED);
+#endif
+   if (intel_batchbuffer_space(brw) < sz)
+      intel_batchbuffer_flush(brw);
+
+   enum brw_gpu_ring prev_ring = brw->batch.ring;
+   /* The intel_batchbuffer_flush() calls above might have changed
+    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
+    */
+   brw->batch.ring = ring;
+
+   if (unlikely(prev_ring == UNKNOWN_RING && ring == RENDER_RING))
+      intel_batchbuffer_emit_render_ring_prelude(brw);
+}
+
 static void
 do_batch_dump(struct brw_context *brw)
 {
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index f47369029a0..aa1dc38babc 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -44,6 +44,8 @@ void intel_batchbuffer_init(struct brw_context *brw);
 void intel_batchbuffer_free(struct brw_context *brw);
 void intel_batchbuffer_save_state(struct brw_context *brw);
 void intel_batchbuffer_reset_to_saved(struct brw_context *brw);
+void intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
+                                     enum brw_gpu_ring ring);
 
 int _intel_batchbuffer_flush(struct brw_context *brw,
 			     const char *file, int line);
@@ -116,32 +118,6 @@ intel_batchbuffer_emit_float(struct brw_context *brw, float f)
    intel_batchbuffer_emit_dword(brw, float_as_int(f));
 }
 
-static inline void
-intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
-                                enum brw_gpu_ring ring)
-{
-   /* If we're switching rings, implicitly flush the batch. */
-   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
-       brw->gen >= 6) {
-      intel_batchbuffer_flush(brw);
-   }
-
-#ifdef DEBUG
-   assert(sz < BATCH_SZ - BATCH_RESERVED);
-#endif
-   if (intel_batchbuffer_space(brw) < sz)
-      intel_batchbuffer_flush(brw);
-
-   enum brw_gpu_ring prev_ring = brw->batch.ring;
-   /* The intel_batchbuffer_flush() calls above might have changed
-    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
-    */
-   brw->batch.ring = ring;
-
-   if (unlikely(prev_ring == UNKNOWN_RING && ring == RENDER_RING))
-      intel_batchbuffer_emit_render_ring_prelude(brw);
-}
-
 static inline void
 intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring)
 {

From 6702f1acde9e93e41783fd2f4a7999fed8d9cb75 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 22 Feb 2016 15:16:49 -0800
Subject: [PATCH 137/238] nir: Propagate negates up multiplication chains.

total instructions in shared programs: 7112159 -> 7088092 (-0.34%)
instructions in affected programs: 1374915 -> 1350848 (-1.75%)
helped: 7392
HURT: 621

GAINED: 2
LOST:   2
---
 src/compiler/nir/nir_opt_algebraic.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 53633233f2b..c2e56e71734 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -269,6 +269,10 @@ optimizations = [
    (('fabs', ('fsub', 0.0, a)), ('fabs', a)),
    (('iabs', ('isub', 0, a)), ('iabs', a)),
 
+   # Propagate negation up multiplication chains
+   (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))),
+   (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
+
    # Misc. lowering
    (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
    (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),

From 1ff4cc053589ae2ea10a63116b1e1fe15ecdfbeb Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 21 Mar 2016 08:51:54 +1000
Subject: [PATCH 138/238] tgsi_exec: add support for up to 3 address registers
 (v2)

v2: be consistent with other definitions.

Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/tgsi/tgsi_exec.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 991c3bfc5db..8b5a5806602 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -201,12 +201,13 @@ struct tgsi_sampler
 #define TGSI_EXEC_NUM_TEMP_R        4
 
 #define TGSI_EXEC_TEMP_ADDR         (TGSI_EXEC_NUM_TEMPS + 8)
+#define TGSI_EXEC_NUM_ADDRS         3
 
 /* predicate register */
-#define TGSI_EXEC_TEMP_P0           (TGSI_EXEC_NUM_TEMPS + 9)
+#define TGSI_EXEC_TEMP_P0           (TGSI_EXEC_NUM_TEMPS + 11)
 #define TGSI_EXEC_NUM_PREDS         1
 
-#define TGSI_EXEC_NUM_TEMP_EXTRAS   10
+#define TGSI_EXEC_NUM_TEMP_EXTRAS   12
 
 
 

From ca180c09bb0941468814796f13c0701590523be4 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Mon, 21 Mar 2016 08:52:14 +1000
Subject: [PATCH 139/238] tgsi_exec: handle execmask when doing indirect
 lookups

Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/tgsi/tgsi_exec.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 126259fc0f8..a44a05c49f4 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1995,11 +1995,11 @@ fetch_sampler_unit(struct tgsi_exec_machine *mach,
                    uint sampler)
 {
    uint unit;
-
+   int i;
    if (inst->Src[sampler].Register.Indirect) {
       const struct tgsi_full_src_register *reg = &inst->Src[sampler];
       union tgsi_exec_channel indir_index, index2;
-
+      const uint execmask = mach->ExecMask;
       index2.i[0] =
       index2.i[1] =
       index2.i[2] =
@@ -2012,7 +2012,13 @@ fetch_sampler_unit(struct tgsi_exec_machine *mach,
                              &index2,
                              &ZeroVec,
                              &indir_index);
-      unit = inst->Src[sampler].Register.Index + indir_index.i[0];
+      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
+         if (execmask & (1 << i)) {
+            unit = inst->Src[sampler].Register.Index + indir_index.i[i];
+            break;
+         }
+      }
+
    } else {
       unit = inst->Src[sampler].Register.Index;
    }

From 827393b76fffa352e0ff3cae077c7817d6cfbf8a Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 22 Mar 2016 07:50:37 +1000
Subject: [PATCH 140/238] tgsi: introduce NonHelperMask

This is a mask of which of the current 2x2 grid are non-helper
invocations. This allows us to mask off the helper invocations
later for the image operations.

Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/tgsi/tgsi_exec.c | 2 ++
 src/gallium/auxiliary/tgsi/tgsi_exec.h | 3 +++
 2 files changed, 5 insertions(+)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index a44a05c49f4..fa1c9161bcc 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -5199,6 +5199,8 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
       default_mask = 0x1;
    }
 
+   if (mach->NonHelperMask == 0)
+      mach->NonHelperMask = default_mask;
    mach->CondMask = default_mask;
    mach->LoopMask = default_mask;
    mach->ContMask = default_mask;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 8b5a5806602..6bcf45b4dae 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -312,6 +312,9 @@ struct tgsi_exec_machine
    struct tgsi_exec_vector       QuadPos;
    float                         Face;    /**< +1 if front facing, -1 if back facing */
    bool                          flatshade_color;
+
+   /* See GLSL 4.50 specification for definition of helper invocations */
+   uint NonHelperMask;  /**< non-helpers */
    /* Conditional execution masks */
    uint CondMask;  /**< For IF/ELSE/ENDIF */
    uint LoopMask;  /**< For BGNLOOP/ENDLOOP */

From 493eab76792307d066489bc1d88798f14a5df31d Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 22 Mar 2016 07:52:26 +1000
Subject: [PATCH 141/238] softpipe: add support for explicit early depth
 testing

ARB_shader_image_load_store adds support for explicit early
depth testing. However we need to make sure we don't overwrite
values using the shader written values in this case.

This fixes early depth testing in softpipe to conform with
those requirements.

Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/drivers/softpipe/sp_context.h        |  2 ++
 src/gallium/drivers/softpipe/sp_fs_exec.c        | 16 ++++++++++------
 .../drivers/softpipe/sp_quad_depth_test.c        |  4 ++--
 src/gallium/drivers/softpipe/sp_quad_fs.c        |  2 +-
 src/gallium/drivers/softpipe/sp_quad_pipe.c      |  6 ++++--
 src/gallium/drivers/softpipe/sp_state.h          |  3 ++-
 6 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index d5c4aaae638..d18bbe693f3 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -175,6 +175,8 @@ struct softpipe_context {
    } tgsi;
 
    struct tgsi_exec_machine *fs_machine;
+   /** whether early depth testing is enabled */
+   bool early_depth;
 
    /** The primitive drawing context */
    struct draw_context *draw;
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 89411777ec9..e2d527dab66 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -116,7 +116,8 @@ setup_pos_vector(const struct tgsi_interp_coef *coef,
 static unsigned 
 exec_run( const struct sp_fragment_shader_variant *var,
 	  struct tgsi_exec_machine *machine,
-	  struct quad_header *quad )
+	  struct quad_header *quad,
+	  bool early_depth_test )
 {
    /* Compute X, Y, Z, W vals for this quad */
    setup_pos_vector(quad->posCoef, 
@@ -155,16 +156,19 @@ exec_run( const struct sp_fragment_shader_variant *var,
             {
                uint j;
 
-               for (j = 0; j < 4; j++)
-                  quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+               if (!early_depth_test) {
+                  for (j = 0; j < 4; j++)
+                     quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+               }
             }
             break;
          case TGSI_SEMANTIC_STENCIL:
             {
                uint j;
-
-               for (j = 0; j < 4; j++)
-                  quad->output.stencil[j] = (unsigned)machine->Outputs[i].xyzw[1].u[j];
+               if (!early_depth_test) {
+                  for (j = 0; j < 4; j++)
+                     quad->output.stencil[j] = (unsigned)machine->Outputs[i].xyzw[1].u[j];
+               }
             }
             break;
          }
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
index 4cce9e9bc12..847a616f491 100644
--- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -782,7 +782,7 @@ depth_test_quads_fallback(struct quad_stage *qs,
 {
    unsigned i, pass = 0;
    const struct tgsi_shader_info *fsInfo = &qs->softpipe->fs_variant->info;
-   boolean interp_depth = !fsInfo->writes_z;
+   boolean interp_depth = !fsInfo->writes_z || qs->softpipe->early_depth;
    boolean shader_stencil_ref = fsInfo->writes_stencil;
    struct depth_data data;
    unsigned vp_idx = quads[0]->input.viewport_index;
@@ -902,7 +902,7 @@ choose_depth_test(struct quad_stage *qs,
 {
    const struct tgsi_shader_info *fsInfo = &qs->softpipe->fs_variant->info;
 
-   boolean interp_depth = !fsInfo->writes_z;
+   boolean interp_depth = !fsInfo->writes_z || qs->softpipe->early_depth;
 
    boolean alpha = qs->softpipe->depth_stencil->alpha.enabled;
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 395bc70f2cf..8fb632d9dcf 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -80,7 +80,7 @@ shade_quad(struct quad_stage *qs, struct quad_header *quad)
 
    /* run shader */
    machine->flatshade_color = softpipe->rasterizer->flatshade ? TRUE : FALSE;
-   return softpipe->fs_variant->run( softpipe->fs_variant, machine, quad );
+   return softpipe->fs_variant->run( softpipe->fs_variant, machine, quad, softpipe->early_depth );
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.c b/src/gallium/drivers/softpipe/sp_quad_pipe.c
index 7131512daee..dbe4c0eb67e 100644
--- a/src/gallium/drivers/softpipe/sp_quad_pipe.c
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.c
@@ -43,15 +43,17 @@ void
 sp_build_quad_pipeline(struct softpipe_context *sp)
 {
    boolean early_depth_test =
-      sp->depth_stencil->depth.enabled &&
+      (sp->depth_stencil->depth.enabled &&
       sp->framebuffer.zsbuf &&
       !sp->depth_stencil->alpha.enabled &&
       !sp->fs_variant->info.uses_kill &&
       !sp->fs_variant->info.writes_z &&
-      !sp->fs_variant->info.writes_stencil;
+       !sp->fs_variant->info.writes_stencil) ||
+      sp->fs_variant->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL];
 
    sp->quad.first = sp->quad.blend;
 
+   sp->early_depth = early_depth_test;
    if (early_depth_test) {
       insert_stage_at_head( sp, sp->quad.shade );
       insert_stage_at_head( sp, sp->quad.depth_test );
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 16a2897f526..7a2d3715f8b 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -85,7 +85,8 @@ struct sp_fragment_shader_variant
 
    unsigned (*run)(const struct sp_fragment_shader_variant *shader,
 		   struct tgsi_exec_machine *machine,
-		   struct quad_header *quad);
+		   struct quad_header *quad,
+		   bool early_depth_test);
 
    /* Deletes this instance of the object */
    void (*delete)(struct sp_fragment_shader_variant *shader,

From 22d1296013825a4dce84e6f579581202a18767c7 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 22 Mar 2016 07:53:48 +1000
Subject: [PATCH 142/238] tgsi: add support for image operations to tgsi_exec.
 (v2.1)

This adds support for load/store/atomic operations on images
along with image tracking support.

v2: add RESQ support. (Ilia)
v2.1: constify interface (Brian)
split get_image_coord_dim (Brian)

Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/draw/draw_gs.c      |   2 +-
 src/gallium/auxiliary/draw/draw_vs_exec.c |   2 +-
 src/gallium/auxiliary/tgsi/tgsi_exec.c    | 273 +++++++++++++++++++++-
 src/gallium/auxiliary/tgsi/tgsi_exec.h    |  44 +++-
 src/gallium/drivers/softpipe/sp_fs_exec.c |   4 +-
 5 files changed, 319 insertions(+), 6 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index fcef31b4ff5..2f18df8f789 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -681,7 +681,7 @@ void draw_geometry_shader_prepare(struct draw_geometry_shader *shader,
    if (!use_llvm && shader && shader->machine->Tokens != shader->state.tokens) {
       tgsi_exec_machine_bind_shader(shader->machine,
                                     shader->state.tokens,
-                                    draw->gs.tgsi.sampler);
+                                    draw->gs.tgsi.sampler, NULL);
    }
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 3fd8ef3cd2f..c1266e7ffec 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -70,7 +70,7 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
    if (evs->machine->Tokens != shader->state.tokens) {
       tgsi_exec_machine_bind_shader(evs->machine,
                                     shader->state.tokens,
-                                    draw->vs.tgsi.sampler);
+                                    draw->vs.tgsi.sampler, NULL);
    }
 }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index fa1c9161bcc..53d5937b2df 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -853,7 +853,8 @@ void
 tgsi_exec_machine_bind_shader(
    struct tgsi_exec_machine *mach,
    const struct tgsi_token *tokens,
-   struct tgsi_sampler *sampler)
+   struct tgsi_sampler *sampler,
+   struct tgsi_image *image)
 {
    uint k;
    struct tgsi_parse_context parse;
@@ -871,6 +872,7 @@ tgsi_exec_machine_bind_shader(
 
    mach->Tokens = tokens;
    mach->Sampler = sampler;
+   mach->Image = image;
 
    if (!tokens) {
       /* unbind and free all */
@@ -3706,6 +3708,247 @@ exec_dfracexp(struct tgsi_exec_machine *mach,
    }
 }
 
+static int
+get_image_coord_dim(unsigned tgsi_tex)
+{
+   int dim;
+   switch (tgsi_tex) {
+   case TGSI_TEXTURE_BUFFER:
+   case TGSI_TEXTURE_1D:
+      dim = 1;
+      break;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_1D_ARRAY:
+   case TGSI_TEXTURE_2D_MSAA:
+      dim = 2;
+      break;
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_2D_ARRAY:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+   case TGSI_TEXTURE_CUBE_ARRAY:
+      dim = 3;
+      break;
+   default:
+      assert(!"unknown texture target");
+      dim = 0;
+      break;
+   }
+
+   return dim;
+}
+
+static int
+get_image_coord_sample(unsigned tgsi_tex)
+{
+   int sample = 0;
+   switch (tgsi_tex) {
+   case TGSI_TEXTURE_2D_MSAA:
+      sample = 3;
+      break;
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      sample = 4;
+      break;
+   default:
+      break;
+   }
+   return sample;
+}
+
+static void
+exec_load(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   union tgsi_exec_channel r[4], sample_r;
+   uint unit;
+   int sample;
+   int i, j;
+   int dim;
+   uint chan;
+   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   struct tgsi_image_params params;
+   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+
+   unit = fetch_sampler_unit(mach, inst, 0);
+   dim = get_image_coord_dim(inst->Memory.Texture);
+   sample = get_image_coord_sample(inst->Memory.Texture);
+   assert(dim <= 3);
+
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.tgsi_tex_instr = inst->Memory.Texture;
+   params.format = inst->Memory.Format;
+
+   for (i = 0; i < dim; i++) {
+      IFETCH(&r[i], 1, TGSI_CHAN_X + i);
+   }
+
+   if (sample)
+      IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
+
+   mach->Image->load(mach->Image, &params,
+                     r[0].i, r[1].i, r[2].i, sample_r.i,
+                     rgba);
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      r[0].f[j] = rgba[0][j];
+      r[1].f[j] = rgba[1][j];
+      r[2].f[j] = rgba[2][j];
+      r[3].f[j] = rgba[3][j];
+   }
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_store(struct tgsi_exec_machine *mach,
+           const struct tgsi_full_instruction *inst)
+{
+   union tgsi_exec_channel r[3], sample_r;
+   union tgsi_exec_channel value[4];
+   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   struct tgsi_image_params params;
+   int dim;
+   int sample;
+   int i, j;
+   uint unit;
+   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+   unit = inst->Dst[0].Register.Index;
+   dim = get_image_coord_dim(inst->Memory.Texture);
+   sample = get_image_coord_sample(inst->Memory.Texture);
+   assert(dim <= 3);
+
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.tgsi_tex_instr = inst->Memory.Texture;
+   params.format = inst->Memory.Format;
+
+   for (i = 0; i < dim; i++) {
+      IFETCH(&r[i], 0, TGSI_CHAN_X + i);
+   }
+
+   for (i = 0; i < 4; i++) {
+      FETCH(&value[i], 1, TGSI_CHAN_X + i);
+   }
+   if (sample)
+      IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      rgba[0][j] = value[0].f[j];
+      rgba[1][j] = value[1].f[j];
+      rgba[2][j] = value[2].f[j];
+      rgba[3][j] = value[3].f[j];
+   }
+
+   mach->Image->store(mach->Image, &params,
+                      r[0].i, r[1].i, r[2].i, sample_r.i,
+                      rgba);
+}
+
+static void
+exec_atomop(struct tgsi_exec_machine *mach,
+            const struct tgsi_full_instruction *inst)
+{
+   union tgsi_exec_channel r[4], sample_r;  /* r[0..2]: coords in, r[0..3]: result out */
+   union tgsi_exec_channel value[4], value2[4];
+   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   struct tgsi_image_params params;
+   int dim;
+   int sample;
+   int i, j;
+   uint unit, chan;
+   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+   unit = fetch_sampler_unit(mach, inst, 0);
+   dim = get_image_coord_dim(inst->Memory.Texture);
+   sample = get_image_coord_sample(inst->Memory.Texture);
+   assert(dim <= 3);
+
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.tgsi_tex_instr = inst->Memory.Texture;
+   params.format = inst->Memory.Format;
+
+   for (i = 0; i < dim; i++) {
+      IFETCH(&r[i], 1, TGSI_CHAN_X + i);
+   }
+
+   for (i = 0; i < 4; i++) {
+      FETCH(&value[i], 2, TGSI_CHAN_X + i);
+      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
+         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
+   }
+   if (sample)
+      IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      rgba[0][j] = value[0].f[j];
+      rgba[1][j] = value[1].f[j];
+      rgba[2][j] = value[2].f[j];
+      rgba[3][j] = value[3].f[j];
+   }
+   if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
+      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+         rgba2[0][j] = value2[0].f[j];
+         rgba2[1][j] = value2[1].f[j];
+         rgba2[2][j] = value2[2].f[j];
+         rgba2[3][j] = value2[3].f[j];
+      }
+   }
+
+   mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
+                   r[0].i, r[1].i, r[2].i, sample_r.i,
+                   rgba, rgba2);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      r[0].f[j] = rgba[0][j];
+      r[1].f[j] = rgba[1][j];
+      r[2].f[j] = rgba[2][j];
+      r[3].f[j] = rgba[3][j];
+   }
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_resq(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   int result[4];
+   union tgsi_exec_channel r[4];
+   uint unit;
+   int i, chan, j;
+   struct tgsi_image_params params;
+   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+
+   unit = fetch_sampler_unit(mach, inst, 0);
+
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.tgsi_tex_instr = inst->Memory.Texture;
+   params.format = inst->Memory.Format;
+
+   mach->Image->get_dims(mach->Image, &params, result);
+
+   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
+      for (j = 0; j < 4; j++) {
+         r[j].i[i] = result[j];
+      }
+   }
+
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
+                    TGSI_EXEC_DATA_INT);
+      }
+   }
+}
 
 static void
 micro_i2f(union tgsi_exec_channel *dst,
@@ -5172,6 +5415,34 @@ exec_instruction(
    case TGSI_OPCODE_D2U:
       exec_d2u(mach, inst);
       break;
+
+   case TGSI_OPCODE_LOAD:
+      exec_load(mach, inst);
+      break;
+
+   case TGSI_OPCODE_STORE:
+      exec_store(mach, inst);
+      break;
+
+   case TGSI_OPCODE_ATOMUADD:
+   case TGSI_OPCODE_ATOMXCHG:
+   case TGSI_OPCODE_ATOMCAS:
+   case TGSI_OPCODE_ATOMAND:
+   case TGSI_OPCODE_ATOMOR:
+   case TGSI_OPCODE_ATOMXOR:
+   case TGSI_OPCODE_ATOMUMIN:
+   case TGSI_OPCODE_ATOMUMAX:
+   case TGSI_OPCODE_ATOMIMIN:
+   case TGSI_OPCODE_ATOMIMAX:
+      exec_atomop(mach, inst);
+      break;
+
+   case TGSI_OPCODE_RESQ:
+      exec_resq(mach, inst);
+      break;
+   case TGSI_OPCODE_BARRIER:
+   case TGSI_OPCODE_MEMBAR:
+      break;
    default:
       assert( 0 );
    }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 6bcf45b4dae..2c81d5e827d 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -98,6 +98,46 @@ enum tgsi_sampler_control
    TGSI_SAMPLER_GATHER,
 };
 
+struct tgsi_image_params {
+   unsigned unit;
+   unsigned tgsi_tex_instr;
+   enum pipe_format format;
+   unsigned execmask;
+};
+
+struct tgsi_image {
+   /* image interfaces */
+   void (*load)(const struct tgsi_image *image,
+                const struct tgsi_image_params *params,
+                const int s[TGSI_QUAD_SIZE],
+                const int t[TGSI_QUAD_SIZE],
+                const int r[TGSI_QUAD_SIZE],
+                const int sample[TGSI_QUAD_SIZE],
+                float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+   void (*store)(const struct tgsi_image *image,
+                 const struct tgsi_image_params *params,
+                 const int s[TGSI_QUAD_SIZE],
+                 const int t[TGSI_QUAD_SIZE],
+                 const int r[TGSI_QUAD_SIZE],
+                 const int sample[TGSI_QUAD_SIZE],
+                 float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+   void (*op)(const struct tgsi_image *image,
+              const struct tgsi_image_params *params,
+              unsigned opcode,
+              const int s[TGSI_QUAD_SIZE],
+              const int t[TGSI_QUAD_SIZE],
+              const int r[TGSI_QUAD_SIZE],
+              const int sample[TGSI_QUAD_SIZE],
+              float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+              float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+   void (*get_dims)(const struct tgsi_image *image,
+                    const struct tgsi_image_params *params,
+                    int dims[4]);
+};
+
 /**
  * Information for sampling textures, which must be implemented
  * by code outside the TGSI executor.
@@ -293,6 +333,7 @@ struct tgsi_exec_machine
 
    struct tgsi_sampler           *Sampler;
 
+   struct tgsi_image             *Image;
    unsigned                      ImmLimit;
 
    const void *Consts[PIPE_MAX_CONSTANT_BUFFERS];
@@ -382,7 +423,8 @@ void
 tgsi_exec_machine_bind_shader(
    struct tgsi_exec_machine *mach,
    const struct tgsi_token *tokens,
-   struct tgsi_sampler *sampler);
+   struct tgsi_sampler *sampler,
+   struct tgsi_image *image);
 
 uint
 tgsi_exec_machine_run(
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index e2d527dab66..2c5bf7ef5c9 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -69,7 +69,7 @@ exec_prepare( const struct sp_fragment_shader_variant *var,
     */
    tgsi_exec_machine_bind_shader(machine,
                                  var->tokens,
-                                 sampler);
+                                 sampler, NULL);
 }
 
 
@@ -184,7 +184,7 @@ exec_delete(struct sp_fragment_shader_variant *var,
             struct tgsi_exec_machine *machine)
 {
    if (machine->Tokens == var->tokens) {
-      tgsi_exec_machine_bind_shader(machine, NULL, NULL);
+      tgsi_exec_machine_bind_shader(machine, NULL, NULL, NULL);
    }
 
    FREE( (void *) var->tokens );

From 0d1f679dedfb47944259e846d7f2eadbcf0907ca Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 22 Mar 2016 07:58:39 +1000
Subject: [PATCH 143/238] draw: add support for passing images to vs/gs
 shaders.

This just adds support for passing through images to the
tgsi execution stage.

Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/draw/draw_context.c | 18 ++++++++++++++++++
 src/gallium/auxiliary/draw/draw_context.h |  6 ++++++
 src/gallium/auxiliary/draw/draw_gs.c      |  2 +-
 src/gallium/auxiliary/draw/draw_private.h |  3 +++
 src/gallium/auxiliary/draw/draw_vs_exec.c |  2 +-
 5 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 16a261c14cf..2ba9b099664 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -731,6 +731,24 @@ draw_texture_sampler(struct draw_context *draw,
    }
 }
 
+/**
+ * Provide TGSI image objects for vertex/geometry shaders that use image
+ * load/store operations.  This state only needs to be set once per context.
+ * This might only be used by software drivers for the time being.
+ */
+void
+draw_image(struct draw_context *draw,
+           uint shader,
+           struct tgsi_image *image)
+{
+   if (shader == PIPE_SHADER_VERTEX) {
+      draw->vs.tgsi.image = image;
+   } else {
+      debug_assert(shader == PIPE_SHADER_GEOMETRY);
+      draw->gs.tgsi.image = image;
+   }
+}
+
 
 
 
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index a5a6df5b72e..5d9870b115c 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -48,6 +48,7 @@ struct draw_vertex_shader;
 struct draw_geometry_shader;
 struct draw_fragment_shader;
 struct tgsi_sampler;
+struct tgsi_image;
 
 /*
  * structure to contain driver internal information 
@@ -154,6 +155,11 @@ draw_texture_sampler(struct draw_context *draw,
                      uint shader_type,
                      struct tgsi_sampler *sampler);
 
+void
+draw_image(struct draw_context *draw,
+           uint shader_type,
+           struct tgsi_image *image);
+
 void
 draw_set_sampler_views(struct draw_context *draw,
                        unsigned shader_stage,
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index 2f18df8f789..14db2d6f39d 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -681,7 +681,7 @@ void draw_geometry_shader_prepare(struct draw_geometry_shader *shader,
    if (!use_llvm && shader && shader->machine->Tokens != shader->state.tokens) {
       tgsi_exec_machine_bind_shader(shader->machine,
                                     shader->state.tokens,
-                                    draw->gs.tgsi.sampler, NULL);
+                                    draw->gs.tgsi.sampler, draw->gs.tgsi.image);
    }
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 8774bebd5f9..211bd6f7e70 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -66,6 +66,7 @@ struct draw_stage;
 struct vbuf_render;
 struct tgsi_exec_machine;
 struct tgsi_sampler;
+struct tgsi_image;
 struct draw_pt_front_end;
 struct draw_assembler;
 struct draw_llvm;
@@ -267,6 +268,7 @@ struct draw_context
          struct tgsi_exec_machine *machine;
 
          struct tgsi_sampler *sampler;
+         struct tgsi_image *image;
       } tgsi;
 
       struct translate *fetch;
@@ -286,6 +288,7 @@ struct draw_context
          struct tgsi_exec_machine *machine;
 
          struct tgsi_sampler *sampler;
+         struct tgsi_image *image;
       } tgsi;
 
    } gs;
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index c1266e7ffec..5b53cff29f0 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -70,7 +70,7 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
    if (evs->machine->Tokens != shader->state.tokens) {
       tgsi_exec_machine_bind_shader(evs->machine,
                                     shader->state.tokens,
-                                    draw->vs.tgsi.sampler, NULL);
+                                    draw->vs.tgsi.sampler, draw->vs.tgsi.image);
    }
 }
 

From eb9ad9faa3975fc4f044b81d3b4b793866ef5563 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 22 Mar 2016 07:59:35 +1000
Subject: [PATCH 144/238] softpipe: add image support to softpipe (v3)

This adds support for ARB_shader_image_load_store to softpipe.

v2: add RESQ support (Ilia)
v3: constify, cleanup internals, add some comments (Brian).

Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/tgsi/tgsi_exec.h        |   4 +-
 src/gallium/drivers/softpipe/Makefile.sources |   2 +
 src/gallium/drivers/softpipe/sp_context.c     |  20 +-
 src/gallium/drivers/softpipe/sp_context.h     |   2 +
 src/gallium/drivers/softpipe/sp_flush.c       |  26 +
 src/gallium/drivers/softpipe/sp_flush.h       |   2 +
 src/gallium/drivers/softpipe/sp_fs_exec.c     |   6 +-
 src/gallium/drivers/softpipe/sp_image.c       | 762 ++++++++++++++++++
 src/gallium/drivers/softpipe/sp_image.h       |  37 +
 src/gallium/drivers/softpipe/sp_state.h       |   7 +-
 .../drivers/softpipe/sp_state_derived.c       |   3 +-
 src/gallium/drivers/softpipe/sp_state_image.c |  57 ++
 src/gallium/drivers/softpipe/sp_texture.c     |   8 +-
 src/gallium/drivers/softpipe/sp_texture.h     |   4 +-
 14 files changed, 928 insertions(+), 12 deletions(-)
 create mode 100644 src/gallium/drivers/softpipe/sp_image.c
 create mode 100644 src/gallium/drivers/softpipe/sp_image.h
 create mode 100644 src/gallium/drivers/softpipe/sp_state_image.c

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 2c81d5e827d..45fb8d43c88 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -497,8 +497,10 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
-   case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
       return 0;
+   case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+      return PIPE_MAX_SHADER_IMAGES;
+
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
    }
diff --git a/src/gallium/drivers/softpipe/Makefile.sources b/src/gallium/drivers/softpipe/Makefile.sources
index 2af3d6af21a..efe88468e3f 100644
--- a/src/gallium/drivers/softpipe/Makefile.sources
+++ b/src/gallium/drivers/softpipe/Makefile.sources
@@ -10,6 +10,7 @@ C_SOURCES := \
 	sp_flush.h \
 	sp_fs_exec.c \
 	sp_fs.h \
+	sp_image.c \
 	sp_limits.h \
 	sp_prim_vbuf.c \
 	sp_prim_vbuf.h \
@@ -31,6 +32,7 @@ C_SOURCES := \
 	sp_state_blend.c \
 	sp_state_clip.c \
 	sp_state_derived.c \
+	sp_state_image.c \
 	sp_state.h \
 	sp_state_rasterizer.c \
 	sp_state_sampler.c \
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index d2a32200e47..30b0276cfe0 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -50,7 +50,7 @@
 #include "sp_query.h"
 #include "sp_screen.h"
 #include "sp_tex_sample.h"
-
+#include "sp_image.h"
 
 static void
 softpipe_destroy( struct pipe_context *pipe )
@@ -199,6 +199,10 @@ softpipe_create_context(struct pipe_screen *screen,
       softpipe->tgsi.sampler[i] = sp_create_tgsi_sampler();
    }
 
+   for (i = 0; i < PIPE_SHADER_TYPES; i++) {
+      softpipe->tgsi.image[i] = sp_create_tgsi_image();
+   }
+
    softpipe->dump_fs = debug_get_bool_option( "SOFTPIPE_DUMP_FS", FALSE );
    softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE );
 
@@ -216,6 +220,7 @@ softpipe_create_context(struct pipe_screen *screen,
    softpipe_init_streamout_funcs(&softpipe->pipe);
    softpipe_init_texture_funcs( &softpipe->pipe );
    softpipe_init_vertex_funcs(&softpipe->pipe);
+   softpipe_init_image_funcs(&softpipe->pipe);
 
    softpipe->pipe.set_framebuffer_state = softpipe_set_framebuffer_state;
 
@@ -223,7 +228,8 @@ softpipe_create_context(struct pipe_screen *screen,
 
    softpipe->pipe.clear = softpipe_clear;
    softpipe->pipe.flush = softpipe_flush_wrapped;
-
+   softpipe->pipe.texture_barrier = softpipe_texture_barrier;
+   softpipe->pipe.memory_barrier = softpipe_memory_barrier;
    softpipe->pipe.render_condition = softpipe_render_condition;
    
    /*
@@ -272,6 +278,16 @@ softpipe_create_context(struct pipe_screen *screen,
                         (struct tgsi_sampler *)
                            softpipe->tgsi.sampler[PIPE_SHADER_GEOMETRY]);
 
+   draw_image(softpipe->draw,
+              PIPE_SHADER_VERTEX,
+              (struct tgsi_image *)
+              softpipe->tgsi.image[PIPE_SHADER_VERTEX]);
+
+   draw_image(softpipe->draw,
+              PIPE_SHADER_GEOMETRY,
+              (struct tgsi_image *)
+              softpipe->tgsi.image[PIPE_SHADER_GEOMETRY]);
+
    if (debug_get_bool_option( "SOFTPIPE_NO_RAST", FALSE ))
       softpipe->no_rast = TRUE;
 
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index d18bbe693f3..20a12353b38 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -83,6 +83,7 @@ struct softpipe_context {
    struct pipe_scissor_state scissors[PIPE_MAX_VIEWPORTS];
    struct pipe_sampler_view *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS];
 
+   struct pipe_image_view images[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES];
    struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS];
    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
    struct pipe_index_buffer index_buffer;
@@ -172,6 +173,7 @@ struct softpipe_context {
    /** TGSI exec things */
    struct {
       struct sp_tgsi_sampler *sampler[PIPE_SHADER_TYPES];
+      struct sp_tgsi_image *image[PIPE_SHADER_TYPES];
    } tgsi;
 
    struct tgsi_exec_machine *fs_machine;
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index 5a29e26517d..59b8ad696ec 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -168,3 +168,29 @@ softpipe_flush_resource(struct pipe_context *pipe,
 
    return TRUE;
 }
+
+void softpipe_texture_barrier(struct pipe_context *pipe)
+{
+   struct softpipe_context *sp = softpipe_context(pipe);
+   uint shader, unit, cb;
+
+   /* Flush every texture tile cache counted by num_sampler_views. */
+   for (shader = 0; shader < Elements(sp->tex_cache); shader++)
+      for (unit = 0; unit < sp->num_sampler_views[shader]; unit++)
+         sp_flush_tex_tile_cache(sp->tex_cache[shader][unit]);
+
+   /* Flush the color-buffer tile caches that exist, then the zsbuf cache. */
+   for (cb = 0; cb < sp->framebuffer.nr_cbufs; cb++) {
+      if (sp->cbuf_cache[cb])
+         sp_flush_tile_cache(sp->cbuf_cache[cb]);
+   }
+   if (sp->zsbuf_cache)
+      sp_flush_tile_cache(sp->zsbuf_cache);
+
+   sp->dirty_render_cache = FALSE;
+}
+
+void softpipe_memory_barrier(struct pipe_context *pipe, unsigned flags)
+{
+   softpipe_texture_barrier(pipe);   /* 'flags' is not consulted here */
+}
diff --git a/src/gallium/drivers/softpipe/sp_flush.h b/src/gallium/drivers/softpipe/sp_flush.h
index ab5f77be264..0674b4a7e48 100644
--- a/src/gallium/drivers/softpipe/sp_flush.h
+++ b/src/gallium/drivers/softpipe/sp_flush.h
@@ -55,4 +55,6 @@ softpipe_flush_resource(struct pipe_context *pipe,
                         boolean cpu_access,
                         boolean do_not_block);
 
+void softpipe_texture_barrier(struct pipe_context *pipe);
+void softpipe_memory_barrier(struct pipe_context *pipe, unsigned flags);
 #endif
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 2c5bf7ef5c9..bfd9a4b7496 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -62,14 +62,15 @@ sp_exec_fragment_shader(const struct sp_fragment_shader_variant *var)
 static void
 exec_prepare( const struct sp_fragment_shader_variant *var,
               struct tgsi_exec_machine *machine,
-              struct tgsi_sampler *sampler )
+              struct tgsi_sampler *sampler,
+              struct tgsi_image *image )
 {
    /*
     * Bind tokens/shader to the interpreter's machine state.
     */
    tgsi_exec_machine_bind_shader(machine,
                                  var->tokens,
-                                 sampler, NULL);
+                                 sampler, image);
 }
 
 
@@ -127,6 +128,7 @@ exec_run( const struct sp_fragment_shader_variant *var,
    /* convert 0 to 1.0 and 1 to -1.0 */
    machine->Face = (float) (quad->input.facing * -2 + 1);
 
+   machine->NonHelperMask = quad->inout.mask;
    quad->inout.mask &= tgsi_exec_machine_run( machine );
    if (quad->inout.mask == 0)
       return FALSE;
diff --git a/src/gallium/drivers/softpipe/sp_image.c b/src/gallium/drivers/softpipe/sp_image.c
new file mode 100644
index 00000000000..3488fa83185
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_image.c
@@ -0,0 +1,762 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sp_context.h"
+#include "sp_image.h"
+#include "sp_texture.h"
+
+#include "util/u_format.h"
+
+/*
+ * Byte offset of the view's data within the resource: the first element of
+ * a buffer view, otherwise the start of the selected level and layer/slice.
+ */
+static uint32_t
+get_image_offset(const struct softpipe_resource *spr,
+                 const struct pipe_image_view *iview,
+                 enum pipe_format format, unsigned r_coord)
+{
+   int layer = 0;
+   switch (spr->base.target) {
+   case PIPE_BUFFER:
+      return iview->u.buf.first_element * util_format_get_blocksize(format);
+   case PIPE_TEXTURE_1D_ARRAY: case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_CUBE_ARRAY: case PIPE_TEXTURE_CUBE: case PIPE_TEXTURE_3D:
+      layer = r_coord + iview->u.tex.first_layer;
+      break;
+   default:
+      break;
+   }
+   return softpipe_get_tex_image_offset(spr, iview->u.tex.level, layer);
+}
+
+/* True for TGSI texture targets that carry a layer or depth coordinate. */
+static inline bool
+has_layer_or_depth(unsigned tgsi_tex_instr)
+{
+   switch (tgsi_tex_instr) {
+   case TGSI_TEXTURE_3D: case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_1D_ARRAY: case TGSI_TEXTURE_2D_ARRAY:
+   case TGSI_TEXTURE_CUBE_ARRAY: case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/* True when the target is addressed with one coordinate (t is forced to 0). */
+static inline bool
+has_1coord(unsigned tgsi_tex_instr)
+{
+   if (tgsi_tex_instr == TGSI_TEXTURE_BUFFER)
+      return true;
+   if (tgsi_tex_instr == TGSI_TEXTURE_1D)
+      return true;
+   return tgsi_tex_instr == TGSI_TEXTURE_1D_ARRAY;
+}
+
+/*
+ * True when texel (s, t, r) lies inside a width x height x depth image;
+ * each coordinate must fall in the half-open range [0, extent).
+ */
+static inline bool
+bounds_check(int width, int height, int depth,
+             int s, int t, int r)
+{
+   /* test each axis independently */
+   const bool s_ok = (s >= 0 && s < width);
+   const bool t_ok = (t >= 0 && t < height);
+   const bool r_ok = (r >= 0 && r < depth);
+
+   return s_ok && t_ok && r_ok;
+}
+
+/*
+ * Checks whether the TGSI texture target named by the shader instruction
+ * is compatible with the pipe target of the image resource.  Any pipe
+ * target not listed below is treated as incompatible.
+ */
+static inline bool
+has_compat_target(unsigned pipe_target, unsigned tgsi_target)
+{
+   switch (pipe_target) {
+   /* Non-layered resources only match the identical TGSI target. */
+   case PIPE_TEXTURE_1D:
+      return tgsi_target == TGSI_TEXTURE_1D;
+
+   case PIPE_TEXTURE_2D:
+      return tgsi_target == TGSI_TEXTURE_2D;
+
+   case PIPE_TEXTURE_RECT:
+      return tgsi_target == TGSI_TEXTURE_RECT;
+
+   case PIPE_BUFFER:
+      return tgsi_target == TGSI_TEXTURE_BUFFER;
+
+   /*
+    * 3D, cube and array resources also accept a plain (non-layered) target.
+    */
+   case PIPE_TEXTURE_3D:
+      return tgsi_target == TGSI_TEXTURE_3D ||
+             tgsi_target == TGSI_TEXTURE_2D;
+
+   case PIPE_TEXTURE_CUBE:
+      return tgsi_target == TGSI_TEXTURE_CUBE ||
+             tgsi_target == TGSI_TEXTURE_2D;
+
+   case PIPE_TEXTURE_1D_ARRAY:
+      return tgsi_target == TGSI_TEXTURE_1D ||
+             tgsi_target == TGSI_TEXTURE_1D_ARRAY;
+
+   case PIPE_TEXTURE_2D_ARRAY:
+      return tgsi_target == TGSI_TEXTURE_2D ||
+             tgsi_target == TGSI_TEXTURE_2D_ARRAY;
+
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      return tgsi_target == TGSI_TEXTURE_CUBE ||
+             tgsi_target == TGSI_TEXTURE_CUBE_ARRAY ||
+             tgsi_target == TGSI_TEXTURE_2D;
+
+   default:
+      /* unlisted pipe target */
+      return false;
+   }
+}
+
+/*
+ * Compute the view's width/height/depth in texels (elements for buffers).
+ * Returns false when the view format does not fit the underlying resource.
+ */
+static bool
+get_dimensions(const struct pipe_image_view *iview,
+               const struct softpipe_resource *spr,
+               unsigned tgsi_tex_instr,
+               enum pipe_format pformat,
+               unsigned *width,
+               unsigned *height,
+               unsigned *depth)
+{
+   if (tgsi_tex_instr == TGSI_TEXTURE_BUFFER) {
+      *width = iview->u.buf.last_element - iview->u.buf.first_element + 1;
+      *height = 1;
+      *depth = 1;
+      /* The view must not span more bytes than the underlying buffer. */
+      if (util_format_get_stride(pformat, *width) >
+          util_format_get_stride(spr->base.format, spr->base.width0))
+         return false;
+   } else {
+      unsigned level = spr->base.target == PIPE_BUFFER ? 0 : iview->u.tex.level;
+      *width = u_minify(spr->base.width0, level);
+      *height = u_minify(spr->base.height0, level);
+
+      /* spr->base.target is a pipe target, so test the PIPE_ enum here. */
+      if (spr->base.target == PIPE_TEXTURE_3D)
+         *depth = u_minify(spr->base.depth0, level);
+      else
+         *depth = spr->base.array_size;
+
+      /* Make sure the resource and view have compatible formats */
+      if (util_format_get_blocksize(pformat) >
+          util_format_get_blocksize(spr->base.format))
+         return false;
+   }
+   return true;
+}
+
+/* Pick lane 'index' of the quad; a 1D array takes its layer from t, not r. */
+static void
+fill_coords(const struct tgsi_image_params *params, unsigned index,
+            const int s[TGSI_QUAD_SIZE], const int t[TGSI_QUAD_SIZE],
+            const int r[TGSI_QUAD_SIZE],
+            int *s_coord, int *t_coord, int *r_coord)
+{
+   const unsigned target = params->tgsi_tex_instr;
+   *s_coord = s[index];
+   *t_coord = has_1coord(target) ? 0 : t[index];
+   *r_coord = !has_layer_or_depth(target) ? 0 :
+      ((target == TGSI_TEXTURE_1D_ARRAY) ? t[index] : r[index]);
+}
+/*
+ * Implement the image LOAD operation: read one texel per quad lane into rgba.
+ */
+static void
+sp_tgsi_load(const struct tgsi_image *image,
+             const struct tgsi_image_params *params,
+             const int s[TGSI_QUAD_SIZE],
+             const int t[TGSI_QUAD_SIZE],
+             const int r[TGSI_QUAD_SIZE],
+             const int sample[TGSI_QUAD_SIZE],
+             float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   unsigned width, height, depth;
+   unsigned stride;
+   int c, j;
+   char *data_ptr;
+   unsigned offset = 0;
+
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES) /* sp_iview[] has MAX entries */
+      goto fail_write_all_zero;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      goto fail_write_all_zero;
+
+   if (!has_compat_target(spr->base.target, params->tgsi_tex_instr))
+      goto fail_write_all_zero;
+
+   if (!get_dimensions(iview, spr, params->tgsi_tex_instr,
+                       params->format, &width, &height, &depth))
+      goto fail_write_all_zero; /* never leave rgba unwritten */
+
+   stride = util_format_get_stride(params->format, width);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord, t_coord, r_coord;
+      bool fill_zero = false;
+
+      if (!(params->execmask & (1 << j)))
+         fill_zero = true;
+
+      fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord);
+      if (!bounds_check(width, height, depth,
+                        s_coord, t_coord, r_coord))
+         fill_zero = true;
+
+      if (fill_zero) {
+         int nc = util_format_get_nr_components(params->format);
+         int ival = util_format_is_pure_integer(params->format);
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = 0;
+            if (c == 3 && nc < 4) { /* fewer than 4 components: channel 3 = 1 */
+               if (ival)
+                  ((int32_t *)rgba[c])[j] = 1;
+               else
+                  rgba[c][j] = 1.0;
+            }
+         }
+         continue;
+      }
+      offset = get_image_offset(spr, iview, params->format, r_coord);
+      data_ptr = (char *)spr->data + offset;
+
+      if (util_format_is_pure_sint(params->format)) {
+         int32_t sdata[4];
+
+         util_format_read_4i(params->format,
+                             sdata, 0,
+                             data_ptr, stride,
+                             s_coord, t_coord, 1, 1);
+         for (c = 0; c < 4; c++)
+            ((int32_t *)rgba[c])[j] = sdata[c];
+      } else if (util_format_is_pure_uint(params->format)) {
+         uint32_t sdata[4];
+         util_format_read_4ui(params->format,
+                             sdata, 0,
+                             data_ptr, stride,
+                             s_coord, t_coord, 1, 1);
+         for (c = 0; c < 4; c++)
+            ((uint32_t *)rgba[c])[j] = sdata[c];
+      } else {
+         float sdata[4];
+         util_format_read_4f(params->format,
+                             sdata, 0,
+                             data_ptr, stride,
+                             s_coord, t_coord, 1, 1);
+         for (c = 0; c < 4; c++)
+            rgba[c][j] = sdata[c];
+      }
+   }
+   return;
+fail_write_all_zero:
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      for (c = 0; c < 4; c++)
+         rgba[c][j] = 0;
+   }
+   return;
+}
+
+/*
+ * Implement the image STORE operation: write rgba for each executing lane.
+ */
+static void
+sp_tgsi_store(const struct tgsi_image *image,
+              const struct tgsi_image_params *params,
+              const int s[TGSI_QUAD_SIZE],
+              const int t[TGSI_QUAD_SIZE],
+              const int r[TGSI_QUAD_SIZE],
+              const int sample[TGSI_QUAD_SIZE],
+              float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   unsigned width, height, depth;
+   unsigned stride;
+   char *data_ptr;
+   int j, c;
+   unsigned offset = 0;
+   unsigned pformat = params->format;
+
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES) /* sp_iview[] has MAX entries */
+      return;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      return;
+   if (!has_compat_target(spr->base.target, params->tgsi_tex_instr))
+      return;
+
+   if (params->format == PIPE_FORMAT_NONE) /* fall back to the resource format */
+      pformat = spr->base.format;
+
+   if (!get_dimensions(iview, spr, params->tgsi_tex_instr,
+                       pformat, &width, &height, &depth))
+      return;
+
+   stride = util_format_get_stride(pformat, width);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord, t_coord, r_coord;
+
+      if (!(params->execmask & (1 << j)))
+         continue;
+
+      fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord);
+      if (!bounds_check(width, height, depth,
+                        s_coord, t_coord, r_coord))
+         continue;
+
+      offset = get_image_offset(spr, iview, pformat, r_coord);
+      data_ptr = (char *)spr->data + offset;
+
+      if (util_format_is_pure_sint(pformat)) {
+         int32_t sdata[4];
+         for (c = 0; c < 4; c++)
+            sdata[c] = ((int32_t *)rgba[c])[j];
+         util_format_write_4i(pformat, sdata, 0, data_ptr, stride,
+                              s_coord, t_coord, 1, 1);
+      } else if (util_format_is_pure_uint(pformat)) {
+         uint32_t sdata[4];
+         for (c = 0; c < 4; c++)
+            sdata[c] = ((uint32_t *)rgba[c])[j];
+         util_format_write_4ui(pformat, sdata, 0, data_ptr, stride,
+                               s_coord, t_coord, 1, 1);
+      } else {
+         float sdata[4];
+         for (c = 0; c < 4; c++)
+            sdata[c] = rgba[c][j];
+         util_format_write_4f(pformat, sdata, 0, data_ptr, stride,
+                              s_coord, t_coord, 1, 1);
+      }
+   }
+}
+
+/*
+ * Implement atomic operations on unsigned integers; rgba gets the old texel.
+ */
+static void
+handle_op_uint(const struct pipe_image_view *iview,
+               const struct tgsi_image_params *params,
+               bool just_read,
+               char *data_ptr,
+               uint qi,
+               unsigned stride,
+               unsigned opcode,
+               int s,
+               int t,
+               float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+               float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   uint c;
+   int nc = util_format_get_nr_components(params->format);
+   unsigned sdata[4];
+
+   util_format_read_4ui(params->format,
+                        sdata, 0,
+                        data_ptr, stride,
+                        s, t, 1, 1);
+
+   if (just_read) { /* read-only request: report the texel, skip write-back */
+      for (c = 0; c < nc; c++) {
+         ((uint32_t *)rgba[c])[qi] = sdata[c];
+      }
+      return;
+   }
+   switch (opcode) {
+   case TGSI_OPCODE_ATOMUADD:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] += ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXCHG:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] = ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMCAS:
+      for (c = 0; c < nc; c++) {
+         unsigned dst_x = sdata[c];
+         unsigned cmp_x = ((uint32_t *)rgba[c])[qi]; /* value compared against */
+         unsigned src_x = ((uint32_t *)rgba2[c])[qi]; /* value stored on match */
+         unsigned temp = sdata[c];
+         sdata[c] = (dst_x == cmp_x) ? src_x : dst_x;
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMAND:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] &= ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMOR:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] |= ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXOR:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] ^= ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMIN:
+      for (c = 0; c < nc; c++) {
+         unsigned dst_x = sdata[c];
+         unsigned src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMAX:
+      for (c = 0; c < nc; c++) {
+         unsigned dst_x = sdata[c];
+         unsigned src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMIN:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c]; /* int locals make this a signed comparison */
+         int src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMAX:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c]; /* int locals make this a signed comparison */
+         int src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   default:
+      assert(!"Unexpected TGSI opcode in sp_tgsi_op");
+      break;
+   }
+   util_format_write_4ui(params->format, sdata, 0, data_ptr, stride,
+                         s, t, 1, 1);
+}
+
+/*
+ * Implement atomic operations on signed integers; rgba gets the old texel.
+ */
+static void
+handle_op_int(const struct pipe_image_view *iview,
+              const struct tgsi_image_params *params,
+              bool just_read,
+              char *data_ptr,
+              uint qi,
+              unsigned stride,
+              unsigned opcode,
+              int s,
+              int t,
+              float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+              float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   uint c;
+   int nc = util_format_get_nr_components(params->format);
+   int sdata[4];
+   util_format_read_4i(params->format,
+                       sdata, 0,
+                       data_ptr, stride,
+                       s, t, 1, 1);
+
+   if (just_read) { /* read-only request: report the texel, skip write-back */
+      for (c = 0; c < nc; c++) {
+         ((int32_t *)rgba[c])[qi] = sdata[c];
+      }
+      return;
+   }
+   switch (opcode) {
+   case TGSI_OPCODE_ATOMUADD:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] += ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXCHG:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] = ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMCAS:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int cmp_x = ((int32_t *)rgba[c])[qi]; /* value compared against */
+         int src_x = ((int32_t *)rgba2[c])[qi]; /* value stored on match */
+         int temp = sdata[c];
+         sdata[c] = (dst_x == cmp_x) ? src_x : dst_x;
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMAND:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] &= ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMOR:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] |= ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXOR:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] ^= ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMIN:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((int32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x); /* NOTE(review): signed compare for UMIN - confirm */
+         ((int32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMAX:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((int32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x); /* NOTE(review): signed compare for UMAX - confirm */
+         ((int32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMIN:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((int32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x);
+         ((int32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMAX:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((int32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x);
+         ((int32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   default:
+      assert(!"Unexpected TGSI opcode in sp_tgsi_op");
+      break;
+   }
+   util_format_write_4i(params->format, sdata, 0, data_ptr, stride,
+                        s, t, 1, 1);
+}
+
+/*
+ * Implement atomic image operations (rgba is both operand and result).
+ */
+static void
+sp_tgsi_op(const struct tgsi_image *image,
+           const struct tgsi_image_params *params,
+           unsigned opcode,
+           const int s[TGSI_QUAD_SIZE],
+           const int t[TGSI_QUAD_SIZE],
+           const int r[TGSI_QUAD_SIZE],
+           const int sample[TGSI_QUAD_SIZE],
+           float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+           float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   unsigned width, height, depth;
+   unsigned stride;
+   int j, c;
+   unsigned offset;
+   char *data_ptr;
+
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES) /* sp_iview[] has MAX entries */
+      goto fail_write_all_zero;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      goto fail_write_all_zero;
+   if (!has_compat_target(spr->base.target, params->tgsi_tex_instr))
+      goto fail_write_all_zero;
+
+   if (!get_dimensions(iview, spr, params->tgsi_tex_instr,
+                       params->format, &width, &height, &depth))
+      goto fail_write_all_zero;
+
+   stride = util_format_get_stride(spr->base.format, width);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord, t_coord, r_coord;
+      bool just_read = false;
+
+      fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord);
+      if (!bounds_check(width, height, depth,
+                        s_coord, t_coord, r_coord)) {
+         int nc = util_format_get_nr_components(params->format);
+         int ival = util_format_is_pure_integer(params->format);
+         /* result is 0, with channel 3 forced to 1 for <4 components */
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = 0;
+            if (c == 3 && nc < 4) {
+               if (ival)
+                  ((int32_t *)rgba[c])[j] = 1;
+               else
+                  rgba[c][j] = 1.0;
+            }
+         }
+         continue;
+      }
+
+      /* just readback the value for atomic if execmask isn't set */
+      if (!(params->execmask & (1 << j))) {
+         just_read = true;
+      }
+
+      offset = get_image_offset(spr, iview, params->format, r_coord);
+      data_ptr = (char *)spr->data + offset;
+
+      /* we should see atomic operations on r32 formats */
+      if (util_format_is_pure_uint(params->format))
+         handle_op_uint(iview, params, just_read, data_ptr, j, stride,
+                        opcode, s_coord, t_coord, rgba, rgba2);
+      else if (util_format_is_pure_sint(params->format))
+         handle_op_int(iview, params, just_read, data_ptr, j, stride,
+                       opcode, s_coord, t_coord, rgba, rgba2);
+      else
+         assert(0);
+   }
+   return;
+fail_write_all_zero:
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      for (c = 0; c < 4; c++)
+         rgba[c][j] = 0;
+   }
+   return;
+}
+
+/* Query the image size: dims[] is filled per target, untouched on failure. */
+static void
+sp_tgsi_get_dims(const struct tgsi_image *image,
+                 const struct tgsi_image_params *params,
+                 int dims[4])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   int level;
+
+   if (params->unit >= PIPE_MAX_SHADER_IMAGES) /* sp_iview[] has MAX entries */
+      return;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      return;
+   if (params->tgsi_tex_instr == TGSI_TEXTURE_BUFFER) {
+      dims[0] = iview->u.buf.last_element - iview->u.buf.first_element + 1;
+      dims[1] = dims[2] = dims[3] = 0;
+      return;
+   }
+
+   level = iview->u.tex.level;
+   dims[0] = u_minify(spr->base.width0, level);
+   switch (params->tgsi_tex_instr) {
+   case TGSI_TEXTURE_1D_ARRAY:
+      dims[1] = iview->u.tex.last_layer - iview->u.tex.first_layer + 1;
+      /* fallthrough */
+   case TGSI_TEXTURE_1D:
+      return;
+   case TGSI_TEXTURE_2D_ARRAY:
+      dims[2] = iview->u.tex.last_layer - iview->u.tex.first_layer + 1;
+      /* fallthrough */
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_RECT:
+      dims[1] = u_minify(spr->base.height0, level);
+      return;
+   case TGSI_TEXTURE_3D:
+      dims[1] = u_minify(spr->base.height0, level);
+      dims[2] = u_minify(spr->base.depth0, level);
+      return;
+   case TGSI_TEXTURE_CUBE_ARRAY:
+      dims[1] = u_minify(spr->base.height0, level);
+      dims[2] = (iview->u.tex.last_layer - iview->u.tex.first_layer + 1) / 6;
+      break;
+   default:
+      assert(!"unexpected texture target in sp_get_dims()");
+      return;
+   }
+}
+
+struct sp_tgsi_image *
+sp_create_tgsi_image(void)
+{
+   struct sp_tgsi_image *img = CALLOC_STRUCT(sp_tgsi_image);
+   if (!img)
+      return NULL;
+   /* Install the image callbacks implemented in this file. */
+   img->base.load = sp_tgsi_load;
+   img->base.store = sp_tgsi_store;
+   img->base.op = sp_tgsi_op;
+   img->base.get_dims = sp_tgsi_get_dims;
+   return img;
+}
diff --git a/src/gallium/drivers/softpipe/sp_image.h b/src/gallium/drivers/softpipe/sp_image.h
new file mode 100644
index 00000000000..3c73f838efe
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_image.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SP_IMAGE_H
+#define SP_IMAGE_H
+#include "tgsi/tgsi_exec.h"
+
+struct sp_tgsi_image
+{
+   struct tgsi_image base;
+   struct pipe_image_view sp_iview[PIPE_MAX_SHADER_IMAGES];
+};
+
+struct sp_tgsi_image *
+sp_create_tgsi_image(void);
+
+#endif
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 7a2d3715f8b..570bc549cc4 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -56,6 +56,7 @@
 
 
 struct tgsi_sampler;
+struct tgsi_image;
 struct tgsi_exec_machine;
 struct vertex_info;
 
@@ -81,7 +82,8 @@ struct sp_fragment_shader_variant
 
    void (*prepare)(const struct sp_fragment_shader_variant *shader,
 		   struct tgsi_exec_machine *machine,
-		   struct tgsi_sampler *sampler);
+		   struct tgsi_sampler *sampler,
+		   struct tgsi_image *image);
 
    unsigned (*run)(const struct sp_fragment_shader_variant *shader,
 		   struct tgsi_exec_machine *machine,
@@ -149,6 +151,9 @@ softpipe_init_streamout_funcs(struct pipe_context *pipe);
 void
 softpipe_init_vertex_funcs(struct pipe_context *pipe);
 
+void
+softpipe_init_image_funcs(struct pipe_context *pipe);
+
 void
 softpipe_set_framebuffer_state(struct pipe_context *,
                                const struct pipe_framebuffer_state *);
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index d4d03f1be50..65679e73515 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -343,7 +343,8 @@ update_fragment_shader(struct softpipe_context *softpipe, unsigned prim)
       softpipe->fs_variant->prepare(softpipe->fs_variant, 
                                     softpipe->fs_machine,
                                     (struct tgsi_sampler *) softpipe->
-                                    tgsi.sampler[PIPE_SHADER_FRAGMENT]);
+                                    tgsi.sampler[PIPE_SHADER_FRAGMENT],
+                                    (struct tgsi_image *)softpipe->tgsi.image[PIPE_SHADER_FRAGMENT]);
    }
    else {
       softpipe->fs_variant = NULL;
diff --git a/src/gallium/drivers/softpipe/sp_state_image.c b/src/gallium/drivers/softpipe/sp_state_image.c
new file mode 100644
index 00000000000..8909fa26864
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_state_image.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sp_context.h"
+#include "sp_state.h"
+#include "sp_image.h"
+
+static void softpipe_set_shader_images(struct pipe_context *pipe,
+                                       unsigned shader,
+                                       unsigned start,
+                                       unsigned num,
+                                       struct pipe_image_view *images)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   unsigned i;
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(start + num <= Elements(softpipe->tgsi.image[shader]->sp_iview));
+
+   /* bind images[i] into slot start + i, or clear those slots when images is NULL */
+   for (i = 0; i < num; i++) {
+      int idx = start + i;
+      /* re-point the refcounted resource before the whole view is overwritten */
+      if (images) {
+         pipe_resource_reference(&softpipe->tgsi.image[shader]->sp_iview[idx].resource, images[i].resource);
+         softpipe->tgsi.image[shader]->sp_iview[idx] = images[i];
+      }
+      else {
+         pipe_resource_reference(&softpipe->tgsi.image[shader]->sp_iview[idx].resource, NULL);
+         memset(&softpipe->tgsi.image[shader]->sp_iview[idx], 0, sizeof(struct pipe_image_view));
+      }
+   }
+}
+
+void softpipe_init_image_funcs(struct pipe_context *pipe)
+{
+   pipe->set_shader_images = softpipe_set_shader_images;
+}
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index 52ec373f8f2..64666fee03f 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -270,9 +270,9 @@ softpipe_resource_get_handle(struct pipe_screen *screen,
  * Helper function to compute offset (in bytes) for a particular
  * texture level/face/slice from the start of the buffer.
  */
-static unsigned
-sp_get_tex_image_offset(const struct softpipe_resource *spr,
-                        unsigned level, unsigned layer)
+unsigned
+softpipe_get_tex_image_offset(const struct softpipe_resource *spr,
+                              unsigned level, unsigned layer)
 {
    unsigned offset = spr->level_offset[level];
 
@@ -422,7 +422,7 @@ softpipe_transfer_map(struct pipe_context *pipe,
    pt->stride = spr->stride[level];
    pt->layer_stride = spr->img_stride[level];
 
-   spt->offset = sp_get_tex_image_offset(spr, level, box->z);
+   spt->offset = softpipe_get_tex_image_offset(spr, level, box->z);
 
    spt->offset +=
          box->y / util_format_get_blockheight(format) * spt->base.stride +
diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h
index fbf741a9c72..450c4b1cefc 100644
--- a/src/gallium/drivers/softpipe/sp_texture.h
+++ b/src/gallium/drivers/softpipe/sp_texture.h
@@ -116,5 +116,7 @@ softpipe_init_screen_texture_funcs(struct pipe_screen *screen);
 extern void
 softpipe_init_texture_funcs(struct pipe_context *pipe);
 
-
+unsigned
+softpipe_get_tex_image_offset(const struct softpipe_resource *spr,
+                              unsigned level, unsigned layer);
 #endif /* SP_TEXTURE */

From c9367c13ca420528cd0d9a9e8dacd2fd5d6a0a41 Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Tue, 22 Mar 2016 08:00:52 +1000
Subject: [PATCH 145/238] docs: update softpipe status for
 shader_image_load_store.

Reviewed-by: Brian Paul <brianp@vmware.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt              | 2 +-
 docs/relnotes/11.3.0.html | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 03ebf70fb0f..804a96c4a2b 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -154,7 +154,7 @@ GL 4.2, GLSL 4.20:
   GL_ARB_texture_storage                                DONE (all drivers)
   GL_ARB_transform_feedback_instanced                   DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_base_instance                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_shader_image_load_store                        DONE (i965, radeonsi)
+  GL_ARB_shader_image_load_store                        DONE (i965, radeonsi, softpipe)
   GL_ARB_conservative_depth                             DONE (all drivers that support GLSL 1.30)
   GL_ARB_shading_language_420pack                       DONE (all drivers that support GLSL 1.30)
   GL_ARB_shading_language_packing                       DONE (all drivers)
diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
index 508fbd34901..4e23959e314 100644
--- a/docs/relnotes/11.3.0.html
+++ b/docs/relnotes/11.3.0.html
@@ -46,7 +46,7 @@ Note: some of the new features are only available with certain drivers.
 <ul>
 <li>GL_ARB_internalformat_query2 on all drivers</li>
 <li>GL_ARB_shader_atomic_counter_ops on nvc0</li>
-<li>GL_ARB_shader_image_load_store on radeonsi</li>
+<li>GL_ARB_shader_image_load_store on radeonsi, softpipe</li>
 <li>GL_ARB_shader_image_size on radeonsi</li>
 <li>GL_ATI_fragment_shader on all Gallium drivers</li>
 <li>GL_OES_texture_border_clamp and GL_EXT_texture_border_clamp on all drivers that support GL_ARB_texture_border_clamp</li>

From 7ebc3deaad77d11aa7086720ba4c3469a8878de3 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 9 Mar 2016 16:58:29 +1100
Subject: [PATCH 146/238] glsl: Fix segfault when lhs is error_type in TCS

It seems expected that both lhs and rhs could be of type error_type
in this code however the TCS case wasn't expecting it.

Fixes segfault in an enhanced layouts GL CTS test.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/ast_to_hir.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index fcc542ab5e0..0c686b27229 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -819,7 +819,7 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
     * if the expression indicating the vertex number is not the identifier
     * `gl_InvocationID`.
     */
-   if (state->stage == MESA_SHADER_TESS_CTRL) {
+   if (state->stage == MESA_SHADER_TESS_CTRL && !lhs->type->is_error()) {
       ir_variable *var = lhs->variable_referenced();
       if (var->data.mode == ir_var_shader_out && !var->data.patch) {
          ir_rvalue *index = find_innermost_array_index(lhs);

From 8765a9e0fe2987caa6af7473cbc4c55754621806 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 11 Mar 2016 16:15:02 +1100
Subject: [PATCH 147/238] glsl: generate named interface block names correctly

Firstly this updates the named interface lowering pass to store the
interface without the arrays removed.

Note we need to remove the arrays in the interface/varying matching
code to not regress things, but in future this should be fixed
further as it would seem we currently successfully match interface
blocks with different array sizes.

Since we now know if the interface was an array we can reduce the
IR flags from_named_ifc_block_array and from_named_ifc_block_nonarray
to just from_named_ifc_block.

Next, rather than having a different code path for named interface
blocks in program_resource_visitor, we just make use of the one used
by UBOs; this allows us to now handle arrays of arrays correctly.

Finally we add a new param, named_ifc_member, to the recursion
function; this is because we only want to process a single
member at a time. Note that this is also the glsl_struct_field
from the original ifc type before lowering rather than the type
from the lowered variable. This fixes a bug in Mesa where we would
generate the names like WithInstArray[0].g[0][0] when it should be
WithInstArray[0].g[0] for the following interface.

   out WithInstArray {
      float g[3];
   } instArray[2];

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/ir.h                        | 16 +---
 src/compiler/glsl/link_interface_blocks.cpp   |  6 +-
 src/compiler/glsl/link_uniforms.cpp           | 95 +++++--------------
 src/compiler/glsl/link_varyings.cpp           |  8 +-
 src/compiler/glsl/linker.h                    |  3 +-
 .../glsl/lower_named_interface_blocks.cpp     |  5 +-
 6 files changed, 38 insertions(+), 95 deletions(-)

diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h
index b74d68a605b..56ed13e7153 100644
--- a/src/compiler/glsl/ir.h
+++ b/src/compiler/glsl/ir.h
@@ -742,21 +742,9 @@ public:
 
       /**
        * Non-zero if this variable was created by lowering a named interface
-       * block which was not an array.
-       *
-       * Note that this variable and \c from_named_ifc_block_array will never
-       * both be non-zero.
+       * block.
        */
-      unsigned from_named_ifc_block_nonarray:1;
-
-      /**
-       * Non-zero if this variable was created by lowering a named interface
-       * block which was an array.
-       *
-       * Note that this variable and \c from_named_ifc_block_nonarray will never
-       * both be non-zero.
-       */
-      unsigned from_named_ifc_block_array:1;
+      unsigned from_named_ifc_block:1;
 
       /**
        * Non-zero if the variable must be a shader input. This is useful for
diff --git a/src/compiler/glsl/link_interface_blocks.cpp b/src/compiler/glsl/link_interface_blocks.cpp
index 4c6fb56f891..26072591b0e 100644
--- a/src/compiler/glsl/link_interface_blocks.cpp
+++ b/src/compiler/glsl/link_interface_blocks.cpp
@@ -242,7 +242,8 @@ public:
          return entry ? (ir_variable *) entry->data : NULL;
       } else {
          const struct hash_entry *entry =
-            _mesa_hash_table_search(ht, var->get_interface_type()->name);
+            _mesa_hash_table_search(ht,
+               var->get_interface_type()->without_array()->name);
          return entry ? (ir_variable *) entry->data : NULL;
       }
    }
@@ -263,7 +264,8 @@ public:
          snprintf(location_str, 11, "%d", var->data.location);
          _mesa_hash_table_insert(ht, ralloc_strdup(mem_ctx, location_str), var);
       } else {
-         _mesa_hash_table_insert(ht, var->get_interface_type()->name, var);
+         _mesa_hash_table_insert(ht,
+            var->get_interface_type()->without_array()->name, var);
       }
    }
 
diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp
index cd487ab6dd0..0a230cad034 100644
--- a/src/compiler/glsl/link_uniforms.cpp
+++ b/src/compiler/glsl/link_uniforms.cpp
@@ -68,7 +68,7 @@ program_resource_visitor::process(const glsl_type *type, const char *name)
    unsigned packing = type->interface_packing;
 
    recursion(type, &name_copy, strlen(name), false, NULL, packing, false,
-             record_array_count);
+             record_array_count, NULL);
    ralloc_free(name_copy);
 }
 
@@ -76,8 +76,6 @@ void
 program_resource_visitor::process(ir_variable *var)
 {
    unsigned record_array_count = 1;
-   const glsl_type *t = var->type;
-   const glsl_type *t_without_array = var->type->without_array();
    const bool row_major =
       var->data.matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR;
 
@@ -85,80 +83,28 @@ program_resource_visitor::process(ir_variable *var)
       var->get_interface_type()->interface_packing :
       var->type->interface_packing;
 
+   const glsl_type *t =
+      var->data.from_named_ifc_block ? var->get_interface_type() : var->type;
+   const glsl_type *t_without_array = t->without_array();
+
    /* false is always passed for the row_major parameter to the other
     * processing functions because no information is available to do
     * otherwise.  See the warning in linker.h.
     */
-
-   /* Only strdup the name if we actually will need to modify it. */
-   if (var->data.from_named_ifc_block_array) {
-      /* lower_named_interface_blocks created this variable by lowering an
-       * interface block array to an array variable.  For example if the
-       * original source code was:
-       *
-       *     out Blk { vec4 bar } foo[3];
-       *
-       * Then the variable is now:
-       *
-       *     out vec4 bar[3];
-       *
-       * We need to visit each array element using the names constructed like
-       * so:
-       *
-       *     Blk[0].bar
-       *     Blk[1].bar
-       *     Blk[2].bar
-       */
-      assert(t->is_array());
-      const glsl_type *ifc_type = var->get_interface_type();
-      char *name = ralloc_strdup(NULL, ifc_type->name);
-      size_t name_length = strlen(name);
-      for (unsigned i = 0; i < t->length; i++) {
-         size_t new_length = name_length;
-         ralloc_asprintf_rewrite_tail(&name, &new_length, "[%u].%s", i,
-                                      var->name);
-         /* Note: row_major is only meaningful for uniform blocks, and
-          * lowering is only applied to non-uniform interface blocks, so we
-          * can safely pass false for row_major.
-          */
-         recursion(var->type, &name, new_length, row_major, NULL, packing,
-                   false, record_array_count);
-      }
-      ralloc_free(name);
-   } else if (var->data.from_named_ifc_block_nonarray) {
-      /* lower_named_interface_blocks created this variable by lowering a
-       * named interface block (non-array) to an ordinary variable.  For
-       * example if the original source code was:
-       *
-       *     out Blk { vec4 bar } foo;
-       *
-       * Then the variable is now:
-       *
-       *     out vec4 bar;
-       *
-       * We need to visit this variable using the name:
-       *
-       *     Blk.bar
-       */
-      const glsl_type *ifc_type = var->get_interface_type();
-      char *name = ralloc_asprintf(NULL, "%s.%s", ifc_type->name, var->name);
-      /* Note: row_major is only meaningful for uniform blocks, and lowering
-       * is only applied to non-uniform interface blocks, so we can safely
-       * pass false for row_major.
-       */
-      recursion(var->type, &name, strlen(name), row_major, NULL, packing,
-                false, record_array_count);
-      ralloc_free(name);
-   } else if (t_without_array->is_record() ||
+   if (t_without_array->is_record() ||
               (t->is_array() && t->fields.array->is_array())) {
       char *name = ralloc_strdup(NULL, var->name);
       recursion(var->type, &name, strlen(name), row_major, NULL, packing,
-                false, record_array_count);
+                false, record_array_count, NULL);
       ralloc_free(name);
    } else if (t_without_array->is_interface()) {
       char *name = ralloc_strdup(NULL, t_without_array->name);
-      recursion(var->type, &name, strlen(name), row_major, NULL, packing,
-                false, record_array_count);
+      const glsl_struct_field *ifc_member = var->data.from_named_ifc_block ?
+         &t_without_array->
+            fields.structure[t_without_array->field_index(var->name)] : NULL;
+
+      recursion(t, &name, strlen(name), row_major, NULL, packing,
+                false, record_array_count, ifc_member);
       ralloc_free(name);
    } else {
       this->set_record_array_count(record_array_count);
@@ -172,7 +118,8 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
                                     const glsl_type *record_type,
                                     const unsigned packing,
                                     bool last_field,
-                                    unsigned record_array_count)
+                                    unsigned record_array_count,
+                                    const glsl_struct_field *named_ifc_member)
 {
    /* Records need to have each field processed individually.
     *
@@ -180,7 +127,12 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
     * individually, then each field of the resulting array elements processed
     * individually.
     */
-   if (t->is_record() || t->is_interface()) {
+   if (t->is_interface() && named_ifc_member) {
+      ralloc_asprintf_rewrite_tail(name, &name_length, ".%s",
+                                   named_ifc_member->name);
+      recursion(named_ifc_member->type, name, name_length, row_major, NULL,
+                packing, false, record_array_count, NULL);
+   } else if (t->is_record() || t->is_interface()) {
       if (record_type == NULL && t->is_record())
          record_type = t;
 
@@ -223,7 +175,7 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
                    field_row_major,
                    record_type,
                    packing,
-                   (i + 1) == t->length, record_array_count);
+                   (i + 1) == t->length, record_array_count, NULL);
 
          /* Only the first leaf-field of the record gets called with the
           * record type pointer.
@@ -258,7 +210,8 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
          recursion(t->fields.array, name, new_length, row_major,
                    record_type,
                    packing,
-                   (i + 1) == t->length, record_array_count);
+                   (i + 1) == t->length, record_array_count,
+                   named_ifc_member);
 
          /* Only the first leaf-field of the record gets called with the
           * record type pointer.
diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 44fc8f617f8..dadbf1e6859 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -1466,8 +1466,8 @@ populate_consumer_input_sets(void *mem_ctx, exec_list *ir,
          } else if (input_var->get_interface_type() != NULL) {
             char *const iface_field_name =
                ralloc_asprintf(mem_ctx, "%s.%s",
-                               input_var->get_interface_type()->name,
-                               input_var->name);
+                  input_var->get_interface_type()->without_array()->name,
+                  input_var->name);
             hash_table_insert(consumer_interface_inputs, input_var,
                               iface_field_name);
          } else {
@@ -1498,8 +1498,8 @@ get_matching_input(void *mem_ctx,
    } else if (output_var->get_interface_type() != NULL) {
       char *const iface_field_name =
          ralloc_asprintf(mem_ctx, "%s.%s",
-                         output_var->get_interface_type()->name,
-                         output_var->name);
+            output_var->get_interface_type()->without_array()->name,
+            output_var->name);
       input_var =
          (ir_variable *) hash_table_find(consumer_interface_inputs,
                                          iface_field_name);
diff --git a/src/compiler/glsl/linker.h b/src/compiler/glsl/linker.h
index 4311d1659ec..97144df8ff7 100644
--- a/src/compiler/glsl/linker.h
+++ b/src/compiler/glsl/linker.h
@@ -197,7 +197,8 @@ private:
    void recursion(const glsl_type *t, char **name, size_t name_length,
                   bool row_major, const glsl_type *record_type,
                   const unsigned packing,
-                  bool last_field, unsigned record_array_count);
+                  bool last_field, unsigned record_array_count,
+                  const glsl_struct_field *named_ifc_member);
 };
 
 void
diff --git a/src/compiler/glsl/lower_named_interface_blocks.cpp b/src/compiler/glsl/lower_named_interface_blocks.cpp
index f29eba4f75f..434cea90920 100644
--- a/src/compiler/glsl/lower_named_interface_blocks.cpp
+++ b/src/compiler/glsl/lower_named_interface_blocks.cpp
@@ -169,7 +169,6 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
                   new(mem_ctx) ir_variable(iface_t->fields.structure[i].type,
                                            var_name,
                                            (ir_variable_mode) var->data.mode);
-               new_var->data.from_named_ifc_block_nonarray = 1;
             } else {
                const glsl_type *new_array_type =
                   process_array_type(var->type, i);
@@ -177,7 +176,6 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
                   new(mem_ctx) ir_variable(new_array_type,
                                            var_name,
                                            (ir_variable_mode) var->data.mode);
-               new_var->data.from_named_ifc_block_array = 1;
             }
             new_var->data.location = iface_t->fields.structure[i].location;
             new_var->data.explicit_location = (new_var->data.location >= 0);
@@ -188,8 +186,9 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
             new_var->data.patch = iface_t->fields.structure[i].patch;
             new_var->data.stream = var->data.stream;
             new_var->data.how_declared = var->data.how_declared;
+            new_var->data.from_named_ifc_block = 1;
 
-            new_var->init_interface_type(iface_t);
+            new_var->init_interface_type(var->type);
             hash_table_insert(interface_namespace, new_var,
                               iface_field_name);
             insert_pos->insert_after(new_var);

From 52caeee7e78c2bed0329dcb7a5984826fa5960a6 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 13 Jan 2016 14:40:05 +1100
Subject: [PATCH 148/238] glsl: add transform feedback built-in constants

These are new built-ins added by ARB_enhanced_layouts.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/builtin_variables.cpp  | 7 +++++++
 src/compiler/glsl/glsl_parser_extras.cpp | 4 ++++
 src/compiler/glsl/glsl_parser_extras.h   | 4 ++++
 3 files changed, 15 insertions(+)

diff --git a/src/compiler/glsl/builtin_variables.cpp b/src/compiler/glsl/builtin_variables.cpp
index 4e2de37fbba..c39ae507b62 100644
--- a/src/compiler/glsl/builtin_variables.cpp
+++ b/src/compiler/glsl/builtin_variables.cpp
@@ -812,6 +812,13 @@ builtin_variable_generator::generate_constants()
        */
    }
 
+   if (state->has_enhanced_layouts()) {
+      add_const("gl_MaxTransformFeedbackBuffers",
+                state->Const.MaxTransformFeedbackBuffers);
+      add_const("gl_MaxTransformFeedbackInterleavedComponents",
+                state->Const.MaxTransformFeedbackInterleavedComponents);
+   }
+
    if (state->is_version(420, 310) ||
        state->ARB_shader_image_load_store_enable) {
       add_const("gl_MaxImageUnits",
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index ea9639b728d..a8fa75395e0 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -138,6 +138,10 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
    this->Const.MaxAtomicCounterBufferSize =
       ctx->Const.MaxAtomicBufferSize;
 
+   /* ARB_enhanced_layouts constants */
+   this->Const.MaxTransformFeedbackBuffers = ctx->Const.MaxTransformFeedbackBuffers;
+   this->Const.MaxTransformFeedbackInterleavedComponents = ctx->Const.MaxTransformFeedbackInterleavedComponents;
+
    /* Compute shader constants */
    for (unsigned i = 0; i < ARRAY_SIZE(this->Const.MaxComputeWorkGroupCount); i++)
       this->Const.MaxComputeWorkGroupCount[i] = ctx->Const.MaxComputeWorkGroupCount[i];
diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h
index 24195f97f18..86008b48519 100644
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -383,6 +383,10 @@ struct _mesa_glsl_parse_state {
       /* ARB_draw_buffers */
       unsigned MaxDrawBuffers;
 
+      /* ARB_enhanced_layouts */
+      unsigned MaxTransformFeedbackBuffers;
+      unsigned MaxTransformFeedbackInterleavedComponents;
+
       /* ARB_blend_func_extended */
       unsigned MaxDualSourceDrawBuffers;
 

From 13f6c788ebc4f9969d2d12c1a8ba64fdcf5dc12e Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Sat, 5 Mar 2016 12:16:22 +1100
Subject: [PATCH 149/238] glsl: move process_qualifier_constant() to
 ast_type.cpp

We will make use of this function being here in the following patch.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/ast.h          |  6 +++++
 src/compiler/glsl/ast_to_hir.cpp | 41 --------------------------------
 src/compiler/glsl/ast_type.cpp   | 41 ++++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+), 41 deletions(-)

diff --git a/src/compiler/glsl/ast.h b/src/compiler/glsl/ast.h
index 9f46340e6e2..b144e24de88 100644
--- a/src/compiler/glsl/ast.h
+++ b/src/compiler/glsl/ast.h
@@ -1205,4 +1205,10 @@ extern void _mesa_ast_process_interface_block(YYLTYPE *locp,
                                               ast_interface_block *const block,
                                               const struct ast_type_qualifier &q);
 
+extern bool
+process_qualifier_constant(struct _mesa_glsl_parse_state *state,
+                           YYLTYPE *loc,
+                           const char *qual_indentifier,
+                           ast_expression *const_expression,
+                           unsigned *value);
 #endif /* AST_H */
diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 0c686b27229..673ce8f716f 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -2584,47 +2584,6 @@ validate_matrix_layout_for_type(struct _mesa_glsl_parse_state *state,
    }
 }
 
-static bool
-process_qualifier_constant(struct _mesa_glsl_parse_state *state,
-                           YYLTYPE *loc,
-                           const char *qual_indentifier,
-                           ast_expression *const_expression,
-                           unsigned *value)
-{
-   exec_list dummy_instructions;
-
-   if (const_expression == NULL) {
-      *value = 0;
-      return true;
-   }
-
-   ir_rvalue *const ir = const_expression->hir(&dummy_instructions, state);
-
-   ir_constant *const const_int = ir->constant_expression_value();
-   if (const_int == NULL || !const_int->type->is_integer()) {
-      _mesa_glsl_error(loc, state, "%s must be an integral constant "
-                       "expression", qual_indentifier);
-      return false;
-   }
-
-   if (const_int->value.i[0] < 0) {
-      _mesa_glsl_error(loc, state, "%s layout qualifier is invalid (%d < 0)",
-                       qual_indentifier, const_int->value.u[0]);
-      return false;
-   }
-
-   /* If the location is const (and we've verified that
-    * it is) then no instructions should have been emitted
-    * when we converted it to HIR. If they were emitted,
-    * then either the location isn't const after all, or
-    * we are emitting unnecessary instructions.
-    */
-   assert(dummy_instructions.is_empty());
-
-   *value = const_int->value.u[0];
-   return true;
-}
-
 static bool
 validate_stream_qualifier(YYLTYPE *loc, struct _mesa_glsl_parse_state *state,
                           unsigned stream)
diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp
index 07ed4f2356c..ede6cc4500f 100644
--- a/src/compiler/glsl/ast_type.cpp
+++ b/src/compiler/glsl/ast_type.cpp
@@ -566,3 +566,44 @@ ast_layout_expression::process_qualifier_constant(struct _mesa_glsl_parse_state
 
    return true;
 }
+
+bool
+process_qualifier_constant(struct _mesa_glsl_parse_state *state,
+                           YYLTYPE *loc,
+                           const char *qual_indentifier,
+                           ast_expression *const_expression,
+                           unsigned *value)
+{
+   exec_list dummy_instructions;
+
+   if (const_expression == NULL) {
+      *value = 0;
+      return true;
+   }
+
+   ir_rvalue *const ir = const_expression->hir(&dummy_instructions, state);
+
+   ir_constant *const const_int = ir->constant_expression_value();
+   if (const_int == NULL || !const_int->type->is_integer()) {
+      _mesa_glsl_error(loc, state, "%s must be an integral constant "
+                       "expression", qual_indentifier);
+      return false;
+   }
+
+   if (const_int->value.i[0] < 0) {
+      _mesa_glsl_error(loc, state, "%s layout qualifier is invalid (%d < 0)",
+                       qual_indentifier, const_int->value.u[0]);
+      return false;
+   }
+
+   /* If the location is const (and we've verified that
+    * it is) then no instructions should have been emitted
+    * when we converted it to HIR. If they were emitted,
+    * then either the location isn't const after all, or
+    * we are emitting unnecessary instructions.
+    */
+   assert(dummy_instructions.is_empty());
+
+   *value = const_int->value.u[0];
+   return true;
+}

From c9afd94af6fa129370eb001077724a77093ecd5a Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 22 Jan 2016 12:45:10 +1100
Subject: [PATCH 150/238] glsl: parse new transform feedback layout qualifiers

We reuse the existing offset field for holding the xfb_offset
expression but create a new flag so as to avoid hitting the rules
for the offset qualifier for UBOs.

xfb_buffer qualifiers require extra processing when merging as
they can be applied to global out defaults. We just apply the
same rules as we do for the stream qualifier as the spec says:

   "The *xfb_buffer* qualifier follows the same conventions,
    behavior, defaults, and inheritance rules as the qualifier
    stream, and the examples for stream apply here as well."

For xfb_stride we push everything into a global out field for
later processing as xfb_stride applies to the entire buffer.
We still need to have a separate field to store per variable
strides because they can still affect implicit offsets
e.g. when applied to block members with implicit offsets.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/ast.h                  | 23 +++++++++++--
 src/compiler/glsl/ast_type.cpp           | 43 +++++++++++++++++++++++-
 src/compiler/glsl/glsl_parser.yy         | 25 ++++++++++++++
 src/compiler/glsl/glsl_parser_extras.cpp |  7 ++++
 4 files changed, 95 insertions(+), 3 deletions(-)

diff --git a/src/compiler/glsl/ast.h b/src/compiler/glsl/ast.h
index b144e24de88..7436edce88a 100644
--- a/src/compiler/glsl/ast.h
+++ b/src/compiler/glsl/ast.h
@@ -562,6 +562,15 @@ struct ast_type_qualifier {
          unsigned explicit_stream:1; /**< stream value assigned explicitly by shader code */
          /** \} */
 
+         /** \name Layout qualifiers for GL_ARB_enhanced_layouts */
+         /** \{ */
+         unsigned explicit_xfb_offset:1; /**< xfb_offset value assigned explicitly by shader code */
+         unsigned xfb_buffer:1; /**< Has xfb_buffer value assigned  */
+         unsigned explicit_xfb_buffer:1; /**< xfb_buffer value assigned explicitly by shader code */
+         unsigned xfb_stride:1; /**< Is xfb_stride value yet to be merged with global values  */
+         unsigned explicit_xfb_stride:1; /**< xfb_stride value assigned explicitly by shader code */
+         /** \} */
+
 	 /** \name Layout qualifiers for GL_ARB_tessellation_shader */
 	 /** \{ */
 	 /* tess eval input layout */
@@ -618,6 +627,15 @@ struct ast_type_qualifier {
    /** Stream in GLSL 1.50 geometry shaders. */
    ast_expression *stream;
 
+   /** xfb_buffer specified via the GL_ARB_enhanced_layouts keyword. */
+   ast_expression *xfb_buffer;
+
+   /** xfb_stride specified via the GL_ARB_enhanced_layouts keyword. */
+   ast_expression *xfb_stride;
+
+   /** global xfb_stride values for each buffer */
+   ast_layout_expression *out_xfb_stride[MAX_FEEDBACK_BUFFERS];
+
    /**
     * Input or output primitive type in GLSL 1.50 geometry shaders
     * and tessellation shaders.
@@ -633,8 +651,9 @@ struct ast_type_qualifier {
    ast_expression *binding;
 
    /**
-    * Offset specified via GL_ARB_shader_atomic_counter's "offset"
-    * keyword.
+    * Offset specified via GL_ARB_shader_atomic_counter's or
+    * GL_ARB_enhanced_layouts "offset" keyword, or by GL_ARB_enhanced_layouts
+    * "xfb_offset" keyword.
     *
     * \note
     * This field is only valid if \c explicit_offset is set.
diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp
index ede6cc4500f..ab86577cc3e 100644
--- a/src/compiler/glsl/ast_type.cpp
+++ b/src/compiler/glsl/ast_type.cpp
@@ -229,6 +229,43 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
       }
    }
 
+   if (state->has_enhanced_layouts()) {
+      if (!this->flags.q.explicit_xfb_buffer) {
+         if (q.flags.q.xfb_buffer) {
+            this->flags.q.xfb_buffer = 1;
+            this->xfb_buffer = q.xfb_buffer;
+         } else if (!this->flags.q.xfb_buffer && this->flags.q.out) {
+            /* Assign global xfb_buffer value */
+            this->flags.q.xfb_buffer = 1;
+            this->xfb_buffer = state->out_qualifier->xfb_buffer;
+         }
+      }
+
+      if (q.flags.q.explicit_xfb_stride)
+         this->xfb_stride = q.xfb_stride;
+
+      /* Merge all we xfb_stride qualifiers into the global out */
+      if (q.flags.q.explicit_xfb_stride || this->flags.q.xfb_stride) {
+
+         /* Set xfb_stride flag to 0 to avoid adding duplicates every time
+          * there is a merge.
+          */
+         this->flags.q.xfb_stride = 0;
+
+         unsigned buff_idx;
+         if (process_qualifier_constant(state, loc, "xfb_buffer",
+                                        this->xfb_buffer, &buff_idx)) {
+            if (state->out_qualifier->out_xfb_stride[buff_idx]) {
+               state->out_qualifier->out_xfb_stride[buff_idx]->merge_qualifier(
+                  new(state) ast_layout_expression(*loc, this->xfb_stride));
+            } else {
+               state->out_qualifier->out_xfb_stride[buff_idx] =
+                  new(state) ast_layout_expression(*loc, this->xfb_stride);
+            }
+         }
+      }
+   }
+
    if (q.flags.q.vertices) {
       if (this->vertices) {
          this->vertices->merge_qualifier(q.vertices);
@@ -300,7 +337,7 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
    if (q.flags.q.explicit_binding)
       this->binding = q.binding;
 
-   if (q.flags.q.explicit_offset)
+   if (q.flags.q.explicit_offset || q.flags.q.explicit_xfb_offset)
       this->offset = q.offset;
 
    if (q.precision != ast_precision_none)
@@ -349,6 +386,10 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
                        "tessellation control or geometry shaders");
    }
 
+   /* Allow future assigments of global out's */
+   this->flags.q.explicit_xfb_buffer = 0;
+   this->flags.q.explicit_xfb_stride = 0;
+
    return r;
 }
 
diff --git a/src/compiler/glsl/glsl_parser.yy b/src/compiler/glsl/glsl_parser.yy
index 5ed051a6705..1cecc09b8c8 100644
--- a/src/compiler/glsl/glsl_parser.yy
+++ b/src/compiler/glsl/glsl_parser.yy
@@ -1541,6 +1541,25 @@ layout_qualifier_id:
          }
       }
 
+      if (state->has_enhanced_layouts()) {
+         if (match_layout_qualifier("xfb_buffer", $1, state) == 0) {
+            $$.flags.q.xfb_buffer = 1;
+            $$.flags.q.explicit_xfb_buffer = 1;
+            $$.xfb_buffer = $3;
+         }
+
+         if (match_layout_qualifier("xfb_offset", $1, state) == 0) {
+            $$.flags.q.explicit_xfb_offset = 1;
+            $$.offset = $3;
+         }
+
+         if (match_layout_qualifier("xfb_stride", $1, state) == 0) {
+            $$.flags.q.xfb_stride = 1;
+            $$.flags.q.explicit_xfb_stride = 1;
+            $$.xfb_stride = $3;
+         }
+      }
+
       static const char * const local_size_qualifiers[3] = {
          "local_size_x",
          "local_size_y",
@@ -1915,6 +1934,12 @@ storage_qualifier:
           $$.flags.q.explicit_stream = 0;
           $$.stream = state->out_qualifier->stream;
       }
+
+      if (state->has_enhanced_layouts()) {
+          $$.flags.q.xfb_buffer = 1;
+          $$.flags.q.explicit_xfb_buffer = 0;
+          $$.xfb_buffer = state->out_qualifier->xfb_buffer;
+      }
    }
    | UNIFORM
    {
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index a8fa75395e0..76ae0f88167 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -939,6 +939,13 @@ _mesa_ast_process_interface_block(YYLTYPE *locp,
       block->layout.stream = state->out_qualifier->stream;
    }
 
+   if (state->has_enhanced_layouts() && block->layout.flags.q.out) {
+      /* Assign global layout's xfb_buffer value. */
+      block->layout.flags.q.xfb_buffer = 1;
+      block->layout.flags.q.explicit_xfb_buffer = 0;
+      block->layout.xfb_buffer = state->out_qualifier->xfb_buffer;
+   }
+
    foreach_list_typed (ast_declarator_list, member, link, &block->declarations) {
       ast_type_qualifier& qualifier = member->type->qualifier;
       if ((qualifier.flags.i & interface_type_mask) == 0) {

From 7b407fececeb0e0bcc0e54929db3ef6809655632 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Mon, 18 Jan 2016 20:46:29 +1100
Subject: [PATCH 151/238] glsl: relax stage restrictions on layout defaults for
 outputs

The new xfb_buffer and xfb_stride global qualifiers are allowed in
geom, tess and vertex stages.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/ast_type.cpp           | 5 +++--
 src/compiler/glsl/glsl_parser_extras.cpp | 9 ++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp
index ab86577cc3e..8a3b175b63b 100644
--- a/src/compiler/glsl/ast_type.cpp
+++ b/src/compiler/glsl/ast_type.cpp
@@ -381,9 +381,10 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
       if (create_node) {
          node = new(mem_ctx) ast_tcs_output_layout(*loc);
       }
-   } else {
+   } else if (!(state->stage == MESA_SHADER_TESS_EVAL ||
+                state->stage == MESA_SHADER_VERTEX)) {
       _mesa_glsl_error(loc, state, "out layout qualifiers only valid in "
-                       "tessellation control or geometry shaders");
+                       "geometry, tessellation and vertex shaders");
    }
 
    /* Allow future assigments of global out's */
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index 76ae0f88167..b88b6220513 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -1595,13 +1595,12 @@ set_shader_inout_layout(struct gl_shader *shader,
 		     struct _mesa_glsl_parse_state *state)
 {
    /* Should have been prevented by the parser. */
-   if (shader->Stage == MESA_SHADER_TESS_CTRL) {
+   if (shader->Stage == MESA_SHADER_TESS_CTRL ||
+       shader->Stage == MESA_SHADER_VERTEX) {
       assert(!state->in_qualifier->flags.i);
-   } else if (shader->Stage == MESA_SHADER_TESS_EVAL) {
-      assert(!state->out_qualifier->flags.i);
-   } else if (shader->Stage != MESA_SHADER_GEOMETRY) {
+   } else if (shader->Stage != MESA_SHADER_GEOMETRY &&
+              shader->Stage != MESA_SHADER_TESS_EVAL) {
       assert(!state->in_qualifier->flags.i);
-      assert(!state->out_qualifier->flags.i);
    }
 
    if (shader->Stage != MESA_SHADER_COMPUTE) {

From 5c2516fc3373f2e0ea234ef68791c8c0e4268e41 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Tue, 9 Feb 2016 22:24:30 +1100
Subject: [PATCH 152/238] glsl: add validation for out layout qualifiers

This adds validation for all qualifiers as allowed by the
table in Section 4.4 (Layout Qualifiers) of the GLSL 4.5 spec.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/ast_type.cpp | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp
index 8a3b175b63b..9f0f578be86 100644
--- a/src/compiler/glsl/ast_type.cpp
+++ b/src/compiler/glsl/ast_type.cpp
@@ -359,6 +359,8 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
 {
    void *mem_ctx = state;
    const bool r = this->merge_qualifier(loc, state, q, false);
+   ast_type_qualifier valid_out_mask;
+   valid_out_mask.flags.i = 0;
 
    if (state->stage == MESA_SHADER_GEOMETRY) {
       if (q.flags.q.prim_type) {
@@ -377,20 +379,47 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
 
       /* Allow future assigments of global out's stream id value */
       this->flags.q.explicit_stream = 0;
+
+      valid_out_mask.flags.q.stream = 1;
+      valid_out_mask.flags.q.explicit_stream = 1;
+      valid_out_mask.flags.q.explicit_xfb_buffer = 1;
+      valid_out_mask.flags.q.xfb_buffer = 1;
+      valid_out_mask.flags.q.explicit_xfb_stride = 1;
+      valid_out_mask.flags.q.xfb_stride = 1;
+      valid_out_mask.flags.q.max_vertices = 1;
+      valid_out_mask.flags.q.prim_type = 1;
    } else if (state->stage == MESA_SHADER_TESS_CTRL) {
       if (create_node) {
          node = new(mem_ctx) ast_tcs_output_layout(*loc);
       }
-   } else if (!(state->stage == MESA_SHADER_TESS_EVAL ||
-                state->stage == MESA_SHADER_VERTEX)) {
+      valid_out_mask.flags.q.vertices = 1;
+      valid_out_mask.flags.q.explicit_xfb_buffer = 1;
+      valid_out_mask.flags.q.xfb_buffer = 1;
+      valid_out_mask.flags.q.explicit_xfb_stride = 1;
+      valid_out_mask.flags.q.xfb_stride = 1;
+   } else if (state->stage == MESA_SHADER_TESS_EVAL ||
+              state->stage == MESA_SHADER_VERTEX) {
+      valid_out_mask.flags.q.explicit_xfb_buffer = 1;
+      valid_out_mask.flags.q.xfb_buffer = 1;
+      valid_out_mask.flags.q.explicit_xfb_stride = 1;
+      valid_out_mask.flags.q.xfb_stride = 1;
+   } else {
       _mesa_glsl_error(loc, state, "out layout qualifiers only valid in "
                        "geometry, tessellation and vertex shaders");
+      return false;
    }
 
    /* Allow future assigments of global out's */
    this->flags.q.explicit_xfb_buffer = 0;
    this->flags.q.explicit_xfb_stride = 0;
 
+   /* Generate an error when invalid input layout qualifiers are used. */
+   if ((q.flags.i & ~valid_out_mask.flags.i) != 0) {
+      _mesa_glsl_error(loc, state,
+		       "invalid output layout qualifiers used");
+      return false;
+   }
+
    return r;
 }
 

From 2dbcecb7a9f483e70875d60d9f18811088122861 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Thu, 21 Jan 2016 16:22:12 +1100
Subject: [PATCH 153/238] glsl: add IR fields for transform feedback layout
 qualifiers

Adds xfb_buffer/stride fields and adds comment to offset field
which is reused for xfb_offset.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/ir.h | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h
index 56ed13e7153..b1a1d5656d1 100644
--- a/src/compiler/glsl/ir.h
+++ b/src/compiler/glsl/ir.h
@@ -726,6 +726,21 @@ public:
        */
       unsigned is_xfb_only:1;
 
+      /**
+       * Was a transfor feedback buffer set in the shader?
+       */
+      unsigned explicit_xfb_buffer:1;
+
+      /**
+       * Was a transfor feedback offset set in the shader?
+       */
+      unsigned explicit_xfb_offset:1;
+
+      /**
+       * Was a transfor feedback stride set in the shader?
+       */
+      unsigned explicit_xfb_stride:1;
+
       /**
        * If non-zero, then this variable may be packed along with other variables
        * into a single varying slot, so this offset should be applied when
@@ -861,7 +876,7 @@ public:
       unsigned stream;
 
       /**
-       * Atomic or block member offset.
+       * Atomic, transform feedback or block member offset.
        */
       unsigned offset;
 
@@ -872,6 +887,16 @@ public:
        */
       unsigned max_array_access;
 
+      /**
+       * Transform feedback buffer.
+       */
+      unsigned xfb_buffer;
+
+      /**
+       * Transform feedback stride.
+       */
+      unsigned xfb_stride;
+
       /**
        * Allow (only) ir_variable direct access private members.
        */

From 733f1b2a55aa396dd01ec516f93339d95ef32a42 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 24 Feb 2016 16:06:00 +1100
Subject: [PATCH 154/238] glsl: add xfb_* qualifiers to glsl_struct_field

These will be used to hold qualifier values for interface and
struct members.

Support is added to the struct/interface constructors to copy these
fields upon creation.

We also update record_compare() to ensure we don't reuse a glsl_type
with the wrong xfb_* qualifier values.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/builtin_variables.cpp |  2 ++
 src/compiler/glsl_types.cpp             | 10 ++++++++++
 src/compiler/glsl_types.h               | 16 ++++++++++++++--
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/src/compiler/glsl/builtin_variables.cpp b/src/compiler/glsl/builtin_variables.cpp
index c39ae507b62..24e0b1a3667 100644
--- a/src/compiler/glsl/builtin_variables.cpp
+++ b/src/compiler/glsl/builtin_variables.cpp
@@ -334,6 +334,8 @@ per_vertex_accumulator::add_field(int slot, const glsl_type *type,
    this->fields[this->num_fields].image_coherent = 0;
    this->fields[this->num_fields].image_volatile = 0;
    this->fields[this->num_fields].image_restrict = 0;
+   this->fields[this->num_fields].xfb_buffer = -1;
+   this->fields[this->num_fields].xfb_stride = -1;
    this->num_fields++;
 }
 
diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp
index 2421bd61954..3b77b5e690f 100644
--- a/src/compiler/glsl_types.cpp
+++ b/src/compiler/glsl_types.cpp
@@ -132,6 +132,8 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
       this->fields.structure[i].image_volatile = fields[i].image_volatile;
       this->fields.structure[i].image_restrict = fields[i].image_restrict;
       this->fields.structure[i].precision = fields[i].precision;
+      this->fields.structure[i].xfb_buffer = fields[i].xfb_buffer;
+      this->fields.structure[i].xfb_stride = fields[i].xfb_stride;
    }
 
    mtx_unlock(&glsl_type::mutex);
@@ -172,6 +174,8 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
       this->fields.structure[i].image_volatile = fields[i].image_volatile;
       this->fields.structure[i].image_restrict = fields[i].image_restrict;
       this->fields.structure[i].precision = fields[i].precision;
+      this->fields.structure[i].xfb_buffer = fields[i].xfb_buffer;
+      this->fields.structure[i].xfb_stride = fields[i].xfb_stride;
    }
 
    mtx_unlock(&glsl_type::mutex);
@@ -915,6 +919,12 @@ glsl_type::record_compare(const glsl_type *b) const
       if (this->fields.structure[i].precision
           != b->fields.structure[i].precision)
          return false;
+      if (this->fields.structure[i].xfb_buffer
+          != b->fields.structure[i].xfb_buffer)
+         return false;
+      if (this->fields.structure[i].xfb_stride
+          != b->fields.structure[i].xfb_stride)
+         return false;
    }
 
    return true;
diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h
index b0e6f3f730f..9b5f8b1290b 100644
--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@@ -839,12 +839,24 @@ struct glsl_struct_field {
 
    /**
     * For interface blocks, members may have an explicit byte offset
-    * specified; -1 otherwise.
+    * specified; -1 otherwise. Also used for xfb_offset layout qualifier.
     *
-    * Ignored for structs.
+    * Unless used for xfb_offset this field is ignored for structs.
     */
    int offset;
 
+   /**
+    * For interface blocks, members may define a transform feedback buffer;
+    * -1 otherwise.
+    */
+   int xfb_buffer;
+
+   /**
+    * For interface blocks, members may define a transform feedback stride;
+    * -1 otherwise.
+    */
+   int xfb_stride;
+
    /**
     * For interface blocks, the interpolation mode (as in
     * ir_variable::interpolation).  0 otherwise.

From 04d2f770c868537c2aa7329e923d526e7014d0b3 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 24 Feb 2016 15:18:09 +1100
Subject: [PATCH 155/238] glsl: add field to track if xfb_buffer is an explicit
 or implicit value

Since any of the xfb_* qualifiers trigger the shader to be in
transform feedback mode we need an extra field to track if
the xfb_buffer on interface members was set explicitly since
xfb_buffer will always have a default value.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/ast_to_hir.cpp        | 2 ++
 src/compiler/glsl/builtin_variables.cpp | 1 +
 src/compiler/glsl_types.cpp             | 7 +++++++
 src/compiler/glsl_types.h               | 7 +++++++
 4 files changed, 17 insertions(+)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 673ce8f716f..15001b768cb 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -6955,6 +6955,8 @@ ast_interface_block::hir(exec_list *instructions,
                earlier_per_vertex->fields.structure[j].patch;
             fields[i].precision =
                earlier_per_vertex->fields.structure[j].precision;
+            fields[i].explicit_xfb_buffer =
+               earlier_per_vertex->fields.structure[j].explicit_xfb_buffer;
          }
       }
 
diff --git a/src/compiler/glsl/builtin_variables.cpp b/src/compiler/glsl/builtin_variables.cpp
index 24e0b1a3667..76a22cee29c 100644
--- a/src/compiler/glsl/builtin_variables.cpp
+++ b/src/compiler/glsl/builtin_variables.cpp
@@ -334,6 +334,7 @@ per_vertex_accumulator::add_field(int slot, const glsl_type *type,
    this->fields[this->num_fields].image_coherent = 0;
    this->fields[this->num_fields].image_volatile = 0;
    this->fields[this->num_fields].image_restrict = 0;
+   this->fields[this->num_fields].explicit_xfb_buffer = 0;
    this->fields[this->num_fields].xfb_buffer = -1;
    this->fields[this->num_fields].xfb_stride = -1;
    this->num_fields++;
diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp
index 3b77b5e690f..c6a742e3aaf 100644
--- a/src/compiler/glsl_types.cpp
+++ b/src/compiler/glsl_types.cpp
@@ -132,6 +132,8 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
       this->fields.structure[i].image_volatile = fields[i].image_volatile;
       this->fields.structure[i].image_restrict = fields[i].image_restrict;
       this->fields.structure[i].precision = fields[i].precision;
+      this->fields.structure[i].explicit_xfb_buffer =
+         fields[i].explicit_xfb_buffer;
       this->fields.structure[i].xfb_buffer = fields[i].xfb_buffer;
       this->fields.structure[i].xfb_stride = fields[i].xfb_stride;
    }
@@ -174,6 +176,8 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
       this->fields.structure[i].image_volatile = fields[i].image_volatile;
       this->fields.structure[i].image_restrict = fields[i].image_restrict;
       this->fields.structure[i].precision = fields[i].precision;
+      this->fields.structure[i].explicit_xfb_buffer =
+         fields[i].explicit_xfb_buffer;
       this->fields.structure[i].xfb_buffer = fields[i].xfb_buffer;
       this->fields.structure[i].xfb_stride = fields[i].xfb_stride;
    }
@@ -919,6 +923,9 @@ glsl_type::record_compare(const glsl_type *b) const
       if (this->fields.structure[i].precision
           != b->fields.structure[i].precision)
          return false;
+      if (this->fields.structure[i].explicit_xfb_buffer
+          != b->fields.structure[i].explicit_xfb_buffer)
+         return false;
       if (this->fields.structure[i].xfb_buffer
           != b->fields.structure[i].xfb_buffer)
          return false;
diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h
index 9b5f8b1290b..4f4cfea1201 100644
--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@@ -901,6 +901,13 @@ struct glsl_struct_field {
    unsigned image_volatile:1;
    unsigned image_restrict:1;
 
+   /**
+    * Any of the xfb_* qualifiers trigger the shader to be in transform
+    * feedback mode so we need to keep track of whether the buffer was
+    * explicitly set or if its just been assigned the default global value.
+    */
+   unsigned explicit_xfb_buffer:1;
+
 #ifdef __cplusplus
    glsl_struct_field(const struct glsl_type *_type, const char *_name)
       : type(_type), name(_name), location(-1), interpolation(0), centroid(0),

From f6a8c7ef2170516b405c6d5e719358bfc14c724e Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 11 Mar 2016 23:00:16 +1100
Subject: [PATCH 156/238] glsl: add xfb_buffer compile time rules

Also copies the qualifier values to GLSL IR.

From the ARB_enhanced_layouts spec:

    "The *xfb_buffer* qualifier can be applied to the qualifier out,
    to output variables, to output blocks, and to output block
    members.  Shaders in the transform  feedback capturing mode have
    an initial global default of

        layout(xfb_buffer = 0) out;

    This default can be changed by declaring a different buffer with
    xfb_buffer on the interface qualifier out.  This is the only way
    the global default can be changed.  When a variable or output block
    is declared without an  xfb_buffer qualifier, it inherits the global
    default buffer.  When a variable or output block is declared with an
    xfb_buffer qualifier, it has that declared buffer.  All members of a
    block inherit the block's buffer.  A  member is allowed to declare
    an xfb_buffer, but it must match the buffer inherited from its
    block, or a compile-time error results.

    The *xfb_buffer* qualifier follows the same conventions, behavior,
    defaults, and inheritance rules as the qualifier stream, and the
    examples for stream apply here as well.  This includes a block's
    inheritance of the current global default buffer, a block member's
    inheritance of  the block's buffer, and the requirement that any
    *xfb_buffer* declared on a block member must match the buffer
    inherited from the block.

    ...

    It is a compile-time error to specify an *xfb_buffer* that is
    greater than  the implementation-dependent constant
    gl_MaxTransformFeedbackBuffers."

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/ast_to_hir.cpp | 66 ++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 15001b768cb..7a17ba7c6f0 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -2584,6 +2584,22 @@ validate_matrix_layout_for_type(struct _mesa_glsl_parse_state *state,
    }
 }
 
+static bool
+validate_xfb_buffer_qualifier(YYLTYPE *loc,
+                              struct _mesa_glsl_parse_state *state,
+                              unsigned xfb_buffer) {
+   if (xfb_buffer >= state->Const.MaxTransformFeedbackBuffers) {
+      _mesa_glsl_error(loc, state,
+                       "invalid xfb_buffer specified %d is larger than "
+                       "MAX_TRANSFORM_FEEDBACK_BUFFERS - 1 (%d).",
+                       xfb_buffer,
+                       state->Const.MaxTransformFeedbackBuffers - 1);
+      return false;
+   }
+
+   return true;
+}
+
 static bool
 validate_stream_qualifier(YYLTYPE *loc, struct _mesa_glsl_parse_state *state,
                           unsigned stream)
@@ -3145,6 +3161,17 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
       }
    }
 
+   if (qual->flags.q.out && qual->flags.q.xfb_buffer) {
+      unsigned qual_xfb_buffer;
+      if (process_qualifier_constant(state, loc, "xfb_buffer",
+                                     qual->xfb_buffer, &qual_xfb_buffer) &&
+          validate_xfb_buffer_qualifier(loc, state, qual_xfb_buffer)) {
+         var->data.xfb_buffer = qual_xfb_buffer;
+         if (qual->flags.q.explicit_xfb_buffer)
+            var->data.explicit_xfb_buffer = true;
+      }
+   }
+
    if (var->type->contains_atomic()) {
       if (var->data.mode == ir_var_uniform) {
          if (var->data.explicit_binding) {
@@ -6257,6 +6284,7 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                                           ir_variable_mode var_mode,
                                           ast_type_qualifier *layout,
                                           unsigned block_stream,
+                                          unsigned block_xfb_buffer,
                                           unsigned expl_location,
                                           unsigned expl_align)
 {
@@ -6412,6 +6440,26 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
          }
       }
 
+      int xfb_buffer;
+      unsigned explicit_xfb_buffer = 0;
+      if (qual->flags.q.explicit_xfb_buffer) {
+         unsigned qual_xfb_buffer;
+         if (process_qualifier_constant(state, &loc, "xfb_buffer",
+                                        qual->xfb_buffer, &qual_xfb_buffer)) {
+            explicit_xfb_buffer = 1;
+            if (qual_xfb_buffer != block_xfb_buffer)
+               _mesa_glsl_error(&loc, state, "xfb_buffer layout qualifier on "
+                                "interface block member does not match "
+                                "the interface block (%u vs %u)",
+                                qual_xfb_buffer, block_xfb_buffer);
+         }
+         xfb_buffer = (int) qual_xfb_buffer;
+      } else {
+         if (layout)
+            explicit_xfb_buffer = layout->flags.q.xfb_buffer;
+         xfb_buffer = (int) block_xfb_buffer;
+      }
+
       if (qual->flags.q.uniform && qual->has_interpolation()) {
          _mesa_glsl_error(&loc, state,
                           "interpolation qualifiers cannot be used "
@@ -6457,6 +6505,8 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
          fields[i].sample = qual->flags.q.sample ? 1 : 0;
          fields[i].patch = qual->flags.q.patch ? 1 : 0;
          fields[i].precision = qual->precision;
+         fields[i].explicit_xfb_buffer = explicit_xfb_buffer;
+         fields[i].xfb_buffer = xfb_buffer;
 
          if (qual->flags.q.explicit_location) {
             unsigned qual_location;
@@ -6647,6 +6697,7 @@ ast_struct_specifier::hir(exec_list *instructions,
                                                 ir_var_auto,
                                                 layout,
                                                 0, /* for interface only */
+                                                0, /* for interface only */
                                                 expl_location,
                                                 0 /* for interface only */);
 
@@ -6806,6 +6857,13 @@ ast_interface_block::hir(exec_list *instructions,
       return NULL;
    }
 
+   unsigned qual_xfb_buffer;
+   if (!process_qualifier_constant(state, &loc, "xfb_buffer",
+                                   layout.xfb_buffer, &qual_xfb_buffer) ||
+       !validate_xfb_buffer_qualifier(&loc, state, qual_xfb_buffer)) {
+      return NULL;
+   }
+
    unsigned expl_location = 0;
    if (layout.flags.q.explicit_location) {
       if (!process_qualifier_constant(state, &loc, "location",
@@ -6841,6 +6899,7 @@ ast_interface_block::hir(exec_list *instructions,
                                                 var_mode,
                                                 &this->layout,
                                                 qual_stream,
+                                                qual_xfb_buffer,
                                                 expl_location,
                                                 expl_align);
 
@@ -6957,6 +7016,8 @@ ast_interface_block::hir(exec_list *instructions,
                earlier_per_vertex->fields.structure[j].precision;
             fields[i].explicit_xfb_buffer =
                earlier_per_vertex->fields.structure[j].explicit_xfb_buffer;
+            fields[i].xfb_buffer =
+               earlier_per_vertex->fields.structure[j].xfb_buffer;
          }
       }
 
@@ -7208,8 +7269,13 @@ ast_interface_block::hir(exec_list *instructions,
          var->data.patch = fields[i].patch;
          var->data.stream = qual_stream;
          var->data.location = fields[i].location;
+
          if (fields[i].location != -1)
             var->data.explicit_location = true;
+
+         var->data.explicit_xfb_buffer = fields[i].explicit_xfb_buffer;
+         var->data.xfb_buffer = fields[i].xfb_buffer;
+
          var->init_interface_type(block_type);
 
          if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform)

From edddad0eee15c1f97443fc262d731e06d9604d4e Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 24 Feb 2016 15:21:59 +1100
Subject: [PATCH 157/238] glsl: add xfb_offset compile time rules

We also copy the qualifier values to the IR in this step.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/ast_to_hir.cpp | 125 ++++++++++++++++++++++++++++++-
 1 file changed, 123 insertions(+), 2 deletions(-)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 7a17ba7c6f0..9ff23757533 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -2600,6 +2600,67 @@ validate_xfb_buffer_qualifier(YYLTYPE *loc,
    return true;
 }
 
+/* From the ARB_enhanced_layouts spec:
+ *
+ *    "Variables and block members qualified with *xfb_offset* can be
+ *    scalars, vectors, matrices, structures, and (sized) arrays of these.
+ *    The offset must be a multiple of the size of the first component of
+ *    the first qualified variable or block member, or a compile-time error
+ *    results.  Further, if applied to an aggregate containing a double,
+ *    the offset must also be a multiple of 8, and the space taken in the
+ *    buffer will be a multiple of 8.
+ */
+static bool
+validate_xfb_offset_qualifier(YYLTYPE *loc,
+                              struct _mesa_glsl_parse_state *state,
+                              int xfb_offset, const glsl_type *type,
+                              unsigned component_size) {
+  const glsl_type *t_without_array = type->without_array();
+
+   if (xfb_offset != -1 && type->is_unsized_array()) {
+      _mesa_glsl_error(loc, state,
+                       "xfb_offset can't be used with unsized arrays.");
+      return false;
+   }
+
+   /* Make sure nested structs don't contain unsized arrays, and validate
+    * any xfb_offsets on interface members.
+    */
+   if (t_without_array->is_record() || t_without_array->is_interface())
+      for (unsigned int i = 0; i < t_without_array->length; i++) {
+         const glsl_type *member_t = t_without_array->fields.structure[i].type;
+
+         /* When the interface block doesn't have an xfb_offset qualifier then
+          * we apply the component size rules at the member level.
+          */
+         if (xfb_offset == -1)
+            component_size = member_t->contains_double() ? 8 : 4;
+
+         int xfb_offset = t_without_array->fields.structure[i].offset;
+         validate_xfb_offset_qualifier(loc, state, xfb_offset, member_t,
+                                       component_size);
+      }
+
+  /* Nested structs or interface block without offset may not have had an
+   * offset applied yet so return.
+   */
+   if (xfb_offset == -1) {
+     return true;
+   }
+
+   if (xfb_offset % component_size) {
+      _mesa_glsl_error(loc, state,
+                       "invalid qualifier xfb_offset=%d must be a multiple "
+                       "of the first component size of the first qualified "
+                       "variable or block member. Or double if an aggregate "
+                       "that contains a double (%d).",
+                       xfb_offset, component_size);
+      return false;
+   }
+
+   return true;
+}
+
 static bool
 validate_stream_qualifier(YYLTYPE *loc, struct _mesa_glsl_parse_state *state,
                           unsigned stream)
@@ -3172,6 +3233,19 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
       }
    }
 
+   if (qual->flags.q.explicit_xfb_offset) {
+      unsigned qual_xfb_offset;
+      unsigned component_size = var->type->contains_double() ? 8 : 4;
+
+      if (process_qualifier_constant(state, loc, "xfb_offset",
+                                     qual->offset, &qual_xfb_offset) &&
+          validate_xfb_offset_qualifier(loc, state, (int) qual_xfb_offset,
+                                        var->type, component_size)) {
+         var->data.offset = qual_xfb_offset;
+         var->data.explicit_xfb_offset = true;
+      }
+   }
+
    if (var->type->contains_atomic()) {
       if (var->data.mode == ir_var_uniform) {
          if (var->data.explicit_binding) {
@@ -6285,6 +6359,7 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                                           ast_type_qualifier *layout,
                                           unsigned block_stream,
                                           unsigned block_xfb_buffer,
+                                          unsigned block_xfb_offset,
                                           unsigned expl_location,
                                           unsigned expl_align)
 {
@@ -6505,6 +6580,7 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
          fields[i].sample = qual->flags.q.sample ? 1 : 0;
          fields[i].patch = qual->flags.q.patch ? 1 : 0;
          fields[i].precision = qual->precision;
+         fields[i].offset = -1;
          fields[i].explicit_xfb_buffer = explicit_xfb_buffer;
          fields[i].xfb_buffer = xfb_buffer;
 
@@ -6569,8 +6645,6 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                                    "with std430 and std140 layouts");
                }
             }
-         } else {
-            fields[i].offset = -1;
          }
 
          if (qual->flags.q.explicit_align || expl_align != 0) {
@@ -6603,6 +6677,31 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                next_offset = glsl_align(next_offset + size, align);
          }
 
+         /* From the ARB_enhanced_layouts spec:
+          *
+          *    "The given offset applies to the first component of the first
+          *    member of the qualified entity.  Then, within the qualified
+          *    entity, subsequent components are each assigned, in order, to
+          *    the next available offset aligned to a multiple of that
+          *    component's size.  Aggregate types are flattened down to the
+          *    component level to get this sequence of components."
+          */
+         if (qual->flags.q.explicit_xfb_offset) {
+            unsigned xfb_offset;
+            if (process_qualifier_constant(state, &loc, "xfb_offset",
+                                           qual->offset, &xfb_offset)) {
+               fields[i].offset = xfb_offset;
+               block_xfb_offset = fields[i].offset +
+                  4 * field_type->component_slots();
+            }
+         } else {
+            if (layout && layout->flags.q.explicit_xfb_offset) {
+               unsigned align = field_type->is_double() ? 8 : 4;
+               fields[i].offset = glsl_align(block_xfb_offset, align);
+               block_xfb_offset += 4 * field_type->component_slots();
+            }
+         }
+
          /* Propogate row- / column-major information down the fields of the
           * structure or interface block.  Structures need this data because
           * the structure may contain a structure that contains ... a matrix
@@ -6698,6 +6797,7 @@ ast_struct_specifier::hir(exec_list *instructions,
                                                 layout,
                                                 0, /* for interface only */
                                                 0, /* for interface only */
+                                                0, /* for interface only */
                                                 expl_location,
                                                 0 /* for interface only */);
 
@@ -6864,6 +6964,14 @@ ast_interface_block::hir(exec_list *instructions,
       return NULL;
    }
 
+   unsigned qual_xfb_offset;
+   if (layout.flags.q.explicit_xfb_offset) {
+      if (!process_qualifier_constant(state, &loc, "xfb_offset",
+                                      layout.offset, &qual_xfb_offset)) {
+         return NULL;
+      }
+   }
+
    unsigned expl_location = 0;
    if (layout.flags.q.explicit_location) {
       if (!process_qualifier_constant(state, &loc, "location",
@@ -6900,6 +7008,7 @@ ast_interface_block::hir(exec_list *instructions,
                                                 &this->layout,
                                                 qual_stream,
                                                 qual_xfb_buffer,
+                                                qual_xfb_offset,
                                                 expl_location,
                                                 expl_align);
 
@@ -7018,6 +7127,8 @@ ast_interface_block::hir(exec_list *instructions,
                earlier_per_vertex->fields.structure[j].explicit_xfb_buffer;
             fields[i].xfb_buffer =
                earlier_per_vertex->fields.structure[j].xfb_buffer;
+            fields[i].xfb_stride =
+               earlier_per_vertex->fields.structure[j].xfb_stride;
          }
       }
 
@@ -7048,6 +7159,12 @@ ast_interface_block::hir(exec_list *instructions,
                                         packing,
                                         this->block_name);
 
+   unsigned component_size = block_type->contains_double() ? 8 : 4;
+   int xfb_offset =
+      layout.flags.q.explicit_xfb_offset ? (int) qual_xfb_offset : -1;
+   validate_xfb_offset_qualifier(&loc, state, xfb_offset, block_type,
+                                 component_size);
+
    if (!state->symbols->add_interface(block_type->name, block_type, var_mode)) {
       YYLTYPE loc = this->get_location();
       _mesa_glsl_error(&loc, state, "interface block `%s' with type `%s' "
@@ -7276,6 +7393,10 @@ ast_interface_block::hir(exec_list *instructions,
          var->data.explicit_xfb_buffer = fields[i].explicit_xfb_buffer;
          var->data.xfb_buffer = fields[i].xfb_buffer;
 
+         if (fields[i].offset != -1)
+            var->data.explicit_xfb_offset = true;
+         var->data.offset = fields[i].offset;
+
          var->init_interface_type(block_type);
 
          if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform)

From 04a72e6e57ea97db2023bec50a10f2106f5d5b24 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Sat, 13 Feb 2016 14:53:45 +1100
Subject: [PATCH 158/238] glsl: add xfb_stride compile time rules

From the ARB_enhanced_layouts spec:

   "The *xfb_stride* qualifier specifies how many bytes are consumed
   by each captured vertex.  It applies to the transform feedback
   buffer for that declaration, whether it is inherited or explicitly
   declared. It can be applied to variables, blocks, block members,
   or just the qualifier out.  If the buffer is capturing any
   double-typed outputs, the stride must be a multiple of 8, otherwise
   it must be a multiple of 4, or a compile-time or link-time error
   results.

   ...

   The resulting stride (implicit or explicit) must be less than or
   equal to the implementation-dependent constant
   gl_MaxTransformFeedbackInterleavedComponents."

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/ast_to_hir.cpp | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 9ff23757533..4fd2fd8ff05 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -3246,6 +3246,15 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
       }
    }
 
+   if (qual->flags.q.explicit_xfb_stride) {
+      unsigned qual_xfb_stride;
+      if (process_qualifier_constant(state, loc, "xfb_stride",
+                                     qual->xfb_stride, &qual_xfb_stride)) {
+         var->data.xfb_stride = qual_xfb_stride;
+         var->data.explicit_xfb_stride = true;
+      }
+   }
+
    if (var->type->contains_atomic()) {
       if (var->data.mode == ir_var_uniform) {
          if (var->data.explicit_binding) {
@@ -6535,6 +6544,15 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
          xfb_buffer = (int) block_xfb_buffer;
       }
 
+      int xfb_stride = -1;
+      if (qual->flags.q.explicit_xfb_stride) {
+         unsigned qual_xfb_stride;
+         if (process_qualifier_constant(state, &loc, "xfb_stride",
+                                        qual->xfb_stride, &qual_xfb_stride)) {
+            xfb_stride = (int) qual_xfb_stride;
+         }
+      }
+
       if (qual->flags.q.uniform && qual->has_interpolation()) {
          _mesa_glsl_error(&loc, state,
                           "interpolation qualifiers cannot be used "
@@ -6583,6 +6601,7 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
          fields[i].offset = -1;
          fields[i].explicit_xfb_buffer = explicit_xfb_buffer;
          fields[i].xfb_buffer = xfb_buffer;
+         fields[i].xfb_stride = xfb_stride;
 
          if (qual->flags.q.explicit_location) {
             unsigned qual_location;
@@ -6972,6 +6991,14 @@ ast_interface_block::hir(exec_list *instructions,
       }
    }
 
+   unsigned qual_xfb_stride;
+   if (layout.flags.q.explicit_xfb_stride) {
+      if (!process_qualifier_constant(state, &loc, "xfb_stride",
+                                      layout.xfb_stride, &qual_xfb_stride)) {
+         return NULL;
+      }
+   }
+
    unsigned expl_location = 0;
    if (layout.flags.q.explicit_location) {
       if (!process_qualifier_constant(state, &loc, "location",

From 598790e8564280b8f3f105c0ff6de9fff4d45e30 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Thu, 10 Mar 2016 11:51:48 +1100
Subject: [PATCH 159/238] glsl: apply xfb_stride to implicit offsets for ifc
 block members

When we have an interface block like:

layout (xfb_buffer = 0, xfb_offset = 0) out Block {
                             vec4 var1;
    layout (xfb_stride = 32) vec4 var2;
                             vec4 var3;
};

We take into account the stride of var2 when calculating the offset
for var3.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/ast_to_hir.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 4fd2fd8ff05..a0312319161 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -54,6 +54,7 @@
 #include "ast.h"
 #include "compiler/glsl_types.h"
 #include "program/hash_table.h"
+#include "main/macros.h"
 #include "main/shaderobj.h"
 #include "ir.h"
 #include "ir_builder.h"
@@ -6711,13 +6712,14 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                                            qual->offset, &xfb_offset)) {
                fields[i].offset = xfb_offset;
                block_xfb_offset = fields[i].offset +
-                  4 * field_type->component_slots();
+                  MAX2(xfb_stride, (int) (4 * field_type->component_slots()));
             }
          } else {
             if (layout && layout->flags.q.explicit_xfb_offset) {
                unsigned align = field_type->is_double() ? 8 : 4;
                fields[i].offset = glsl_align(block_xfb_offset, align);
-               block_xfb_offset += 4 * field_type->component_slots();
+               block_xfb_offset +=
+                  MAX2(xfb_stride, (int) (4 * field_type->component_slots()));
             }
          }
 

From 4a873ef049bce855e8b5f254d428956de8ce45f8 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 22 Jan 2016 16:22:40 +1100
Subject: [PATCH 160/238] glsl: add xfb qualifiers to has_layout helper

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/ast_type.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp
index 9f0f578be86..c3d38cbbf8a 100644
--- a/src/compiler/glsl/ast_type.cpp
+++ b/src/compiler/glsl/ast_type.cpp
@@ -79,7 +79,10 @@ ast_type_qualifier::has_layout() const
           || this->flags.q.explicit_index
           || this->flags.q.explicit_binding
           || this->flags.q.explicit_offset
-          || this->flags.q.explicit_stream;
+          || this->flags.q.explicit_stream
+          || this->flags.q.explicit_xfb_buffer
+          || this->flags.q.explicit_xfb_offset
+          || this->flags.q.explicit_xfb_stride;
 }
 
 bool

From ba7a7d4c39c06c6231e3f9a05f5e32378b76db6a Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Mon, 15 Feb 2016 13:27:55 +1100
Subject: [PATCH 161/238] glsl: add xfb qualifier lowering support for named
 blocks

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/lower_named_interface_blocks.cpp | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/compiler/glsl/lower_named_interface_blocks.cpp b/src/compiler/glsl/lower_named_interface_blocks.cpp
index 434cea90920..2c361997738 100644
--- a/src/compiler/glsl/lower_named_interface_blocks.cpp
+++ b/src/compiler/glsl/lower_named_interface_blocks.cpp
@@ -179,6 +179,13 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
             }
             new_var->data.location = iface_t->fields.structure[i].location;
             new_var->data.explicit_location = (new_var->data.location >= 0);
+            new_var->data.offset = iface_t->fields.structure[i].offset;
+            new_var->data.explicit_xfb_offset =
+               (iface_t->fields.structure[i].offset >= 0);
+            new_var->data.xfb_buffer =
+               iface_t->fields.structure[i].xfb_buffer;
+            new_var->data.explicit_xfb_buffer =
+               iface_t->fields.structure[i].explicit_xfb_buffer;
             new_var->data.interpolation =
                iface_t->fields.structure[i].interpolation;
             new_var->data.centroid = iface_t->fields.structure[i].centroid;

From 8b6f8fe5030a0bcc6cce6bf3aae48795802b6fb6 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Tue, 15 Mar 2016 17:52:06 +1100
Subject: [PATCH 162/238] glsl: add helper for counting varyings

This will be used to get a count of the number of varying name
strings we are required to generate for use with the query API.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl_types.cpp | 32 ++++++++++++++++++++++++++++++++
 src/compiler/glsl_types.h   |  6 ++++++
 2 files changed, 38 insertions(+)

diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp
index c6a742e3aaf..39585bff3b9 100644
--- a/src/compiler/glsl_types.cpp
+++ b/src/compiler/glsl_types.cpp
@@ -1350,6 +1350,38 @@ glsl_type::uniform_locations() const
    }
 }
 
+unsigned
+glsl_type::varying_count() const
+{
+   unsigned size = 0;
+
+   switch (this->base_type) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_BOOL:
+      return 1;
+
+   case GLSL_TYPE_STRUCT:
+   case GLSL_TYPE_INTERFACE:
+      for (unsigned i = 0; i < this->length; i++)
+         size += this->fields.structure[i].type->varying_count();
+      return size;
+   case GLSL_TYPE_ARRAY:
+      /* Don't count innermost array elements */
+      if (this->without_array()->is_record() ||
+          this->without_array()->is_interface() ||
+          this->fields.array->is_array())
+         return this->length * this->fields.array->varying_count();
+      else
+         return this->fields.array->varying_count();
+   default:
+      assert(!"unsupported varying type");
+      return 0;
+   }
+}
+
 bool
 glsl_type::can_implicitly_convert_to(const glsl_type *desired,
                                      _mesa_glsl_parse_state *state) const
diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h
index 4f4cfea1201..dd46479755a 100644
--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@@ -326,6 +326,12 @@ struct glsl_type {
     */
    unsigned uniform_locations() const;
 
+   /**
+    * Used to count the number of varyings contained in the type ignoring
+    * innermost array elements.
+    */
+   unsigned varying_count() const;
+
    /**
     * Calculate the number of attribute slots required to hold this type
     *

From 707fd3972f3c2e16c710cd7ce819d0c5439c28fd Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 24 Feb 2016 15:40:31 +1100
Subject: [PATCH 163/238] glsl: add helper to generate xfb varying names

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp | 43 +++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index dadbf1e6859..dae6fb216c7 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -63,6 +63,49 @@ get_varying_type(const ir_variable *var, gl_shader_stage stage)
    return type;
 }
 
+static void
+create_xfb_varying_names(void *mem_ctx, const glsl_type *t, char **name,
+                         size_t name_length, unsigned *count,
+                         const char *ifc_member_name,
+                         const glsl_type *ifc_member_t, char ***varying_names)
+{
+   if (t->is_interface()) {
+      size_t new_length = name_length;
+
+      assert(ifc_member_name && ifc_member_t);
+      ralloc_asprintf_rewrite_tail(name, &new_length, ".%s", ifc_member_name);
+
+      create_xfb_varying_names(mem_ctx, ifc_member_t, name, new_length, count,
+                               NULL, NULL, varying_names);
+   } else if (t->is_record()) {
+      for (unsigned i = 0; i < t->length; i++) {
+         const char *field = t->fields.structure[i].name;
+         size_t new_length = name_length;
+
+         ralloc_asprintf_rewrite_tail(name, &new_length, ".%s", field);
+
+         create_xfb_varying_names(mem_ctx, t->fields.structure[i].type, name,
+                                  new_length, count, NULL, NULL,
+                                  varying_names);
+      }
+   } else if (t->without_array()->is_record() ||
+              t->without_array()->is_interface() ||
+              (t->is_array() && t->fields.array->is_array())) {
+      for (unsigned i = 0; i < t->length; i++) {
+         size_t new_length = name_length;
+
+         /* Append the subscript to the current variable name */
+         ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", i);
+
+         create_xfb_varying_names(mem_ctx, t->fields.array, name, new_length,
+                                  count, ifc_member_name, ifc_member_t,
+                                  varying_names);
+      }
+   } else {
+      (*varying_names)[(*count)++] = ralloc_strdup(mem_ctx, *name);
+   }
+}
+
 /**
  * Validate the types and qualifiers of an output from one stage against the
  * matching input to another stage.

From 0822517936d473f4889b07606e131e1dc3199644 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 24 Feb 2016 16:27:03 +1100
Subject: [PATCH 164/238] glsl: add helper to process xfb qualifiers during
 linking

This function checks for any xfb_* qualifiers which will enable
transform feedback mode and cause any API-defined xfb varyings
to be ignored.

It also counts the number of varyings that have an xfb_offset
qualifier and finally it calls the create_xfb_varying_names()
helper to generate the names of varyings to be captured.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp | 66 +++++++++++++++++++++++++++++
 src/compiler/glsl/link_varyings.h   |  5 +++
 2 files changed, 71 insertions(+)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index dae6fb216c7..b000012e429 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -106,6 +106,72 @@ create_xfb_varying_names(void *mem_ctx, const glsl_type *t, char **name,
    }
 }
 
+bool
+process_xfb_layout_qualifiers(void *mem_ctx, const gl_shader *sh,
+                              unsigned *num_tfeedback_decls,
+                              char ***varying_names)
+{
+   bool has_xfb_qualifiers = false;
+
+   foreach_in_list(ir_instruction, node, sh->ir) {
+      ir_variable *var = node->as_variable();
+      if (!var || var->data.mode != ir_var_shader_out)
+         continue;
+
+      /* From the ARB_enhanced_layouts spec:
+       *
+       *    "Any shader making any static use (after preprocessing) of any of
+       *     these *xfb_* qualifiers will cause the shader to be in a
+       *     transform feedback capturing mode and hence responsible for
+       *     describing the transform feedback setup.  This mode will capture
+       *     any output selected by *xfb_offset*, directly or indirectly, to
+       *     a transform feedback buffer."
+       */
+      if (var->data.explicit_xfb_buffer || var->data.explicit_xfb_stride) {
+         has_xfb_qualifiers = true;
+      }
+
+      if (var->data.explicit_xfb_offset) {
+         *num_tfeedback_decls += var->type->varying_count();
+         has_xfb_qualifiers = true;
+      }
+   }
+
+   if (*num_tfeedback_decls == 0)
+      return has_xfb_qualifiers;
+
+   unsigned i = 0;
+   *varying_names = ralloc_array(mem_ctx, char *, *num_tfeedback_decls);
+   foreach_in_list(ir_instruction, node, sh->ir) {
+      ir_variable *var = node->as_variable();
+      if (!var || var->data.mode != ir_var_shader_out)
+         continue;
+
+      if (var->data.explicit_xfb_offset) {
+         char *name;
+         const glsl_type *type, *member_type;
+
+         if (var->data.from_named_ifc_block) {
+            type = var->get_interface_type();
+            /* Find the member type before it was altered by lowering */
+            member_type =
+               type->fields.structure[type->field_index(var->name)].type;
+            name = ralloc_strdup(NULL, type->without_array()->name);
+         } else {
+            type = var->type;
+            member_type = NULL;
+            name = ralloc_strdup(NULL, var->name);
+         }
+         create_xfb_varying_names(mem_ctx, type, &name, strlen(name), &i,
+                                  var->name, member_type, varying_names);
+         ralloc_free(name);
+      }
+   }
+
+   assert(i == *num_tfeedback_decls);
+   return has_xfb_qualifiers;
+}
+
 /**
  * Validate the types and qualifiers of an output from one stage against the
  * matching input to another stage.
diff --git a/src/compiler/glsl/link_varyings.h b/src/compiler/glsl/link_varyings.h
index b2812614ecc..8d504f6f0dc 100644
--- a/src/compiler/glsl/link_varyings.h
+++ b/src/compiler/glsl/link_varyings.h
@@ -268,6 +268,11 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
                       const void *mem_ctx, unsigned num_names,
                       char **varying_names, tfeedback_decl *decls);
 
+bool
+process_xfb_layout_qualifiers(void *mem_ctx, const gl_shader *sh,
+                              unsigned *num_tfeedback_decls,
+                              char ***varying_names);
+
 void
 remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
                                         gl_shader *sh,

From 4305a60173432635cde2f0f1dea8a715ed327bbc Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 24 Feb 2016 16:37:02 +1100
Subject: [PATCH 165/238] glsl: add xfb helpers and fields to the
 tfeedback_decl class

We also apply any array/struct offsets.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp | 12 ++++++++++--
 src/compiler/glsl/link_varyings.h   | 20 ++++++++++++++++++++
 2 files changed, 30 insertions(+), 2 deletions(-)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index b000012e429..57c3d217200 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -506,6 +506,8 @@ tfeedback_decl::init(struct gl_context *ctx, const void *mem_ctx,
    this->next_buffer_separator = false;
    this->matched_candidate = NULL;
    this->stream_id = 0;
+   this->buffer = 0;
+   this->offset = 0;
 
    if (ctx->Extensions.ARB_transform_feedback3) {
       /* Parse gl_NextBuffer. */
@@ -598,6 +600,8 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
       = this->matched_candidate->toplevel_var->data.location * 4
       + this->matched_candidate->toplevel_var->data.location_frac
       + this->matched_candidate->offset;
+   const unsigned dmul =
+      this->matched_candidate->type->without_array()->is_double() ? 2 : 1;
 
    if (this->matched_candidate->type->is_array()) {
       /* Array variable */
@@ -605,8 +609,6 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
          this->matched_candidate->type->fields.array->matrix_columns;
       const unsigned vector_elements =
          this->matched_candidate->type->fields.array->vector_elements;
-      const unsigned dmul =
-         this->matched_candidate->type->fields.array->is_double() ? 2 : 1;
       unsigned actual_array_size;
       switch (this->lowered_builtin_array_variable) {
       case clip_distance:
@@ -684,6 +686,12 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
     */
    this->stream_id = this->matched_candidate->toplevel_var->data.stream;
 
+   unsigned array_offset = this->array_subscript * 4 * dmul;
+   unsigned struct_offset = this->matched_candidate->offset * 4 * dmul;
+   this->buffer = this->matched_candidate->toplevel_var->data.xfb_buffer;
+   this->offset = this->matched_candidate->toplevel_var->data.offset +
+      array_offset + struct_offset;
+
    return true;
 }
 
diff --git a/src/compiler/glsl/link_varyings.h b/src/compiler/glsl/link_varyings.h
index 8d504f6f0dc..339e9cf2f18 100644
--- a/src/compiler/glsl/link_varyings.h
+++ b/src/compiler/glsl/link_varyings.h
@@ -122,6 +122,16 @@ public:
       return this->stream_id;
    }
 
+   unsigned get_buffer() const
+   {
+      return this->buffer;
+   }
+
+   unsigned get_offset() const
+   {
+      return this->offset;
+   }
+
    /**
     * The total number of varying components taken up by this variable.  Only
     * valid if assign_location() has been called.
@@ -201,6 +211,16 @@ private:
     */
    int location;
 
+   /**
+    * Used to store the buffer assigned by xfb_buffer.
+    */
+   unsigned buffer;
+
+   /**
+    * Used to store the offset assigned by xfb_offset.
+    */
+   unsigned offset;
+
    /**
     * If non-zero, then this variable may be packed along with other variables
     * into a single varying slot, so this offset should be applied when

From 0c66460fc65e1c45ec2268c978ea11b259441212 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Wed, 24 Feb 2016 16:40:32 +1100
Subject: [PATCH 166/238] glsl: basic linking support for xfb qualifiers

This adds the initial infrastructure for enabling transform feedback
mode via in-shader qualifiers and adds initial buffer support.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp | 22 +++++++++++++----
 src/compiler/glsl/link_varyings.h   |  5 ++--
 src/compiler/glsl/linker.cpp        | 38 ++++++++++++++++++++++++-----
 3 files changed, 52 insertions(+), 13 deletions(-)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 57c3d217200..33eb7d0e8bc 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -715,7 +715,8 @@ tfeedback_decl::get_num_outputs() const
 bool
 tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
                       struct gl_transform_feedback_info *info,
-                      unsigned buffer, const unsigned max_outputs) const
+                      unsigned buffer, const unsigned max_outputs,
+                      bool has_xfb_qualifiers) const
 {
    assert(!this->next_buffer_separator);
 
@@ -858,7 +859,7 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
 bool
 store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                      unsigned num_tfeedback_decls,
-                     tfeedback_decl *tfeedback_decls)
+                     tfeedback_decl *tfeedback_decls, bool has_xfb_qualifiers)
 {
    bool separate_attribs_mode =
       prog->TransformFeedback.BufferMode == GL_SEPARATE_ATTRIBS;
@@ -885,11 +886,12 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
 
    unsigned num_buffers = 0;
 
-   if (separate_attribs_mode) {
+   if (!has_xfb_qualifiers && separate_attribs_mode) {
       /* GL_SEPARATE_ATTRIBS */
       for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
          if (!tfeedback_decls[i].store(ctx, prog, &prog->LinkedTransformFeedback,
-                                       num_buffers, num_outputs))
+                                       num_buffers, num_outputs,
+                                       has_xfb_qualifiers))
             return false;
 
          num_buffers++;
@@ -898,6 +900,9 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
    else {
       /* GL_INVERLEAVED_ATTRIBS */
       int buffer_stream_id = -1;
+      unsigned buffer =
+         num_tfeedback_decls ? tfeedback_decls[0].get_buffer() : 0;
+
       for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
          if (tfeedback_decls[i].is_next_buffer_separator()) {
             num_buffers++;
@@ -920,9 +925,16 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
             return false;
          }
 
+         if (has_xfb_qualifiers) {
+            buffer = tfeedback_decls[i].get_buffer();
+         } else {
+            buffer = num_buffers;
+         }
+
          if (!tfeedback_decls[i].store(ctx, prog,
                                        &prog->LinkedTransformFeedback,
-                                       num_buffers, num_outputs))
+                                       num_buffers, num_outputs,
+                                       has_xfb_qualifiers))
             return false;
       }
       num_buffers++;
diff --git a/src/compiler/glsl/link_varyings.h b/src/compiler/glsl/link_varyings.h
index 339e9cf2f18..7165ecb9184 100644
--- a/src/compiler/glsl/link_varyings.h
+++ b/src/compiler/glsl/link_varyings.h
@@ -98,7 +98,7 @@ public:
    unsigned get_num_outputs() const;
    bool store(struct gl_context *ctx, struct gl_shader_program *prog,
               struct gl_transform_feedback_info *info, unsigned buffer,
-              const unsigned max_outputs) const;
+              const unsigned max_outputs, bool has_xfb_qualifiers) const;
    const tfeedback_candidate *find_candidate(gl_shader_program *prog,
                                              hash_table *tfeedback_candidates);
 
@@ -301,7 +301,8 @@ remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
 bool
 store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                      unsigned num_tfeedback_decls,
-                     tfeedback_decl *tfeedback_decls);
+                     tfeedback_decl *tfeedback_decls,
+                     bool has_xfb_qualifiers);
 
 bool
 assign_varying_locations(struct gl_context *ctx,
diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index cd35464eeeb..3ae958cacb7 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -4169,9 +4169,11 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       return;
    }
 
-   tfeedback_decl *tfeedback_decls = NULL;
-   unsigned num_tfeedback_decls = prog->TransformFeedback.NumVarying;
+   unsigned num_tfeedback_decls = 0;
    unsigned int num_explicit_uniform_locs = 0;
+   bool has_xfb_qualifiers = false;
+   char **varying_names = NULL;
+   tfeedback_decl *tfeedback_decls = NULL;
 
    void *mem_ctx = ralloc_context(NULL); // temporary linker context
 
@@ -4481,6 +4483,30 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       goto done;
    }
 
+   /* From the ARB_enhanced_layouts spec:
+    *
+    *    "If the shader used to record output variables for transform feedback
+    *    varyings uses the "xfb_buffer", "xfb_offset", or "xfb_stride" layout
+    *    qualifiers, the values specified by TransformFeedbackVaryings are
+    *    ignored, and the set of variables captured for transform feedback is
+    *    instead derived from the specified layout qualifiers."
+    */
+   for (int i = MESA_SHADER_FRAGMENT - 1; i >= 0; i--) {
+      /* Find last stage before fragment shader */
+      if (prog->_LinkedShaders[i]) {
+         has_xfb_qualifiers =
+            process_xfb_layout_qualifiers(mem_ctx, prog->_LinkedShaders[i],
+                                          &num_tfeedback_decls,
+                                          &varying_names);
+         break;
+      }
+   }
+
+   if (!has_xfb_qualifiers) {
+      num_tfeedback_decls = prog->TransformFeedback.NumVarying;
+      varying_names = prog->TransformFeedback.VaryingNames;
+   }
+
    if (num_tfeedback_decls != 0) {
       /* From GL_EXT_transform_feedback:
        *   A program will fail to link if:
@@ -4497,10 +4523,9 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       }
 
       tfeedback_decls = ralloc_array(mem_ctx, tfeedback_decl,
-                                     prog->TransformFeedback.NumVarying);
+                                     num_tfeedback_decls);
       if (!parse_tfeedback_decls(ctx, prog, mem_ctx, num_tfeedback_decls,
-                                 prog->TransformFeedback.VaryingNames,
-                                 tfeedback_decls))
+                                 varying_names, tfeedback_decls))
          goto done;
    }
 
@@ -4580,7 +4605,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       }
    }
 
-   if (!store_tfeedback_info(ctx, prog, num_tfeedback_decls, tfeedback_decls))
+   if (!store_tfeedback_info(ctx, prog, num_tfeedback_decls, tfeedback_decls,
+                             has_xfb_qualifiers))
       goto done;
 
    update_array_sizes(prog);

From 99cb5151ed2203842922027fe80512248abad914 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 11 Mar 2016 23:16:16 +1100
Subject: [PATCH 167/238] glsl: sort xfb varyings in offset/buffer order

The existing transform feedback code expects to receive the list
of varyings in increasing buffer order.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 33eb7d0e8bc..8aefb7e201d 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -849,6 +849,17 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
 }
 
 
+static int
+cmp_xfb_offset(const void * x_generic, const void * y_generic)
+{
+   tfeedback_decl *x = (tfeedback_decl *) x_generic;
+   tfeedback_decl *y = (tfeedback_decl *) y_generic;
+
+   if (x->get_buffer() != y->get_buffer())
+      return x->get_buffer() - y->get_buffer();
+   return x->get_offset() - y->get_offset();
+}
+
 /**
  * Store transform feedback location assignments into
  * prog->LinkedTransformFeedback based on the data stored in tfeedback_decls.
@@ -870,6 +881,14 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
    memset(&prog->LinkedTransformFeedback, 0,
           sizeof(prog->LinkedTransformFeedback));
 
+   /* The xfb_offset qualifier does not have to be used in increasing order
+    * however some drivers expect to receive the list of transform feedback
+    * declarations in order so sort it now for convenience.
+    */
+   if (has_xfb_qualifiers)
+      qsort(tfeedback_decls, num_tfeedback_decls, sizeof(*tfeedback_decls),
+            cmp_xfb_offset);
+
    prog->LinkedTransformFeedback.Varyings =
       rzalloc_array(prog,
                     struct gl_transform_feedback_varying_info,

From 258299d87ad932246ae8b1aa979b4a1a398db155 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Thu, 3 Mar 2016 13:20:01 +1100
Subject: [PATCH 168/238] glsl: use bitmask of active xfb buffer indices

This allows us to print the correct binding point when not all
buffers declared in the shader are bound.

For example if we use a single buffer:

layout(xfb_buffer=2, xfb_offset=0) out vec4 v;

We now print '2' when the buffer is not bound rather than '0'.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp  | 11 +++++++--
 src/mesa/drivers/dri/i965/gen6_sol.c |  2 +-
 src/mesa/main/mtypes.h               |  6 ++---
 src/mesa/main/transformfeedback.c    | 36 +++++++++++++++-------------
 src/mesa/main/transformfeedback.h    |  2 +-
 5 files changed, 33 insertions(+), 24 deletions(-)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 8aefb7e201d..cb0c8baf8aa 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -872,6 +872,11 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                      unsigned num_tfeedback_decls,
                      tfeedback_decl *tfeedback_decls, bool has_xfb_qualifiers)
 {
+   /* Make sure MaxTransformFeedbackBuffers is less than 32 so the bitmask for
+    * tracking the number of buffers doesn't overflow.
+    */
+   assert(ctx->Const.MaxTransformFeedbackBuffers < 32);
+
    bool separate_attribs_mode =
       prog->TransformFeedback.BufferMode == GL_SEPARATE_ATTRIBS;
 
@@ -904,6 +909,7 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                     num_outputs);
 
    unsigned num_buffers = 0;
+   unsigned buffers = 0;
 
    if (!has_xfb_qualifiers && separate_attribs_mode) {
       /* GL_SEPARATE_ATTRIBS */
@@ -913,6 +919,7 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                                        has_xfb_qualifiers))
             return false;
 
+         buffers |= 1 << num_buffers;
          num_buffers++;
       }
    }
@@ -949,6 +956,7 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
          } else {
             buffer = num_buffers;
          }
+         buffers |= 1 << num_buffers;
 
          if (!tfeedback_decls[i].store(ctx, prog,
                                        &prog->LinkedTransformFeedback,
@@ -956,12 +964,11 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                                        has_xfb_qualifiers))
             return false;
       }
-      num_buffers++;
    }
 
    assert(prog->LinkedTransformFeedback.NumOutputs == num_outputs);
 
-   prog->LinkedTransformFeedback.NumBuffers = num_buffers;
+   prog->LinkedTransformFeedback.ActiveBuffers = buffers;
    return true;
 }
 
diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
index 2f6eadffd2e..08d4e1b52ca 100644
--- a/src/mesa/drivers/dri/i965/gen6_sol.c
+++ b/src/mesa/drivers/dri/i965/gen6_sol.c
@@ -256,7 +256,7 @@ brw_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
     * overflowing any of the buffers currently being used for feedback.
     */
    unsigned max_index
-      = _mesa_compute_max_transform_feedback_vertices(xfb_obj,
+      = _mesa_compute_max_transform_feedback_vertices(ctx, xfb_obj,
                                                       linked_xfb_info);
 
    /* Initialize the SVBI 0 register to zero and set the maximum index. */
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index ae0c8a84dda..02e60626b4a 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1649,10 +1649,8 @@ struct gl_transform_feedback_info
 {
    unsigned NumOutputs;
 
-   /**
-    * Number of transform feedback buffers in use by this program.
-    */
-   unsigned NumBuffers;
+   /* Bitmask of active buffer indices. */
+   unsigned ActiveBuffers;
 
    struct gl_transform_feedback_output *Outputs;
 
diff --git a/src/mesa/main/transformfeedback.c b/src/mesa/main/transformfeedback.c
index f73a89f6c0f..39ba3dcd7c0 100644
--- a/src/mesa/main/transformfeedback.c
+++ b/src/mesa/main/transformfeedback.c
@@ -347,23 +347,25 @@ compute_transform_feedback_buffer_sizes(
  * enabled transform feedback buffers without overflowing any of them.
  */
 unsigned
-_mesa_compute_max_transform_feedback_vertices(
+_mesa_compute_max_transform_feedback_vertices(struct gl_context *ctx,
       const struct gl_transform_feedback_object *obj,
       const struct gl_transform_feedback_info *info)
 {
    unsigned max_index = 0xffffffff;
    unsigned i;
 
-   for (i = 0; i < info->NumBuffers; ++i) {
-      unsigned stride = info->BufferStride[i];
-      unsigned max_for_this_buffer;
+   for (i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) {
+      if ((info->ActiveBuffers >> i) & 1) {
+         unsigned stride = info->BufferStride[i];
+         unsigned max_for_this_buffer;
 
-      /* Skip any inactive buffers, which have a stride of 0. */
-      if (stride == 0)
-	 continue;
+         /* Skip any inactive buffers, which have a stride of 0. */
+         if (stride == 0)
+	    continue;
 
-      max_for_this_buffer = obj->Size[i] / (4 * stride);
-      max_index = MIN2(max_index, max_for_this_buffer);
+         max_for_this_buffer = obj->Size[i] / (4 * stride);
+         max_index = MIN2(max_index, max_for_this_buffer);
+      }
    }
 
    return max_index;
@@ -445,12 +447,14 @@ _mesa_BeginTransformFeedback(GLenum mode)
       return;
    }
 
-   for (i = 0; i < info->NumBuffers; ++i) {
-      if (obj->BufferNames[i] == 0) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glBeginTransformFeedback(binding point %d does not have "
-                     "a buffer object bound)", i);
-         return;
+   for (i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) {
+      if ((info->ActiveBuffers >> i) & 1) {
+         if (obj->BufferNames[i] == 0) {
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "glBeginTransformFeedback(binding point %d does not "
+                        "have a buffer object bound)", i);
+            return;
+         }
       }
    }
 
@@ -470,7 +474,7 @@ _mesa_BeginTransformFeedback(GLenum mode)
        * feedback.
        */
       unsigned max_vertices
-         = _mesa_compute_max_transform_feedback_vertices(obj, info);
+         = _mesa_compute_max_transform_feedback_vertices(ctx, obj, info);
       obj->GlesRemainingPrims = max_vertices / vertices_per_prim;
    }
 
diff --git a/src/mesa/main/transformfeedback.h b/src/mesa/main/transformfeedback.h
index eb274ad6540..c83f917a532 100644
--- a/src/mesa/main/transformfeedback.h
+++ b/src/mesa/main/transformfeedback.h
@@ -50,7 +50,7 @@ extern void
 _mesa_init_transform_feedback_functions(struct dd_function_table *driver);
 
 extern unsigned
-_mesa_compute_max_transform_feedback_vertices(
+_mesa_compute_max_transform_feedback_vertices( struct gl_context *ctx,
       const struct gl_transform_feedback_object *obj,
       const struct gl_transform_feedback_info *info);
 

From cf039a309a36ba537b45f3bfe7e5a154c87d51ad Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Thu, 10 Mar 2016 15:00:00 +1100
Subject: [PATCH 169/238] mesa: split transform feedback buffer into its own
 struct

This will be used in a following patch to implement interface
query support for TRANSFORM_FEEDBACK_BUFFER.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp        | 11 ++++----
 src/mesa/drivers/dri/i965/gen6_sol.c       |  4 +--
 src/mesa/drivers/dri/i965/gen7_sol_state.c |  2 +-
 src/mesa/drivers/dri/i965/gen8_sol_state.c |  8 +++---
 src/mesa/main/mtypes.h                     | 30 ++++++++++++++--------
 src/mesa/main/transformfeedback.c          |  2 +-
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp |  2 +-
 7 files changed, 34 insertions(+), 25 deletions(-)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index cb0c8baf8aa..5645f783f3f 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -722,7 +722,7 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
 
    /* Handle gl_SkipComponents. */
    if (this->skip_components) {
-      info->BufferStride[buffer] += this->skip_components;
+      info->Buffers[buffer].Stride += this->skip_components;
       return true;
    }
 
@@ -734,7 +734,7 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
     *       and the buffer mode is INTERLEAVED_ATTRIBS_EXT.
     */
    if (prog->TransformFeedback.BufferMode == GL_INTERLEAVED_ATTRIBS &&
-       info->BufferStride[buffer] + this->num_components() >
+       info->Buffers[buffer].Stride + this->num_components() >
        ctx->Const.MaxTransformFeedbackInterleavedComponents) {
       linker_error(prog, "The MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS "
                    "limit has been exceeded.");
@@ -752,10 +752,11 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
       info->Outputs[info->NumOutputs].NumComponents = output_size;
       info->Outputs[info->NumOutputs].StreamId = stream_id;
       info->Outputs[info->NumOutputs].OutputBuffer = buffer;
-      info->Outputs[info->NumOutputs].DstOffset = info->BufferStride[buffer];
+      info->Outputs[info->NumOutputs].DstOffset =
+         info->Buffers[buffer].Stride;
       ++info->NumOutputs;
-      info->BufferStride[buffer] += output_size;
-      info->BufferStream[buffer] = this->stream_id;
+      info->Buffers[buffer].Stride += output_size;
+      info->Buffers[buffer].Stream = this->stream_id;
       num_components -= output_size;
       location++;
       location_frac = 0;
diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
index 08d4e1b52ca..24bb4b41b1e 100644
--- a/src/mesa/drivers/dri/i965/gen6_sol.c
+++ b/src/mesa/drivers/dri/i965/gen6_sol.c
@@ -69,13 +69,13 @@ gen6_update_sol_surfaces(struct brw_context *brw)
                brw, xfb_obj->Buffers[buffer],
                &brw->gs.base.surf_offset[surf_index],
                linked_xfb_info->Outputs[i].NumComponents,
-               linked_xfb_info->BufferStride[buffer], buffer_offset);
+               linked_xfb_info->Buffers[buffer].Stride, buffer_offset);
          } else {
             brw_update_sol_surface(
                brw, xfb_obj->Buffers[buffer],
                &brw->ff_gs.surf_offset[surf_index],
                linked_xfb_info->Outputs[i].NumComponents,
-               linked_xfb_info->BufferStride[buffer], buffer_offset);
+               linked_xfb_info->Buffers[buffer].Stride, buffer_offset);
          }
       } else {
          if (!brw->geometry_program)
diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c
index 8cd2fc4b48a..c44572c3438 100644
--- a/src/mesa/drivers/dri/i965/gen7_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c
@@ -70,7 +70,7 @@ upload_3dstate_so_buffers(struct brw_context *brw)
 	 continue;
       }
 
-      stride = linked_xfb_info->BufferStride[i] * 4;
+      stride = linked_xfb_info->Buffers[i].Stride * 4;
 
       start = xfb_obj->Offset[i];
       assert(start % 4 == 0);
diff --git a/src/mesa/drivers/dri/i965/gen8_sol_state.c b/src/mesa/drivers/dri/i965/gen8_sol_state.c
index 58ead68e90c..f30818031f4 100644
--- a/src/mesa/drivers/dri/i965/gen8_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_sol_state.c
@@ -139,13 +139,13 @@ gen8_upload_3dstate_streamout(struct brw_context *brw, bool active,
 
       /* Set buffer pitches; 0 means unbound. */
       if (xfb_obj->Buffers[0])
-         dw3 |= linked_xfb_info->BufferStride[0] * 4;
+         dw3 |= linked_xfb_info->Buffers[0].Stride * 4;
       if (xfb_obj->Buffers[1])
-         dw3 |= (linked_xfb_info->BufferStride[1] * 4) << 16;
+         dw3 |= (linked_xfb_info->Buffers[1].Stride * 4) << 16;
       if (xfb_obj->Buffers[2])
-         dw4 |= linked_xfb_info->BufferStride[2] * 4;
+         dw4 |= linked_xfb_info->Buffers[2].Stride * 4;
       if (xfb_obj->Buffers[3])
-         dw4 |= (linked_xfb_info->BufferStride[3] * 4) << 16;
+         dw4 |= (linked_xfb_info->Buffers[3].Stride * 4) << 16;
    }
 
    BEGIN_BATCH(5);
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 02e60626b4a..90c3851e72a 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1644,6 +1644,24 @@ struct gl_transform_feedback_output
 };
 
 
+struct gl_transform_feedback_buffer
+{
+   unsigned Binding;
+
+   /**
+    * Total number of components stored in each buffer.  This may be used by
+    * hardware back-ends to determine the correct stride when interleaving
+    * multiple transform feedback outputs in the same buffer.
+    */
+   unsigned Stride;
+
+   /**
+    * Which transform feedback stream this buffer binding is associated with.
+    */
+   unsigned Stream;
+};
+
+
 /** Post-link transform feedback info. */
 struct gl_transform_feedback_info
 {
@@ -1661,17 +1679,7 @@ struct gl_transform_feedback_info
    struct gl_transform_feedback_varying_info *Varyings;
    GLint NumVarying;
 
-   /**
-    * Total number of components stored in each buffer.  This may be used by
-    * hardware back-ends to determine the correct stride when interleaving
-    * multiple transform feedback outputs in the same buffer.
-    */
-   unsigned BufferStride[MAX_FEEDBACK_BUFFERS];
-
-   /**
-    * Which transform feedback stream this buffer binding is associated with.
-    */
-   unsigned BufferStream[MAX_FEEDBACK_BUFFERS];
+   struct gl_transform_feedback_buffer Buffers[MAX_FEEDBACK_BUFFERS];
 };
 
 
diff --git a/src/mesa/main/transformfeedback.c b/src/mesa/main/transformfeedback.c
index 39ba3dcd7c0..c92f0ccd5a5 100644
--- a/src/mesa/main/transformfeedback.c
+++ b/src/mesa/main/transformfeedback.c
@@ -356,7 +356,7 @@ _mesa_compute_max_transform_feedback_vertices(struct gl_context *ctx,
 
    for (i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) {
       if ((info->ActiveBuffers >> i) & 1) {
-         unsigned stride = info->BufferStride[i];
+         unsigned stride = info->Buffers[i].Stride;
          unsigned max_for_this_buffer;
 
          /* Skip any inactive buffers, which have a stride of 0. */
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 06b4bb41a9b..5e18e8be029 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -6861,7 +6861,7 @@ st_translate_stream_output_info(glsl_to_tgsi_visitor *glsl_to_tgsi,
    }
 
    for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
-      so->stride[i] = info->BufferStride[i];
+      so->stride[i] = info->Buffers[i].Stride;
    }
    so->num_outputs = info->NumOutputs;
 }

From 8120e869b1cde7fd1a3679291782f2f50296cb45 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 11 Mar 2016 11:57:52 +1100
Subject: [PATCH 170/238] glsl: validate global out xfb_stride qualifiers and
 set stride on empty buffers

Here we use the built-in validation in
ast_layout_expression::process_qualifier_constant() to check for mismatching
global out strides on buffers in a single shader.

From the ARB_enhanced_layouts spec:

   "While *xfb_stride* can be declared multiple times for the same buffer,
   it is a compile-time or link-time error to have different values
   specified for the stride for the same buffer."

For intrastage validation a new helper link_xfb_stride_layout_qualifiers()
is created. We also take this opportunity to make sure the stride is at
least a multiple of 4; doubles will be validated at a later stage.

From the ARB_enhanced_layouts spec:

   "If the buffer is capturing any double-typed outputs, the stride must
   be a multiple of 8, otherwise it must be a multiple of 4, or a
   compile-time or link-time error results."

Finally we update store_tfeedback_info() to apply the strides to
LinkedTransformFeedback and update the buffers bitmask to mark any global
buffers with a stride as active. For example a shader with:

layout (xfb_buffer = 0, xfb_offset = 0)  out vec4 gs_fs;
layout (xfb_buffer = 1, xfb_stride = 64) out;

Is expected to have a buffer bound to both 0 and 1.

From the ARB_enhanced_layouts spec:

   "A binding point requires a bound buffer object if and only if its
   associated stride in the program object used for transform feedback
   primitive capture is non-zero."

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/glsl_parser_extras.cpp | 11 ++++
 src/compiler/glsl/link_varyings.cpp      | 11 ++++
 src/compiler/glsl/linker.cpp             | 65 ++++++++++++++++++++++++
 src/mesa/main/mtypes.h                   |  7 +++
 4 files changed, 94 insertions(+)

diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index b88b6220513..0ce89ceb3a8 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -1617,6 +1617,17 @@ set_shader_inout_layout(struct gl_shader *shader,
       assert(!state->fs_early_fragment_tests);
    }
 
+   for (unsigned i = 0; i < MAX_FEEDBACK_BUFFERS; i++) {
+      if (state->out_qualifier->out_xfb_stride[i]) {
+         unsigned xfb_stride;
+         if (state->out_qualifier->out_xfb_stride[i]->
+                process_qualifier_constant(state, "xfb_stride", &xfb_stride,
+                true)) {
+            shader->TransformFeedback.BufferStride[i] = xfb_stride;
+         }
+      }
+   }
+
    switch (shader->Stage) {
    case MESA_SHADER_TESS_CTRL:
       shader->TessCtrl.VerticesOut = 0;
diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 5645f783f3f..d91642dea5c 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -930,6 +930,17 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
       unsigned buffer =
          num_tfeedback_decls ? tfeedback_decls[0].get_buffer() : 0;
 
+      /* Apply any xfb_stride global qualifiers */
+      if (has_xfb_qualifiers) {
+         for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
+            if (prog->TransformFeedback.BufferStride[j]) {
+               buffers |= 1 << j;
+               prog->LinkedTransformFeedback.Buffers[j].Stride =
+                  prog->TransformFeedback.BufferStride[j] / 4;
+            }
+         }
+      }
+
       for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
          if (tfeedback_decls[i].is_next_buffer_separator()) {
             num_buffers++;
diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index 3ae958cacb7..4f191c5002d 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -1584,6 +1584,69 @@ private:
    hash_table *unnamed_interfaces;
 };
 
+/**
+ * Check for conflicting xfb_stride default qualifiers and store buffer stride
+ * for later use.
+ */
+static void
+link_xfb_stride_layout_qualifiers(struct gl_context *ctx,
+                                  struct gl_shader_program *prog,
+			          struct gl_shader *linked_shader,
+			          struct gl_shader **shader_list,
+			          unsigned num_shaders)
+{
+   for (unsigned i = 0; i < MAX_FEEDBACK_BUFFERS; i++) {
+      linked_shader->TransformFeedback.BufferStride[i] = 0;
+   }
+
+   for (unsigned i = 0; i < num_shaders; i++) {
+      struct gl_shader *shader = shader_list[i];
+
+      for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
+         if (shader->TransformFeedback.BufferStride[j]) {
+	    if (linked_shader->TransformFeedback.BufferStride[j] != 0 &&
+                shader->TransformFeedback.BufferStride[j] != 0 &&
+	        linked_shader->TransformFeedback.BufferStride[j] !=
+                   shader->TransformFeedback.BufferStride[j]) {
+	       linker_error(prog,
+                            "intrastage shaders defined with conflicting "
+                            "xfb_stride for buffer %d (%d and %d)\n", j,
+                            linked_shader->TransformFeedback.BufferStride[j],
+			    shader->TransformFeedback.BufferStride[j]);
+	       return;
+	    }
+
+            if (shader->TransformFeedback.BufferStride[j])
+	       linked_shader->TransformFeedback.BufferStride[j] =
+                  shader->TransformFeedback.BufferStride[j];
+         }
+      }
+   }
+
+   for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
+      if (linked_shader->TransformFeedback.BufferStride[j]) {
+         prog->TransformFeedback.BufferStride[j] =
+            linked_shader->TransformFeedback.BufferStride[j];
+
+         /* We will validate doubles at a later stage */
+         if (prog->TransformFeedback.BufferStride[j] % 4) {
+            linker_error(prog, "invalid qualifier xfb_stride=%d must be a "
+                         "multiple of 4 or if its applied to a type that is "
+                         "or contains a double a multiple of 8.",
+                         prog->TransformFeedback.BufferStride[j]);
+            return;
+         }
+
+         if (prog->TransformFeedback.BufferStride[j] / 4 >
+             ctx->Const.MaxTransformFeedbackInterleavedComponents) {
+            linker_error(prog,
+                         "The MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS "
+                         "limit has been exceeded.");
+                  return;
+         }
+      }
+   }
+}
 
 /**
  * Performs the cross-validation of tessellation control shader vertices and
@@ -2101,6 +2164,8 @@ link_intrastage_shaders(void *mem_ctx,
    link_tes_in_layout_qualifiers(prog, linked, shader_list, num_shaders);
    link_gs_inout_layout_qualifiers(prog, linked, shader_list, num_shaders);
    link_cs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
+   link_xfb_stride_layout_qualifiers(ctx, prog, linked, shader_list,
+                                     num_shaders);
 
    populate_symbol_table(linked);
 
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 90c3851e72a..1e15b69ab70 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2337,6 +2337,11 @@ struct gl_shader
    bool origin_upper_left;
    bool pixel_center_integer;
 
+   struct {
+      /** Global xfb_stride out qualifier if any */
+      GLuint BufferStride[MAX_FEEDBACK_BUFFERS];
+   } TransformFeedback;
+
    /**
     * Tessellation Control shader state from layout qualifiers.
     */
@@ -2674,6 +2679,8 @@ struct gl_shader_program
     */
    struct {
       GLenum BufferMode;
+      /** Global xfb_stride out qualifier if any */
+      GLuint BufferStride[MAX_FEEDBACK_BUFFERS];
       GLuint NumVarying;
       GLchar **VaryingNames;  /**< Array [NumVarying] of char * */
    } TransformFeedback;

From 2fab85aaea59cb2d31d34ea6de94180ca83fe2dd Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Thu, 3 Mar 2016 15:26:53 +1100
Subject: [PATCH 171/238] glsl: add xfb_stride link time validation

From the ARB_enhanced_layouts spec:

   "It is a compile-time or link-time error to have any *xfb_offset*
    that overflows *xfb_stride*, whether stated on declarations before
    or after the *xfb_stride*, or in different compilation units.

    ...

    When no *xfb_stride* is specified for a buffer, the stride of a
    buffer will be the smallest needed to hold the variable placed at
    the highest offset, including any required padding."

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp | 44 ++++++++++++++++++++++++-----
 src/compiler/glsl/link_varyings.h   |  3 +-
 2 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index d91642dea5c..89bc68e277e 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -716,7 +716,7 @@ bool
 tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
                       struct gl_transform_feedback_info *info,
                       unsigned buffer, const unsigned max_outputs,
-                      bool has_xfb_qualifiers) const
+                      bool *explicit_stride, bool has_xfb_qualifiers) const
 {
    assert(!this->next_buffer_separator);
 
@@ -726,6 +726,13 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
       return true;
    }
 
+   unsigned xfb_offset = 0;
+   if (has_xfb_qualifiers) {
+      xfb_offset = this->offset / 4;
+   } else {
+      xfb_offset = info->Buffers[buffer].Stride;
+   }
+
    /* From GL_EXT_transform_feedback:
     *   A program will fail to link if:
     *
@@ -752,17 +759,38 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
       info->Outputs[info->NumOutputs].NumComponents = output_size;
       info->Outputs[info->NumOutputs].StreamId = stream_id;
       info->Outputs[info->NumOutputs].OutputBuffer = buffer;
-      info->Outputs[info->NumOutputs].DstOffset =
-         info->Buffers[buffer].Stride;
+      info->Outputs[info->NumOutputs].DstOffset = xfb_offset;
       ++info->NumOutputs;
-      info->Buffers[buffer].Stride += output_size;
       info->Buffers[buffer].Stream = this->stream_id;
+      xfb_offset += output_size;
+
       num_components -= output_size;
       location++;
       location_frac = 0;
    }
 
-   info->Varyings[info->NumVarying].Name = ralloc_strdup(prog, this->orig_name);
+   if (explicit_stride && explicit_stride[buffer]) {
+      if (this->is_double() && info->Buffers[buffer].Stride % 2) {
+         linker_error(prog, "invalid qualifier xfb_stride=%d must be a "
+                      "multiple of 8 as its applied to a type that is or "
+                      "contains a double.",
+                      info->Buffers[buffer].Stride * 4);
+         return false;
+      }
+
+      if ((this->offset / 4) / info->Buffers[buffer].Stride !=
+          (xfb_offset - 1) / info->Buffers[buffer].Stride) {
+         linker_error(prog, "xfb_offset (%d) overflows xfb_stride (%d) for "
+                      "buffer (%d)", xfb_offset * 4,
+                      info->Buffers[buffer].Stride * 4, buffer);
+         return false;
+      }
+   } else {
+      info->Buffers[buffer].Stride = xfb_offset;
+   }
+
+   info->Varyings[info->NumVarying].Name = ralloc_strdup(prog,
+                                                         this->orig_name);
    info->Varyings[info->NumVarying].Type = this->type;
    info->Varyings[info->NumVarying].Size = this->size;
    info->NumVarying++;
@@ -916,7 +944,7 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
       /* GL_SEPARATE_ATTRIBS */
       for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
          if (!tfeedback_decls[i].store(ctx, prog, &prog->LinkedTransformFeedback,
-                                       num_buffers, num_outputs,
+                                       num_buffers, num_outputs, NULL,
                                        has_xfb_qualifiers))
             return false;
 
@@ -929,12 +957,14 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
       int buffer_stream_id = -1;
       unsigned buffer =
          num_tfeedback_decls ? tfeedback_decls[0].get_buffer() : 0;
+      bool explicit_stride[MAX_FEEDBACK_BUFFERS] = { false };
 
       /* Apply any xfb_stride global qualifiers */
       if (has_xfb_qualifiers) {
          for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
             if (prog->TransformFeedback.BufferStride[j]) {
                buffers |= 1 << j;
+               explicit_stride[j] = true;
                prog->LinkedTransformFeedback.Buffers[j].Stride =
                   prog->TransformFeedback.BufferStride[j] / 4;
             }
@@ -973,7 +1003,7 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
          if (!tfeedback_decls[i].store(ctx, prog,
                                        &prog->LinkedTransformFeedback,
                                        num_buffers, num_outputs,
-                                       has_xfb_qualifiers))
+                                       explicit_stride, has_xfb_qualifiers))
             return false;
       }
    }
diff --git a/src/compiler/glsl/link_varyings.h b/src/compiler/glsl/link_varyings.h
index 7165ecb9184..7919a8d5cd5 100644
--- a/src/compiler/glsl/link_varyings.h
+++ b/src/compiler/glsl/link_varyings.h
@@ -98,7 +98,8 @@ public:
    unsigned get_num_outputs() const;
    bool store(struct gl_context *ctx, struct gl_shader_program *prog,
               struct gl_transform_feedback_info *info, unsigned buffer,
-              const unsigned max_outputs, bool has_xfb_qualifiers) const;
+              const unsigned max_outputs, bool *explicit_stride,
+              bool has_xfb_qualifiers) const;
    const tfeedback_candidate *find_candidate(gl_shader_program *prog,
                                              hash_table *tfeedback_candidates);
 

From f2a3c87a00bb38aa63dfb3a5818b2d53ca46c663 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Fri, 11 Mar 2016 13:53:13 +1100
Subject: [PATCH 172/238] glsl: generate link error when implicit stride is too
 large

This moves the check until after we have done the stride
calculation and applies it to the xfb_* qualifiers.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp | 37 +++++++++++++++++------------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 89bc68e277e..50c18d06ee8 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -733,21 +733,6 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
       xfb_offset = info->Buffers[buffer].Stride;
    }
 
-   /* From GL_EXT_transform_feedback:
-    *   A program will fail to link if:
-    *
-    *     * the total number of components to capture is greater than
-    *       the constant MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS_EXT
-    *       and the buffer mode is INTERLEAVED_ATTRIBS_EXT.
-    */
-   if (prog->TransformFeedback.BufferMode == GL_INTERLEAVED_ATTRIBS &&
-       info->Buffers[buffer].Stride + this->num_components() >
-       ctx->Const.MaxTransformFeedbackInterleavedComponents) {
-      linker_error(prog, "The MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS "
-                   "limit has been exceeded.");
-      return false;
-   }
-
    unsigned location = this->location;
    unsigned location_frac = this->location_frac;
    unsigned num_components = this->num_components();
@@ -789,6 +774,28 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
       info->Buffers[buffer].Stride = xfb_offset;
    }
 
+   /* From GL_EXT_transform_feedback:
+    *   A program will fail to link if:
+    *
+    *     * the total number of components to capture is greater than
+    *       the constant MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS_EXT
+    *       and the buffer mode is INTERLEAVED_ATTRIBS_EXT.
+    *
+    * From GL_ARB_enhanced_layouts:
+    *
+    *   "The resulting stride (implicit or explicit) must be less than or
+    *   equal to the implementation-dependent constant
+    *   gl_MaxTransformFeedbackInterleavedComponents."
+    */
+   if ((prog->TransformFeedback.BufferMode == GL_INTERLEAVED_ATTRIBS ||
+        has_xfb_qualifiers) &&
+       info->Buffers[buffer].Stride >
+       ctx->Const.MaxTransformFeedbackInterleavedComponents) {
+      linker_error(prog, "The MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS "
+                   "limit has been exceeded.");
+      return false;
+   }
+
    info->Varyings[info->NumVarying].Name = ralloc_strdup(prog,
                                                          this->orig_name);
    info->Varyings[info->NumVarying].Type = this->type;

From a2fbc5ed44ec8ebf0e8936ff5b21425159511413 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Sun, 13 Mar 2016 16:36:25 +1100
Subject: [PATCH 173/238] glsl: reset current stream tracker

When we move to the next buffer we need to reset the stream
so that we don't generate an error message about streams not
matching.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 50c18d06ee8..c5c392d6140 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -979,6 +979,12 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
       }
 
       for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
+         if (has_xfb_qualifiers &&
+             buffer != tfeedback_decls[i].get_buffer()) {
+            /* we have moved to the next buffer so reset stream id */
+            buffer_stream_id = -1;
+         }
+
          if (tfeedback_decls[i].is_next_buffer_separator()) {
             num_buffers++;
             buffer_stream_id = -1;

From d5c09d40b909cca43936b7f1a3ea16d6568d6203 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Mon, 14 Mar 2016 10:17:48 +1100
Subject: [PATCH 174/238] glsl: when lowering named interface set assigned flag

This will be used when checking if xfb should attempt to capture
a varying.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/lower_named_interface_blocks.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/src/compiler/glsl/lower_named_interface_blocks.cpp b/src/compiler/glsl/lower_named_interface_blocks.cpp
index 2c361997738..f780ecacbd2 100644
--- a/src/compiler/glsl/lower_named_interface_blocks.cpp
+++ b/src/compiler/glsl/lower_named_interface_blocks.cpp
@@ -217,12 +217,23 @@ ir_visitor_status
 flatten_named_interface_blocks_declarations::visit_leave(ir_assignment *ir)
 {
    ir_dereference_record *lhs_rec = ir->lhs->as_dereference_record();
+
+   ir_variable *lhs_var =  ir->lhs->variable_referenced();
+   if (lhs_var && lhs_var->get_interface_type()) {
+      lhs_var->data.assigned = 1;
+   }
+
    if (lhs_rec) {
       ir_rvalue *lhs_rec_tmp = lhs_rec;
       handle_rvalue(&lhs_rec_tmp);
       if (lhs_rec_tmp != lhs_rec) {
          ir->set_lhs(lhs_rec_tmp);
       }
+
+      ir_variable *lhs_var =  lhs_rec_tmp->variable_referenced();
+      if (lhs_var) {
+         lhs_var->data.assigned = 1;
+      }
    }
    return rvalue_visit(ir);
 }

From c95e92b14d69c114b79d941c7e8902a0ea62c287 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Mon, 14 Mar 2016 10:32:17 +1100
Subject: [PATCH 175/238] glsl: handle varyings that are not written to but
 have an xfb_offset

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp | 34 ++++++++++++++++++++---------
 src/compiler/glsl/link_varyings.h   |  8 +++++++
 2 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index c5c392d6140..ce6ff0863f0 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -738,14 +738,26 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
    unsigned num_components = this->num_components();
    while (num_components > 0) {
       unsigned output_size = MIN2(num_components, 4 - location_frac);
-      assert(info->NumOutputs < max_outputs);
-      info->Outputs[info->NumOutputs].ComponentOffset = location_frac;
-      info->Outputs[info->NumOutputs].OutputRegister = location;
-      info->Outputs[info->NumOutputs].NumComponents = output_size;
-      info->Outputs[info->NumOutputs].StreamId = stream_id;
-      info->Outputs[info->NumOutputs].OutputBuffer = buffer;
-      info->Outputs[info->NumOutputs].DstOffset = xfb_offset;
-      ++info->NumOutputs;
+      assert((info->NumOutputs == 0 && max_outputs == 0) ||
+             info->NumOutputs < max_outputs);
+
+      /* From the ARB_enhanced_layouts spec:
+       *
+       *    "If such a block member or variable is not written during a shader
+       *    invocation, the buffer contents at the assigned offset will be
+       *    undefined.  Even if there are no static writes to a variable or
+       *    member that is assigned a transform feedback offset, the space is
+       *    still allocated in the buffer and still affects the stride."
+       */
+      if (this->is_varying_written()) {
+         info->Outputs[info->NumOutputs].ComponentOffset = location_frac;
+         info->Outputs[info->NumOutputs].OutputRegister = location;
+         info->Outputs[info->NumOutputs].NumComponents = output_size;
+         info->Outputs[info->NumOutputs].StreamId = stream_id;
+         info->Outputs[info->NumOutputs].OutputBuffer = buffer;
+         info->Outputs[info->NumOutputs].DstOffset = xfb_offset;
+         ++info->NumOutputs;
+      }
       info->Buffers[buffer].Stream = this->stream_id;
       xfb_offset += output_size;
 
@@ -936,8 +948,10 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                     num_tfeedback_decls);
 
    unsigned num_outputs = 0;
-   for (unsigned i = 0; i < num_tfeedback_decls; ++i)
-      num_outputs += tfeedback_decls[i].get_num_outputs();
+   for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
+      if (tfeedback_decls[i].is_varying_written())
+         num_outputs += tfeedback_decls[i].get_num_outputs();
+   }
 
    prog->LinkedTransformFeedback.Outputs =
       rzalloc_array(prog,
diff --git a/src/compiler/glsl/link_varyings.h b/src/compiler/glsl/link_varyings.h
index 7919a8d5cd5..9ea79f04fa8 100644
--- a/src/compiler/glsl/link_varyings.h
+++ b/src/compiler/glsl/link_varyings.h
@@ -108,6 +108,14 @@ public:
       return this->next_buffer_separator;
    }
 
+   bool is_varying_written() const
+   {
+      if (this->next_buffer_separator || this->skip_components)
+         return false;
+
+      return this->matched_candidate->toplevel_var->data.assigned;
+   }
+
    bool is_varying() const
    {
       return !this->next_buffer_separator && !this->skip_components;

From b77c9098782a36cb811891b2bcb572eb61e608ac Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Mon, 14 Mar 2016 11:16:55 +1100
Subject: [PATCH 176/238] glsl: always enable transform feedback mode when
 xfb_stride defined

This enables in shader defined transform feedback mode even if the
only place xfb_stride is defined is on the global out.

We don't worry about xfb_buffer since Issue 22 c) in the spec says:

   "If the shader has an "xfb_buffer" qualifier identifying a buffer,
    but doesn't declare "xfb_offset" on anything associated with it,
    what happens?

    ...

    variables not qualified with "xfb_offset" are not captured, which
    makes the associated "xfb_buffer" qualifier irrelevant."

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index ce6ff0863f0..d486b691f58 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -113,6 +113,16 @@ process_xfb_layout_qualifiers(void *mem_ctx, const gl_shader *sh,
 {
    bool has_xfb_qualifiers = false;
 
+   /* We still need to enable transform feedback mode even if xfb_stride is
+    * only applied to a global out. Also we don't bother to propagate
+    * xfb_stride to interface block members so this will catch that case also.
+    */
+   for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
+      if (sh->TransformFeedback.BufferStride[j]) {
+         has_xfb_qualifiers = true;
+      }
+   }
+
    foreach_in_list(ir_instruction, node, sh->ir) {
       ir_variable *var = node->as_variable();
       if (!var || var->data.mode != ir_var_shader_out)

From 047139e8a027d81141b6e0b1bc939942e873f3ce Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Thu, 10 Mar 2016 15:20:32 +1100
Subject: [PATCH 177/238] mesa: rename transform feedback varying macro XFB to
 XFV

A later patch will use XFB for buffers.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/main/shader_query.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 4967e4b1df1..247a26d4fda 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -60,7 +60,7 @@ DECL_RESOURCE_FUNC(VAR, gl_shader_variable);
 DECL_RESOURCE_FUNC(UBO, gl_uniform_block);
 DECL_RESOURCE_FUNC(UNI, gl_uniform_storage);
 DECL_RESOURCE_FUNC(ATC, gl_active_atomic_buffer);
-DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_varying_info);
+DECL_RESOURCE_FUNC(XFV, gl_transform_feedback_varying_info);
 DECL_RESOURCE_FUNC(SUB, gl_subroutine_function);
 
 void GLAPIENTRY
@@ -433,7 +433,7 @@ _mesa_program_resource_name(struct gl_program_resource *res)
    case GL_SHADER_STORAGE_BLOCK:
       return RESOURCE_UBO(res)->Name;
    case GL_TRANSFORM_FEEDBACK_VARYING:
-      return RESOURCE_XFB(res)->Name;
+      return RESOURCE_XFV(res)->Name;
    case GL_PROGRAM_INPUT:
       var = RESOURCE_VAR(res);
       /* Special case gl_VertexIDMESA -> gl_VertexID. */
@@ -473,8 +473,8 @@ _mesa_program_resource_array_size(struct gl_program_resource *res)
 {
    switch (res->Type) {
    case GL_TRANSFORM_FEEDBACK_VARYING:
-      return RESOURCE_XFB(res)->Size > 1 ?
-             RESOURCE_XFB(res)->Size : 0;
+      return RESOURCE_XFV(res)->Size > 1 ?
+             RESOURCE_XFV(res)->Size : 0;
    case GL_PROGRAM_INPUT:
    case GL_PROGRAM_OUTPUT:
       return RESOURCE_VAR(res)->type->length;
@@ -1157,7 +1157,7 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
          *val = RESOURCE_VAR(res)->type->gl_type;
          return 1;
       case GL_TRANSFORM_FEEDBACK_VARYING:
-         *val = RESOURCE_XFB(res)->Type;
+         *val = RESOURCE_XFV(res)->Type;
          return 1;
       default:
          goto invalid_operation;
@@ -1180,7 +1180,7 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
          *val = MAX2(_mesa_program_resource_array_size(res), 1);
          return 1;
       case GL_TRANSFORM_FEEDBACK_VARYING:
-         *val = MAX2(RESOURCE_XFB(res)->Size, 1);
+         *val = MAX2(RESOURCE_XFV(res)->Size, 1);
          return 1;
       default:
          goto invalid_operation;

From 51142e7705a5e0c28de9fc097fa7c8446ba0cffe Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Thu, 10 Mar 2016 11:40:37 +1100
Subject: [PATCH 178/238] mesa: add support to query GL_OFFSET for
 GL_TRANSFORM_FEEDBACK_VARYING

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp |  1 +
 src/mesa/main/mtypes.h              |  1 +
 src/mesa/main/shader_query.cpp      | 14 +++++++++++---
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index d486b691f58..b091eaf5512 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -742,6 +742,7 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
    } else {
       xfb_offset = info->Buffers[buffer].Stride;
    }
+   info->Varyings[info->NumVarying].Offset = xfb_offset * 4;
 
    unsigned location = this->location;
    unsigned location_frac = this->location_frac;
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 1e15b69ab70..320297cdfe9 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1619,6 +1619,7 @@ struct gl_transform_feedback_varying_info
    char *Name;
    GLenum Type;
    GLint Size;
+   GLint Offset;
 };
 
 
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 247a26d4fda..15cc49afbfc 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -1186,9 +1186,17 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
          goto invalid_operation;
       }
    case GL_OFFSET:
-      VALIDATE_TYPE_2(GL_UNIFORM, GL_BUFFER_VARIABLE);
-      *val = RESOURCE_UNI(res)->offset;
-      return 1;
+      switch (res->Type) {
+      case GL_UNIFORM:
+      case GL_BUFFER_VARIABLE:
+         *val = RESOURCE_UNI(res)->offset;
+         return 1;
+      case GL_TRANSFORM_FEEDBACK_VARYING:
+         *val = RESOURCE_XFV(res)->Offset;
+         return 1;
+      default:
+         goto invalid_operation;
+      }
    case GL_BLOCK_INDEX:
       VALIDATE_TYPE_2(GL_UNIFORM, GL_BUFFER_VARIABLE);
       *val = RESOURCE_UNI(res)->block_index;

From 9e317271d7694d912da99e524294156b6c2de96e Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Thu, 10 Mar 2016 15:57:19 +1100
Subject: [PATCH 179/238] mesa: add support to query
 GL_TRANSFORM_FEEDBACK_BUFFER_INDEX

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp | 15 +++++++++------
 src/compiler/glsl/link_varyings.h   |  4 ++--
 src/mesa/main/mtypes.h              |  1 +
 src/mesa/main/shader_query.cpp      |  6 ++++++
 4 files changed, 18 insertions(+), 8 deletions(-)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index b091eaf5512..a3b7e1c65c7 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -725,8 +725,9 @@ tfeedback_decl::get_num_outputs() const
 bool
 tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
                       struct gl_transform_feedback_info *info,
-                      unsigned buffer, const unsigned max_outputs,
-                      bool *explicit_stride, bool has_xfb_qualifiers) const
+                      unsigned buffer, unsigned buffer_index,
+                      const unsigned max_outputs, bool *explicit_stride,
+                      bool has_xfb_qualifiers) const
 {
    assert(!this->next_buffer_separator);
 
@@ -823,6 +824,7 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
                                                          this->orig_name);
    info->Varyings[info->NumVarying].Type = this->type;
    info->Varyings[info->NumVarying].Size = this->size;
+   info->Varyings[info->NumVarying].BufferIndex = buffer_index;
    info->NumVarying++;
 
    return true;
@@ -976,8 +978,8 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
       /* GL_SEPARATE_ATTRIBS */
       for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
          if (!tfeedback_decls[i].store(ctx, prog, &prog->LinkedTransformFeedback,
-                                       num_buffers, num_outputs, NULL,
-                                       has_xfb_qualifiers))
+                                       num_buffers, num_buffers, num_outputs,
+                                       NULL, has_xfb_qualifiers))
             return false;
 
          buffers |= 1 << num_buffers;
@@ -1008,6 +1010,7 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
              buffer != tfeedback_decls[i].get_buffer()) {
             /* we have moved to the next buffer so reset stream id */
             buffer_stream_id = -1;
+            num_buffers++;
          }
 
          if (tfeedback_decls[i].is_next_buffer_separator()) {
@@ -1036,11 +1039,11 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
          } else {
             buffer = num_buffers;
          }
-         buffers |= 1 << num_buffers;
+         buffers |= 1 << buffer;
 
          if (!tfeedback_decls[i].store(ctx, prog,
                                        &prog->LinkedTransformFeedback,
-                                       num_buffers, num_outputs,
+                                       buffer, num_buffers, num_outputs,
                                        explicit_stride, has_xfb_qualifiers))
             return false;
       }
diff --git a/src/compiler/glsl/link_varyings.h b/src/compiler/glsl/link_varyings.h
index 9ea79f04fa8..543b80ff29b 100644
--- a/src/compiler/glsl/link_varyings.h
+++ b/src/compiler/glsl/link_varyings.h
@@ -98,8 +98,8 @@ public:
    unsigned get_num_outputs() const;
    bool store(struct gl_context *ctx, struct gl_shader_program *prog,
               struct gl_transform_feedback_info *info, unsigned buffer,
-              const unsigned max_outputs, bool *explicit_stride,
-              bool has_xfb_qualifiers) const;
+              unsigned buffer_index, const unsigned max_outputs,
+              bool *explicit_stride, bool has_xfb_qualifiers) const;
    const tfeedback_candidate *find_candidate(gl_shader_program *prog,
                                              hash_table *tfeedback_candidates);
 
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 320297cdfe9..33923d195c9 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1618,6 +1618,7 @@ struct gl_transform_feedback_varying_info
 {
    char *Name;
    GLenum Type;
+   GLint BufferIndex;
    GLint Size;
    GLint Offset;
 };
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 15cc49afbfc..ee2eeab0f8d 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -1322,6 +1322,12 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
       default:
          goto invalid_operation;
       }
+
+   case GL_TRANSFORM_FEEDBACK_BUFFER_INDEX:
+      VALIDATE_TYPE(GL_TRANSFORM_FEEDBACK_VARYING);
+      *val = RESOURCE_XFV(res)->BufferIndex;
+      return 1;
+
    default:
       goto invalid_enum;
    }

From 7234be0338813c0acd5b700ea2f7f20f7a972e51 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Thu, 10 Mar 2016 15:04:02 +1100
Subject: [PATCH 180/238] glsl: add transform feedback buffers to resource list

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/linker.cpp               | 14 +++++++++++++-
 src/compiler/glsl/program.h                |  3 ++-
 src/mesa/drivers/dri/i965/brw_link.cpp     |  2 +-
 src/mesa/program/ir_to_mesa.cpp            |  2 +-
 src/mesa/state_tracker/st_glsl_to_tgsi.cpp |  2 +-
 5 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index 4f191c5002d..510a22e5bd3 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -3852,7 +3852,8 @@ write_top_level_array_size_and_stride:
  * resource data.
  */
 void
-build_program_resource_list(struct gl_shader_program *shProg)
+build_program_resource_list(struct gl_context *ctx,
+                            struct gl_shader_program *shProg)
 {
    /* Rebuild resource list. */
    if (shProg->ProgramResourceList) {
@@ -3910,6 +3911,17 @@ build_program_resource_list(struct gl_shader_program *shProg)
       }
    }
 
+   /* Add transform feedback buffers. */
+   for (unsigned i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) {
+      if ((shProg->LinkedTransformFeedback.ActiveBuffers >> i) & 1) {
+         shProg->LinkedTransformFeedback.Buffers[i].Binding = i;
+         if (!add_program_resource(shProg, GL_TRANSFORM_FEEDBACK_BUFFER,
+                                   &shProg->LinkedTransformFeedback.Buffers[i],
+                                   0))
+         return;
+      }
+   }
+
    /* Add uniforms from uniform storage. */
    for (unsigned i = 0; i < shProg->NumUniformStorage; i++) {
       /* Do not add uniforms internally used by Mesa. */
diff --git a/src/compiler/glsl/program.h b/src/compiler/glsl/program.h
index 31bb9aa2435..8f5a31bd5ba 100644
--- a/src/compiler/glsl/program.h
+++ b/src/compiler/glsl/program.h
@@ -43,7 +43,8 @@ extern void
 link_shaders(struct gl_context *ctx, struct gl_shader_program *prog);
 
 extern void
-build_program_resource_list(struct gl_shader_program *shProg);
+build_program_resource_list(struct gl_context *ctx,
+                            struct gl_shader_program *shProg);
 
 extern void
 linker_error(struct gl_shader_program *prog, const char *fmt, ...)
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index b512f8b6ee1..c7d6fb8c79b 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -260,6 +260,6 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
    if (brw->precompile && !brw_shader_precompile(ctx, shProg))
       return false;
 
-   build_program_resource_list(shProg);
+   build_program_resource_list(ctx, shProg);
    return true;
 }
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 1d9047ee6fd..35a68562001 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -2976,7 +2976,7 @@ _mesa_ir_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
       _mesa_reference_program(ctx, &linked_prog, NULL);
    }
 
-   build_program_resource_list(prog);
+   build_program_resource_list(ctx, prog);
    return prog->LinkStatus;
 }
 
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 5e18e8be029..23786b85529 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -6811,7 +6811,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
       validate_ir_tree(ir);
    }
 
-   build_program_resource_list(prog);
+   build_program_resource_list(ctx, prog);
 
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       struct gl_program *linked_prog;

From c5704bb350425162011367e47be8c69d424a8797 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Thu, 10 Mar 2016 16:17:13 +1100
Subject: [PATCH 181/238] mesa: add query support for
 GL_TRANSFORM_FEEDBACK_BUFFER interface

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/compiler/glsl/link_varyings.cpp |  1 +
 src/mesa/main/mtypes.h              |  2 ++
 src/mesa/main/program_resource.c    | 16 ++++++++++++-
 src/mesa/main/shader_query.cpp      | 35 ++++++++++++++++++++++++++++-
 4 files changed, 52 insertions(+), 2 deletions(-)

diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index a3b7e1c65c7..848668c4381 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -826,6 +826,7 @@ tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
    info->Varyings[info->NumVarying].Size = this->size;
    info->Varyings[info->NumVarying].BufferIndex = buffer_index;
    info->NumVarying++;
+   info->Buffers[buffer].NumVaryings++;
 
    return true;
 }
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 33923d195c9..ff0707d03ba 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1650,6 +1650,8 @@ struct gl_transform_feedback_buffer
 {
    unsigned Binding;
 
+   unsigned NumVaryings;
+
    /**
     * Total number of components stored in each buffer.  This may be used by
     * hardware back-ends to determine the correct stride when interleaving
diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index 0d9f8aecf08..f2a9f006dd8 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -39,6 +39,7 @@ supported_interface_enum(struct gl_context *ctx, GLenum iface)
    case GL_UNIFORM_BLOCK:
    case GL_PROGRAM_INPUT:
    case GL_PROGRAM_OUTPUT:
+   case GL_TRANSFORM_FEEDBACK_BUFFER:
    case GL_TRANSFORM_FEEDBACK_VARYING:
    case GL_ATOMIC_COUNTER_BUFFER:
    case GL_BUFFER_VARIABLE:
@@ -105,7 +106,8 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
             (*params)++;
       break;
    case GL_MAX_NAME_LENGTH:
-      if (programInterface == GL_ATOMIC_COUNTER_BUFFER) {
+      if (programInterface == GL_ATOMIC_COUNTER_BUFFER ||
+          programInterface == GL_TRANSFORM_FEEDBACK_BUFFER) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glGetProgramInterfaceiv(%s pname %s)",
                      _mesa_enum_to_string(programInterface),
@@ -165,6 +167,16 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
             }
          }
          break;
+      case GL_TRANSFORM_FEEDBACK_BUFFER:
+         for (i = 0, *params = 0; i < shProg->NumProgramResourceList; i++) {
+            if (shProg->ProgramResourceList[i].Type == programInterface) {
+               struct gl_transform_feedback_buffer *buffer =
+                  (struct gl_transform_feedback_buffer *)
+                  shProg->ProgramResourceList[i].Data;
+               *params = MAX2(*params, buffer->NumVaryings);
+            }
+         }
+         break;
       default:
         _mesa_error(ctx, GL_INVALID_OPERATION,
                     "glGetProgramInterfaceiv(%s pname %s)",
@@ -289,6 +301,7 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
 
       return _mesa_program_resource_index(shProg, res);
    case GL_ATOMIC_COUNTER_BUFFER:
+   case GL_TRANSFORM_FEEDBACK_BUFFER:
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceIndex(%s)",
                   _mesa_enum_to_string(programInterface));
@@ -318,6 +331,7 @@ _mesa_GetProgramResourceName(GLuint program, GLenum programInterface,
       return;
 
    if (programInterface == GL_ATOMIC_COUNTER_BUFFER ||
+       programInterface == GL_TRANSFORM_FEEDBACK_BUFFER ||
        !supported_interface_enum(ctx, programInterface)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceName(%s)",
                   _mesa_enum_to_string(programInterface));
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index ee2eeab0f8d..993dc863220 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -61,6 +61,7 @@ DECL_RESOURCE_FUNC(UBO, gl_uniform_block);
 DECL_RESOURCE_FUNC(UNI, gl_uniform_storage);
 DECL_RESOURCE_FUNC(ATC, gl_active_atomic_buffer);
 DECL_RESOURCE_FUNC(XFV, gl_transform_feedback_varying_info);
+DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_buffer);
 DECL_RESOURCE_FUNC(SUB, gl_subroutine_function);
 
 void GLAPIENTRY
@@ -670,6 +671,7 @@ _mesa_program_resource_index(struct gl_shader_program *shProg,
       return RESOURCE_SUB(res)->index;
    case GL_UNIFORM_BLOCK:
    case GL_SHADER_STORAGE_BLOCK:
+   case GL_TRANSFORM_FEEDBACK_BUFFER:
    case GL_TRANSFORM_FEEDBACK_VARYING:
    default:
       return calc_resource_index(shProg, res);
@@ -707,6 +709,7 @@ _mesa_program_resource_find_index(struct gl_shader_program *shProg,
       case GL_UNIFORM_BLOCK:
       case GL_ATOMIC_COUNTER_BUFFER:
       case GL_SHADER_STORAGE_BLOCK:
+      case GL_TRANSFORM_FEEDBACK_BUFFER:
          if (_mesa_program_resource_index(shProg, res) == index)
             return res;
          break;
@@ -1009,7 +1012,8 @@ get_buffer_property(struct gl_shader_program *shProg,
    GET_CURRENT_CONTEXT(ctx);
    if (res->Type != GL_UNIFORM_BLOCK &&
        res->Type != GL_ATOMIC_COUNTER_BUFFER &&
-       res->Type != GL_SHADER_STORAGE_BLOCK)
+       res->Type != GL_SHADER_STORAGE_BLOCK &&
+       res->Type != GL_TRANSFORM_FEEDBACK_BUFFER)
       goto invalid_operation;
 
    if (res->Type == GL_UNIFORM_BLOCK) {
@@ -1110,6 +1114,30 @@ get_buffer_property(struct gl_shader_program *shProg,
          }
          return RESOURCE_ATC(res)->NumUniforms;
       }
+   } else if (res->Type == GL_TRANSFORM_FEEDBACK_BUFFER) {
+      switch (prop) {
+      case GL_BUFFER_BINDING:
+         *val = RESOURCE_XFB(res)->Binding;
+         return 1;
+      case GL_NUM_ACTIVE_VARIABLES:
+         *val = RESOURCE_XFB(res)->NumVaryings;
+         return 1;
+      case GL_ACTIVE_VARIABLES:
+         int i = 0;
+         for ( ; i < shProg->LinkedTransformFeedback.NumVarying; i++) {
+            unsigned index =
+               shProg->LinkedTransformFeedback.Varyings[i].BufferIndex;
+            struct gl_program_resource *buf_res =
+               _mesa_program_resource_find_index(shProg,
+                                                 GL_TRANSFORM_FEEDBACK_BUFFER,
+                                                 index);
+            assert(buf_res);
+            if (res == buf_res) {
+               *val++ = i;
+            }
+         }
+         return RESOURCE_XFB(res)->NumVaryings;
+      }
    }
    assert(!"support for property type not implemented");
 
@@ -1140,6 +1168,7 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
    case GL_NAME_LENGTH:
       switch (res->Type) {
       case GL_ATOMIC_COUNTER_BUFFER:
+      case GL_TRANSFORM_FEEDBACK_BUFFER:
          goto invalid_operation;
       default:
          /* Resource name length + terminator. */
@@ -1327,6 +1356,10 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
       VALIDATE_TYPE(GL_TRANSFORM_FEEDBACK_VARYING);
       *val = RESOURCE_XFV(res)->BufferIndex;
       return 1;
+   case GL_TRANSFORM_FEEDBACK_BUFFER_STRIDE:
+      VALIDATE_TYPE(GL_TRANSFORM_FEEDBACK_BUFFER);
+      *val = RESOURCE_XFB(res)->Stride * 4;
+      return 1;
 
    default:
       goto invalid_enum;

From b273958c747b77fe000b513caa3033cf1fde5422 Mon Sep 17 00:00:00 2001
From: Timothy Arceri <timothy.arceri@collabora.com>
Date: Thu, 10 Mar 2016 16:41:03 +1100
Subject: [PATCH 182/238] docs: mark xfb_* qualifiers as DONE

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 804a96c4a2b..f6248daa1b6 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -198,7 +198,7 @@ GL 4.4, GLSL 4.40:
   - explicit byte offsets for blocks                    DONE
   - forced alignment within blocks                      DONE
   - specified vec4-slot component numbers               in progress
-  - specified transform/feedback layout                 in progress
+  - specified transform/feedback layout                 DONE
   - input/output block locations                        DONE
   GL_ARB_multi_bind                                     DONE (all drivers)
   GL_ARB_query_buffer_object                            DONE (nvc0)

From 05ee6627d6112b2874f373e8302540e22ccc317c Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Wed, 30 Mar 2016 19:18:16 -0700
Subject: [PATCH 183/238] nir: Fix typo from commit 6702f1acde9.

---
 src/compiler/nir/nir_opt_algebraic.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index c2e56e71734..2fac9adafe6 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -271,7 +271,7 @@ optimizations = [
 
    # Propagate negation up multiplication chains
    (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))),
-   (('imul', ('ineg', a), b), ('ineg', ('fmul', a, b))),
+   (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
 
    # Misc. lowering
    (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),

From 10b189f985755496a179b663d4b0746f3717093b Mon Sep 17 00:00:00 2001
From: Dave Airlie <airlied@redhat.com>
Date: Thu, 31 Mar 2016 12:35:25 +1000
Subject: [PATCH 184/238] st/mesa: fix fallout from xfb changes.

Failed to update state tracker with new buffer interface.

Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
Signed-off-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/state_tracker/st_cb_xformfb.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/state_tracker/st_cb_xformfb.c b/src/mesa/state_tracker/st_cb_xformfb.c
index 0c01cd5ab78..a5cf3dfd5a9 100644
--- a/src/mesa/state_tracker/st_cb_xformfb.c
+++ b/src/mesa/state_tracker/st_cb_xformfb.c
@@ -125,7 +125,7 @@ st_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
 
       if (bo && bo->buffer) {
          unsigned stream =
-            obj->shader_program->LinkedTransformFeedback.BufferStream[i];
+            obj->shader_program->LinkedTransformFeedback.Buffers[i].Stream;
 
          /* Check whether we need to recreate the target. */
          if (!sobj->targets[i] ||
@@ -204,7 +204,7 @@ st_end_transform_feedback(struct gl_context *ctx,
 
    for (i = 0; i < ARRAY_SIZE(sobj->targets); i++) {
       unsigned stream =
-         obj->shader_program->LinkedTransformFeedback.BufferStream[i];
+         obj->shader_program->LinkedTransformFeedback.Buffers[i].Stream;
 
       /* Is it not bound or already set for this stream? */
       if (!sobj->targets[i] || sobj->draw_count[stream])

From 65bc94022b0fd31d01c8de65f7f1115a86baa65a Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sun, 14 Feb 2016 23:23:53 -0800
Subject: [PATCH 185/238] i965: Remove incorrect cycle estimates.

These printed the cycle count of the last basic block (sched.time is set
per basic block!). We have accurate, full-program data printed
elsewhere.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 .../drivers/dri/i965/brw_schedule_instructions.cpp     | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 5b54b51395c..51d9ce1a5d2 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -1674,11 +1674,6 @@ fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
                                   cfg->num_blocks, mode);
    sched.run(cfg);
 
-   if (unlikely(debug_enabled) && mode == SCHEDULE_POST) {
-      fprintf(stderr, "%s%d estimated execution time: %d cycles\n",
-              stage_abbrev, dispatch_width, sched.time);
-   }
-
    invalidate_live_intervals();
 }
 
@@ -1688,10 +1683,5 @@ vec4_visitor::opt_schedule_instructions()
    vec4_instruction_scheduler sched(this, prog_data->total_grf);
    sched.run(cfg);
 
-   if (unlikely(debug_enabled)) {
-      fprintf(stderr, "%s estimated execution time: %d cycles\n",
-              stage_abbrev, sched.time);
-   }
-
    invalidate_live_intervals();
 }

From 0d253ce34a8074a6c880d405c5bf860766afe358 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sun, 14 Feb 2016 23:21:03 -0800
Subject: [PATCH 186/238] i965: Simplify full scheduling-barrier conditions.

All of these were simply code for "architecture register file" (and in
the case of destinations, "not the null register").

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 .../dri/i965/brw_schedule_instructions.cpp    | 35 +++++--------------
 1 file changed, 8 insertions(+), 27 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 51d9ce1a5d2..2153898ef29 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -964,10 +964,7 @@ fs_instruction_scheduler::calculate_deps()
             }
          } else if (inst->src[i].is_accumulator()) {
             add_dep(last_accumulator_write, n);
-         } else if (inst->src[i].file != BAD_FILE &&
-                    inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM) {
-            assert(inst->src[i].file != MRF);
+         } else if (inst->src[i].file == ARF) {
             add_barrier_deps(n);
          }
       }
@@ -1026,8 +1023,7 @@ fs_instruction_scheduler::calculate_deps()
       } else if (inst->dst.is_accumulator()) {
          add_dep(last_accumulator_write, n);
          last_accumulator_write = n;
-      } else if (inst->dst.file != BAD_FILE &&
-                 !inst->dst.is_null()) {
+      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
          add_barrier_deps(n);
       }
 
@@ -1080,10 +1076,7 @@ fs_instruction_scheduler::calculate_deps()
             }
          } else if (inst->src[i].is_accumulator()) {
             add_dep(n, last_accumulator_write, 0);
-         } else if (inst->src[i].file != BAD_FILE &&
-                    inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM) {
-            assert(inst->src[i].file != MRF);
+         } else if (inst->src[i].file == ARF) {
             add_barrier_deps(n);
          }
       }
@@ -1140,8 +1133,7 @@ fs_instruction_scheduler::calculate_deps()
          }
       } else if (inst->dst.is_accumulator()) {
          last_accumulator_write = n;
-      } else if (inst->dst.file != BAD_FILE &&
-                 !inst->dst.is_null()) {
+      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
          add_barrier_deps(n);
       }
 
@@ -1204,12 +1196,7 @@ vec4_instruction_scheduler::calculate_deps()
          } else if (inst->src[i].is_accumulator()) {
             assert(last_accumulator_write);
             add_dep(last_accumulator_write, n);
-         } else if (inst->src[i].file != BAD_FILE &&
-                    inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM) {
-            /* No reads from MRF, and ATTR is already translated away */
-            assert(inst->src[i].file != MRF &&
-                   inst->src[i].file != ATTR);
+         } else if (inst->src[i].file == ARF) {
             add_barrier_deps(n);
          }
       }
@@ -1248,8 +1235,7 @@ vec4_instruction_scheduler::calculate_deps()
       } else if (inst->dst.is_accumulator()) {
          add_dep(last_accumulator_write, n);
          last_accumulator_write = n;
-      } else if (inst->dst.file != BAD_FILE &&
-                 !inst->dst.is_null()) {
+      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
          add_barrier_deps(n);
       }
 
@@ -1291,11 +1277,7 @@ vec4_instruction_scheduler::calculate_deps()
             add_dep(n, last_fixed_grf_write);
          } else if (inst->src[i].is_accumulator()) {
             add_dep(n, last_accumulator_write);
-         } else if (inst->src[i].file != BAD_FILE &&
-                    inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM) {
-            assert(inst->src[i].file != MRF &&
-                   inst->src[i].file != ATTR);
+         } else if (inst->src[i].file == ARF) {
             add_barrier_deps(n);
          }
       }
@@ -1330,8 +1312,7 @@ vec4_instruction_scheduler::calculate_deps()
          last_fixed_grf_write = n;
       } else if (inst->dst.is_accumulator()) {
          last_accumulator_write = n;
-      } else if (inst->dst.file != BAD_FILE &&
-                 !inst->dst.is_null()) {
+      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
          add_barrier_deps(n);
       }
 

From 436bdd7403bfa260ce6dedcbd0ba96df2ae40ce8 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Wed, 17 Feb 2016 11:04:50 -0800
Subject: [PATCH 187/238] Revert "i965: Don't add barrier deps for FB write
 messages."

This reverts commit d0e1d6b7e27bf5f05436e47080d326d7daa63af2.

The change in the vec4 code is a mistake -- there's never an
FS_OPCODE_FB_WRITE in vec4 code.

The change in the fs code had the (harmless) effect of not recognizing
an FB_WRITE as a scheduling barrier even if it was marked EOT --
harmless because the scheduler marked the last instruction of a block as
a barrier, something I'm changing in the following patches.

This will be reimplemented later in the series.
---
 src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 2153898ef29..66eb07e6d1a 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -939,9 +939,8 @@ fs_instruction_scheduler::calculate_deps()
    foreach_in_list(schedule_node, n, &instructions) {
       fs_inst *inst = (fs_inst *)n->inst;
 
-      if ((inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
-           inst->has_side_effects()) &&
-          inst->opcode != FS_OPCODE_FB_WRITE)
+      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
+         inst->has_side_effects())
          add_barrier_deps(n);
 
       /* read-after-write deps. */
@@ -1183,7 +1182,7 @@ vec4_instruction_scheduler::calculate_deps()
    foreach_in_list(schedule_node, n, &instructions) {
       vec4_instruction *inst = (vec4_instruction *)n->inst;
 
-      if (inst->has_side_effects() && inst->opcode != FS_OPCODE_FB_WRITE)
+      if (inst->has_side_effects())
          add_barrier_deps(n);
 
       /* read-after-write deps. */

From f60750968c66f7aa15181c4ba315bb594e615044 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 14 Mar 2016 17:39:19 -0700
Subject: [PATCH 188/238] i965/vec4/tcs: Set conditional mod on
 TCS_OPCODE_SRC0_010_IS_ZERO.

Missing this causes an assertion failure in the scheduler with the next
patch.

Additionally, this gives cmod propagation enough information to optimize
code better.

total instructions in shared programs: 7112991 -> 7112852 (-0.00%)
instructions in affected programs: 25704 -> 25565 (-0.54%)
helped: 139

total cycles in shared programs: 64812898 -> 64810674 (-0.00%)
cycles in affected programs: 127224 -> 125000 (-1.75%)
helped: 139

Acked-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 1 -
 src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp       | 4 +++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index 621c3321c81..8409e820f09 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1931,7 +1931,6 @@ generate_code(struct brw_codegen *p,
       case TCS_OPCODE_SRC0_010_IS_ZERO:
          /* If src_reg had stride like fs_reg, we wouldn't need this. */
          brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
-         brw_inst_set_cond_modifier(devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
          break;
 
       case TCS_OPCODE_RELEASE_INPUT:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index 2046b94bca1..84aa89a7865 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -184,7 +184,9 @@ vec4_tcs_visitor::emit_thread_end()
        * we don't have stride in the vec4 world, nor UV immediates in
        * align16, so we need an opcode to get invocation_id<0,4,0>.
        */
-      emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), invocation_id);
+      set_condmod(BRW_CONDITIONAL_Z,
+                  emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
+                       invocation_id));
       emit(IF(BRW_PREDICATE_NORMAL));
       for (unsigned i = 0; i < key->input_vertices; i += 2) {
          /* If we have an odd number of input vertices, the last will be

From 7b208a731277b4b99b86af3df98c1219099036d7 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 15 Feb 2016 10:05:33 -0800
Subject: [PATCH 189/238] i965: Relax restriction on scheduling last
 instruction.

I think when this code was written, basic blocks were always ended by a
control flow instruction or an end-of-thread message. That's no longer
the case, and removing this restriction actually helps things:

   instructions in affected programs: 7267 -> 7244 (-0.32%)
   helped: 4

   total cycles in shared programs: 66559580 -> 66431900 (-0.19%)
   cycles in affected programs: 28310152 -> 28182472 (-0.45%)
   helped: 9577
   HURT: 879

   GAINED: 2

The addition of the is_control_flow() checks is not a functional change,
since add_insts_from_block() does not put control flow instructions in
the list of instructions to schedule. I plan to change this in a later patch.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 .../dri/i965/brw_schedule_instructions.cpp    | 23 +++----------------
 1 file changed, 3 insertions(+), 20 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 66eb07e6d1a..46b45a5ea01 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -923,15 +923,6 @@ fs_instruction_scheduler::calculate_deps()
     */
    schedule_node *last_fixed_grf_write = NULL;
 
-   /* The last instruction always needs to still be the last
-    * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
-    * WHILE) and scheduling other things after it would disturb the
-    * basic block, or it's FB_WRITE and we should do a better job at
-    * dead code elimination anyway.
-    */
-   schedule_node *last = (schedule_node *)instructions.get_tail();
-   add_barrier_deps(last);
-
    memset(last_grf_write, 0, sizeof(last_grf_write));
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
 
@@ -940,7 +931,8 @@ fs_instruction_scheduler::calculate_deps()
       fs_inst *inst = (fs_inst *)n->inst;
 
       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
-         inst->has_side_effects())
+          inst->is_control_flow() ||
+          inst->has_side_effects())
          add_barrier_deps(n);
 
       /* read-after-write deps. */
@@ -1166,15 +1158,6 @@ vec4_instruction_scheduler::calculate_deps()
     */
    schedule_node *last_fixed_grf_write = NULL;
 
-   /* The last instruction always needs to still be the last instruction.
-    * Either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and scheduling
-    * other things after it would disturb the basic block, or it's the EOT
-    * URB_WRITE and we should do a better job at dead code eliminating
-    * anything that could have been scheduled after it.
-    */
-   schedule_node *last = (schedule_node *)instructions.get_tail();
-   add_barrier_deps(last);
-
    memset(last_grf_write, 0, sizeof(last_grf_write));
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
 
@@ -1182,7 +1165,7 @@ vec4_instruction_scheduler::calculate_deps()
    foreach_in_list(schedule_node, n, &instructions) {
       vec4_instruction *inst = (vec4_instruction *)n->inst;
 
-      if (inst->has_side_effects())
+      if (inst->is_control_flow() || inst->has_side_effects())
          add_barrier_deps(n);
 
       /* read-after-write deps. */

From a607f4aa57def51236687ec17d7a6391fb147333 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 15 Feb 2016 10:42:14 -0800
Subject: [PATCH 190/238] i965: Assert that an instruction is not inserted
 around itself.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_shader.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 21977a23130..736deb443dd 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -948,6 +948,8 @@ adjust_later_block_ips(bblock_t *start_block, int ip_adjustment)
 void
 backend_instruction::insert_after(bblock_t *block, backend_instruction *inst)
 {
+   assert(this != inst);
+
    if (!this->is_head_sentinel())
       assert(inst_is_in_block(block, this) || !"Instruction not in block");
 
@@ -961,6 +963,8 @@ backend_instruction::insert_after(bblock_t *block, backend_instruction *inst)
 void
 backend_instruction::insert_before(bblock_t *block, backend_instruction *inst)
 {
+   assert(this != inst);
+
    if (!this->is_tail_sentinel())
       assert(inst_is_in_block(block, this) || !"Instruction not in block");
 

From b4e223cfbf4d46e2ca4c7313f4ebd52798d21551 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Mon, 15 Feb 2016 10:43:39 -0800
Subject: [PATCH 191/238] i965: Remove NOP insertion kludge in scheduler.

Instead of removing every instruction in add_insts_from_block(), just
move the instruction to its scheduled location. This is a step towards
doing both bottom-up and top-down scheduling without conflicts.

Note that this patch changes cycle counts for programs because it begins
including control flow instructions in the estimates.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 .../dri/i965/brw_schedule_instructions.cpp    | 25 ++++---------------
 1 file changed, 5 insertions(+), 20 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 46b45a5ea01..98fa5e3117f 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -783,26 +783,13 @@ schedule_node::schedule_node(backend_instruction *inst,
 void
 instruction_scheduler::add_insts_from_block(bblock_t *block)
 {
-   /* Removing the last instruction from a basic block removes the block as
-    * well, so put a NOP at the end to keep it alive.
-    */
-   if (!block->end()->is_control_flow()) {
-      backend_instruction *nop = new(mem_ctx) backend_instruction();
-      nop->opcode = BRW_OPCODE_NOP;
-      block->end()->insert_after(block, nop);
-   }
-
-   foreach_inst_in_block_safe(backend_instruction, inst, block) {
-      if (inst->opcode == BRW_OPCODE_NOP || inst->is_control_flow())
-         continue;
-
+   foreach_inst_in_block(backend_instruction, inst, block) {
       schedule_node *n = new(mem_ctx) schedule_node(inst, this);
 
-      this->instructions_to_schedule++;
-
-      inst->remove(block);
       instructions.push_tail(n);
    }
+
+   this->instructions_to_schedule = block->end_ip - block->start_ip + 1;
 }
 
 /** Recursive computation of the delay member of a node. */
@@ -1463,7 +1450,6 @@ void
 instruction_scheduler::schedule_instructions(bblock_t *block)
 {
    const struct brw_device_info *devinfo = bs->devinfo;
-   backend_instruction *inst = block->end();
    time = 0;
    if (!post_reg_alloc)
       reg_pressure = reg_pressure_in[block->num];
@@ -1482,7 +1468,8 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
       /* Schedule this instruction. */
       assert(chosen);
       chosen->remove();
-      inst->insert_before(block, chosen->inst);
+      chosen->inst->exec_node::remove();
+      block->instructions.push_tail(chosen->inst);
       instructions_to_schedule--;
 
       if (!post_reg_alloc) {
@@ -1551,8 +1538,6 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
       }
    }
 
-   if (block->end()->opcode == BRW_OPCODE_NOP)
-      block->end()->remove(block);
    assert(instructions_to_schedule == 0);
 
    block->cycle_count = time;

From 3495265158cce55d24a7a7f38a0a40d8c9448d38 Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sat, 12 Mar 2016 21:15:19 -0800
Subject: [PATCH 192/238] i965: Add and use is_scheduling_barrier() function.

---
 .../dri/i965/brw_schedule_instructions.cpp    | 21 +++++++++++++++----
 1 file changed, 17 insertions(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 98fa5e3117f..befa9ff3239 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -892,6 +892,14 @@ fs_instruction_scheduler::is_compressed(fs_inst *inst)
    return inst->exec_size == 16;
 }
 
+static bool
+is_scheduling_barrier(const fs_inst *inst)
+{
+   return inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
+          inst->is_control_flow() ||
+          inst->has_side_effects();
+}
+
 void
 fs_instruction_scheduler::calculate_deps()
 {
@@ -917,9 +925,7 @@ fs_instruction_scheduler::calculate_deps()
    foreach_in_list(schedule_node, n, &instructions) {
       fs_inst *inst = (fs_inst *)n->inst;
 
-      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
-          inst->is_control_flow() ||
-          inst->has_side_effects())
+      if (is_scheduling_barrier(inst))
          add_barrier_deps(n);
 
       /* read-after-write deps. */
@@ -1131,6 +1137,13 @@ fs_instruction_scheduler::calculate_deps()
    }
 }
 
+static bool
+is_scheduling_barrier(const vec4_instruction *inst)
+{
+   return inst->is_control_flow() ||
+          inst->has_side_effects();
+}
+
 void
 vec4_instruction_scheduler::calculate_deps()
 {
@@ -1152,7 +1165,7 @@ vec4_instruction_scheduler::calculate_deps()
    foreach_in_list(schedule_node, n, &instructions) {
       vec4_instruction *inst = (vec4_instruction *)n->inst;
 
-      if (inst->is_control_flow() || inst->has_side_effects())
+      if (is_scheduling_barrier(inst))
          add_barrier_deps(n);
 
       /* read-after-write deps. */

From 4fea98991c8f94f14e469d4621eddc5247d4efbd Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Sat, 12 Mar 2016 21:16:03 -0800
Subject: [PATCH 193/238] i965: Don't add barrier deps for FB write messages.

Ken did this earlier, and this is just me reimplementing his patch a
little differently.

Reviewed-by: Francisco Jerez <currojerez@riseup.net>
---
 src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index befa9ff3239..8d925843732 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -897,7 +897,8 @@ is_scheduling_barrier(const fs_inst *inst)
 {
    return inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
           inst->is_control_flow() ||
-          inst->has_side_effects();
+          inst->eot ||
+          (inst->has_side_effects() && inst->opcode != FS_OPCODE_FB_WRITE);
 }
 
 void

From 903640c2accb4617afd7036f47cbd14077586394 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 19 Feb 2016 13:20:48 -0500
Subject: [PATCH 194/238] glsl: add gl_MaxSamples, new in GL 4.5 / GL ES 3.2

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/compiler/glsl/builtin_variables.cpp  | 3 +++
 src/compiler/glsl/glsl_parser_extras.cpp | 3 +++
 src/compiler/glsl/glsl_parser_extras.h   | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/src/compiler/glsl/builtin_variables.cpp b/src/compiler/glsl/builtin_variables.cpp
index 76a22cee29c..6a772911f76 100644
--- a/src/compiler/glsl/builtin_variables.cpp
+++ b/src/compiler/glsl/builtin_variables.cpp
@@ -878,6 +878,9 @@ builtin_variable_generator::generate_constants()
       add_const("gl_MaxTessControlUniformComponents", state->Const.MaxTessControlUniformComponents);
       add_const("gl_MaxTessEvaluationUniformComponents", state->Const.MaxTessEvaluationUniformComponents);
    }
+
+   if (state->is_version(450, 320))
+      add_const("gl_MaxSamples", state->Const.MaxSamples);
 }
 
 
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index 0ce89ceb3a8..e592e5cd55c 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -179,6 +179,9 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
    this->Const.MaxTessControlUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxUniformComponents;
    this->Const.MaxTessEvaluationUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxUniformComponents;
 
+   /* GL 4.5 / OES_sample_variables */
+   this->Const.MaxSamples = ctx->Const.MaxSamples;
+
    this->current_function = NULL;
    this->toplevel_ir = NULL;
    this->found_return = false;
diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h
index 86008b48519..6185d14a20d 100644
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -461,6 +461,9 @@ struct _mesa_glsl_parse_state {
       unsigned MaxTessControlTotalOutputComponents;
       unsigned MaxTessControlUniformComponents;
       unsigned MaxTessEvaluationUniformComponents;
+
+      /* GL 4.5 / OES_sample_variables */
+      unsigned MaxSamples;
    } Const;
 
    /**

From 6a8ca859f913cf56bc8abce6d1cde02b36a74289 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 16 Feb 2016 01:20:15 -0500
Subject: [PATCH 195/238] mesa: add OES_sample_variables to extension table,
 add enable bit

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mesa/main/extensions_table.h | 1 +
 src/mesa/main/mtypes.h           | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 7885aefcc28..700ef24ab22 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -330,6 +330,7 @@ EXT(OES_point_sprite                        , ARB_point_sprite
 EXT(OES_query_matrix                        , dummy_true                             ,  x ,  x , ES1,  x , 2003)
 EXT(OES_read_format                         , dummy_true                             , GLL, GLC, ES1,  x , 2003)
 EXT(OES_rgb8_rgba8                          , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
+EXT(OES_sample_variables                    , OES_sample_variables                   ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_shader_image_atomic                 , ARB_shader_image_load_store            ,  x ,  x ,  x ,  31, 2015)
 EXT(OES_single_precision                    , dummy_true                             ,  x ,  x , ES1,  x , 2003)
 EXT(OES_standard_derivatives                , OES_standard_derivatives               ,  x ,  x ,  x , ES2, 2005)
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index ff0707d03ba..be8c21e08db 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3911,6 +3911,7 @@ struct gl_extensions
    GLboolean EXT_transform_feedback;
    GLboolean EXT_timer_query;
    GLboolean EXT_vertex_array_bgra;
+   GLboolean OES_sample_variables;
    GLboolean OES_standard_derivatives;
    GLboolean OES_texture_buffer;
    /* vendor extensions */

From 5283e810157a3c392c9887e51c6ee0df849a4973 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Fri, 19 Feb 2016 13:23:10 -0500
Subject: [PATCH 196/238] glsl: add GL_OES_sample_variables support

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/compiler/glsl/builtin_variables.cpp  | 15 +++++++++++----
 src/compiler/glsl/glcpp/glcpp-parse.y    |  2 ++
 src/compiler/glsl/glsl_parser_extras.cpp |  1 +
 src/compiler/glsl/glsl_parser_extras.h   |  2 ++
 4 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/src/compiler/glsl/builtin_variables.cpp b/src/compiler/glsl/builtin_variables.cpp
index 6a772911f76..7d77f705356 100644
--- a/src/compiler/glsl/builtin_variables.cpp
+++ b/src/compiler/glsl/builtin_variables.cpp
@@ -879,7 +879,8 @@ builtin_variable_generator::generate_constants()
       add_const("gl_MaxTessEvaluationUniformComponents", state->Const.MaxTessEvaluationUniformComponents);
    }
 
-   if (state->is_version(450, 320))
+   if (state->is_version(450, 320) ||
+       state->OES_sample_variables_enable)
       add_const("gl_MaxSamples", state->Const.MaxSamples);
 }
 
@@ -890,7 +891,9 @@ builtin_variable_generator::generate_constants()
 void
 builtin_variable_generator::generate_uniforms()
 {
-   if (state->is_version(400, 0) || state->ARB_sample_shading_enable)
+   if (state->is_version(400, 320) ||
+       state->ARB_sample_shading_enable ||
+       state->OES_sample_variables_enable)
       add_uniform(int_t, "gl_NumSamples");
    add_uniform(type("gl_DepthRangeParameters"), "gl_DepthRange");
    add_uniform(array(vec4_t, VERT_ATTRIB_MAX), "gl_CurrentAttribVertMESA");
@@ -1143,7 +1146,9 @@ builtin_variable_generator::generate_fs_special_vars()
          var->enable_extension_warning("GL_AMD_shader_stencil_export");
    }
 
-   if (state->is_version(400, 0) || state->ARB_sample_shading_enable) {
+   if (state->is_version(400, 320) ||
+       state->ARB_sample_shading_enable ||
+       state->OES_sample_variables_enable) {
       add_system_value(SYSTEM_VALUE_SAMPLE_ID, int_t, "gl_SampleID");
       add_system_value(SYSTEM_VALUE_SAMPLE_POS, vec2_t, "gl_SamplePosition");
       /* From the ARB_sample_shading specification:
@@ -1156,7 +1161,9 @@ builtin_variable_generator::generate_fs_special_vars()
       add_output(FRAG_RESULT_SAMPLE_MASK, array(int_t, 1), "gl_SampleMask");
    }
 
-   if (state->is_version(400, 0) || state->ARB_gpu_shader5_enable) {
+   if (state->is_version(400, 320) ||
+       state->ARB_gpu_shader5_enable ||
+       state->OES_sample_variables_enable) {
       add_system_value(SYSTEM_VALUE_SAMPLE_MASK_IN, array(int_t, 1), "gl_SampleMaskIn");
    }
 
diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y
index fbbf85bfdae..14d2c7d3eff 100644
--- a/src/compiler/glsl/glcpp/glcpp-parse.y
+++ b/src/compiler/glsl/glcpp/glcpp-parse.y
@@ -2371,6 +2371,8 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 	   if (extensions != NULL) {
 	      if (extensions->OES_EGL_image_external)
 	         add_builtin_define(parser, "GL_OES_EGL_image_external", 1);
+              if (extensions->OES_sample_variables)
+                 add_builtin_define(parser, "GL_OES_sample_variables", 1);
               if (extensions->OES_standard_derivatives)
                  add_builtin_define(parser, "GL_OES_standard_derivatives", 1);
               if (extensions->ARB_texture_multisample)
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index e592e5cd55c..def86e18223 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -615,6 +615,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(OES_geometry_point_size,        false, true,      OES_geometry_shader),
    EXT(OES_geometry_shader,            false, true,      OES_geometry_shader),
    EXT(OES_gpu_shader5,                false, true,      ARB_gpu_shader5),
+   EXT(OES_sample_variables,           false, true,      OES_sample_variables),
    EXT(OES_shader_image_atomic,        false, true,      ARB_shader_image_load_store),
    EXT(OES_standard_derivatives,       false, true,      OES_standard_derivatives),
    EXT(OES_texture_3D,                 false, true,      dummy_true),
diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h
index 6185d14a20d..219fb427c0f 100644
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -604,6 +604,8 @@ struct _mesa_glsl_parse_state {
    bool OES_geometry_shader_warn;
    bool OES_gpu_shader5_enable;
    bool OES_gpu_shader5_warn;
+   bool OES_sample_variables_enable;
+   bool OES_sample_variables_warn;
    bool OES_shader_image_atomic_enable;
    bool OES_shader_image_atomic_warn;
    bool OES_standard_derivatives_enable;

From 411a88accc8a2728abbdfbef4315addbc08cf5a3 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 16 Feb 2016 14:29:38 -0500
Subject: [PATCH 197/238] mesa: add GL_OES_sample_shading support

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/mapi/glapi/gen/es_EXT.xml           | 6 ++++++
 src/mesa/main/enable.c                  | 4 ++--
 src/mesa/main/extensions_table.h        | 1 +
 src/mesa/main/multisample.c             | 3 ++-
 src/mesa/main/tests/dispatch_sanity.cpp | 3 +++
 5 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index 8f8f997b20d..8e51c058321 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -798,6 +798,12 @@
     </function>
 </category>
 
+<category name="GL_OES_sample_shading" number="169">
+    <function name="MinSampleShadingOES" alias="MinSampleShading" es2="3.0">
+        <param name="value" type="GLfloat"/>
+    </function>
+</category>
+
 <!-- 174. GL_OES_texture_storage_multisample_2d_array -->
 <category name="GL_OES_texture_storage_multisample_2d_array" number="174">
     <enum name="TEXTURE_2D_MULTISAMPLE_ARRAY_OES"              value="0x9102"/>
diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c
index b90a60ba03f..d2830770ec2 100644
--- a/src/mesa/main/enable.c
+++ b/src/mesa/main/enable.c
@@ -807,7 +807,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state)
 
       /* GL_ARB_sample_shading */
       case GL_SAMPLE_SHADING:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
             goto invalid_enum_error;
          CHECK_EXTENSION(ARB_sample_shading, cap);
          if (ctx->Multisample.SampleShading == state)
@@ -1606,7 +1606,7 @@ _mesa_IsEnabled( GLenum cap )
 
       /* ARB_sample_shading */
       case GL_SAMPLE_SHADING:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
             goto invalid_enum_error;
          CHECK_EXTENSION(ARB_sample_shading);
          return ctx->Multisample.SampleShading;
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 700ef24ab22..c1bcfc53d10 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -330,6 +330,7 @@ EXT(OES_point_sprite                        , ARB_point_sprite
 EXT(OES_query_matrix                        , dummy_true                             ,  x ,  x , ES1,  x , 2003)
 EXT(OES_read_format                         , dummy_true                             , GLL, GLC, ES1,  x , 2003)
 EXT(OES_rgb8_rgba8                          , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
+EXT(OES_sample_shading                      , OES_sample_variables                   ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_sample_variables                    , OES_sample_variables                   ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_shader_image_atomic                 , ARB_shader_image_load_store            ,  x ,  x ,  x ,  31, 2015)
 EXT(OES_single_precision                    , dummy_true                             ,  x ,  x , ES1,  x , 2003)
diff --git a/src/mesa/main/multisample.c b/src/mesa/main/multisample.c
index 77773a20883..5453e38632e 100644
--- a/src/mesa/main/multisample.c
+++ b/src/mesa/main/multisample.c
@@ -127,7 +127,8 @@ _mesa_MinSampleShading(GLclampf value)
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   if (!ctx->Extensions.ARB_sample_shading || !_mesa_is_desktop_gl(ctx)) {
+   if (!_mesa_has_ARB_sample_shading(ctx) &&
+       !_mesa_has_OES_sample_shading(ctx)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "glMinSampleShading");
       return;
    }
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index 309e574ac32..06e7ec1a195 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -2454,6 +2454,9 @@ const struct function gles3_functions_possible[] = {
    { "glTexBufferOES", 31, -1 },
    { "glTexBufferRangeOES", 31, -1 },
 
+   /* GL_OES_sample_shading */
+   { "glMinSampleShadingOES", 30, -1 },
+
    { NULL, 0, -1 }
 };
 

From 3002296cb68ebc9705b29e024e5fc67d5565ed46 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Sat, 20 Feb 2016 15:03:55 -0500
Subject: [PATCH 198/238] mesa: add GL_OES_shader_multisample_interpolation
 support

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Dave Airlie <airlied@redhat.com>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
---
 src/compiler/glsl/builtin_functions.cpp  | 12 +++++++-----
 src/compiler/glsl/glcpp/glcpp-parse.y    |  4 +++-
 src/compiler/glsl/glsl_lexer.ll          |  2 +-
 src/compiler/glsl/glsl_parser_extras.cpp |  1 +
 src/compiler/glsl/glsl_parser_extras.h   |  2 ++
 src/mesa/main/extensions_table.h         |  1 +
 src/mesa/main/get.c                      |  5 +++++
 src/mesa/main/get_hash_params.py         | 11 ++++++++---
 8 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/src/compiler/glsl/builtin_functions.cpp b/src/compiler/glsl/builtin_functions.cpp
index 62f07b2460e..65309fdc09c 100644
--- a/src/compiler/glsl/builtin_functions.cpp
+++ b/src/compiler/glsl/builtin_functions.cpp
@@ -264,10 +264,12 @@ shader_packing_or_es31_or_gpu_shader5(const _mesa_glsl_parse_state *state)
 }
 
 static bool
-fs_gpu_shader5(const _mesa_glsl_parse_state *state)
+fs_interpolate_at(const _mesa_glsl_parse_state *state)
 {
    return state->stage == MESA_SHADER_FRAGMENT &&
-          (state->is_version(400, 0) || state->ARB_gpu_shader5_enable);
+          (state->is_version(400, 320) ||
+           state->ARB_gpu_shader5_enable ||
+           state->OES_shader_multisample_interpolation_enable);
 }
 
 
@@ -5165,7 +5167,7 @@ builtin_builder::_interpolateAtCentroid(const glsl_type *type)
 {
    ir_variable *interpolant = in_var(type, "interpolant");
    interpolant->data.must_be_shader_input = 1;
-   MAKE_SIG(type, fs_gpu_shader5, 1, interpolant);
+   MAKE_SIG(type, fs_interpolate_at, 1, interpolant);
 
    body.emit(ret(interpolate_at_centroid(interpolant)));
 
@@ -5178,7 +5180,7 @@ builtin_builder::_interpolateAtOffset(const glsl_type *type)
    ir_variable *interpolant = in_var(type, "interpolant");
    interpolant->data.must_be_shader_input = 1;
    ir_variable *offset = in_var(glsl_type::vec2_type, "offset");
-   MAKE_SIG(type, fs_gpu_shader5, 2, interpolant, offset);
+   MAKE_SIG(type, fs_interpolate_at, 2, interpolant, offset);
 
    body.emit(ret(interpolate_at_offset(interpolant, offset)));
 
@@ -5191,7 +5193,7 @@ builtin_builder::_interpolateAtSample(const glsl_type *type)
    ir_variable *interpolant = in_var(type, "interpolant");
    interpolant->data.must_be_shader_input = 1;
    ir_variable *sample_num = in_var(glsl_type::int_type, "sample_num");
-   MAKE_SIG(type, fs_gpu_shader5, 2, interpolant, sample_num);
+   MAKE_SIG(type, fs_interpolate_at, 2, interpolant, sample_num);
 
    body.emit(ret(interpolate_at_sample(interpolant, sample_num)));
 
diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y
index 14d2c7d3eff..e8646c0ad32 100644
--- a/src/compiler/glsl/glcpp/glcpp-parse.y
+++ b/src/compiler/glsl/glcpp/glcpp-parse.y
@@ -2371,8 +2371,10 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 	   if (extensions != NULL) {
 	      if (extensions->OES_EGL_image_external)
 	         add_builtin_define(parser, "GL_OES_EGL_image_external", 1);
-              if (extensions->OES_sample_variables)
+              if (extensions->OES_sample_variables) {
                  add_builtin_define(parser, "GL_OES_sample_variables", 1);
+                 add_builtin_define(parser, "GL_OES_shader_multisample_interpolation", 1);
+              }
               if (extensions->OES_standard_derivatives)
                  add_builtin_define(parser, "GL_OES_standard_derivatives", 1);
               if (extensions->ARB_texture_multisample)
diff --git a/src/compiler/glsl/glsl_lexer.ll b/src/compiler/glsl/glsl_lexer.ll
index 5492045f7c3..0b7695f8d3e 100644
--- a/src/compiler/glsl/glsl_lexer.ll
+++ b/src/compiler/glsl/glsl_lexer.ll
@@ -584,7 +584,7 @@ usamplerBuffer	KEYWORD_WITH_ALT(140, 300, 140, 320, yyextra->EXT_texture_buffer_
 
     /* Additional reserved words in GLSL ES 3.00 */
 resource	KEYWORD(0, 300, 0, 0, RESOURCE);
-sample		KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_gpu_shader5_enable, SAMPLE);
+sample		KEYWORD_WITH_ALT(400, 300, 400, 320, yyextra->ARB_gpu_shader5_enable || yyextra->OES_shader_multisample_interpolation_enable, SAMPLE);
 subroutine	KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_shader_subroutine_enable, SUBROUTINE);
 
 
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index def86e18223..3dc68741902 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -617,6 +617,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(OES_gpu_shader5,                false, true,      ARB_gpu_shader5),
    EXT(OES_sample_variables,           false, true,      OES_sample_variables),
    EXT(OES_shader_image_atomic,        false, true,      ARB_shader_image_load_store),
+   EXT(OES_shader_multisample_interpolation, false, true, OES_sample_variables),
    EXT(OES_standard_derivatives,       false, true,      OES_standard_derivatives),
    EXT(OES_texture_3D,                 false, true,      dummy_true),
    EXT(OES_texture_buffer,             false, true,      OES_texture_buffer),
diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h
index 219fb427c0f..0cc2d259f3a 100644
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -608,6 +608,8 @@ struct _mesa_glsl_parse_state {
    bool OES_sample_variables_warn;
    bool OES_shader_image_atomic_enable;
    bool OES_shader_image_atomic_warn;
+   bool OES_shader_multisample_interpolation_enable;
+   bool OES_shader_multisample_interpolation_warn;
    bool OES_standard_derivatives_enable;
    bool OES_standard_derivatives_warn;
    bool OES_texture_3D_enable;
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index c1bcfc53d10..84401fd501b 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -333,6 +333,7 @@ EXT(OES_rgb8_rgba8                          , dummy_true
 EXT(OES_sample_shading                      , OES_sample_variables                   ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_sample_variables                    , OES_sample_variables                   ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_shader_image_atomic                 , ARB_shader_image_load_store            ,  x ,  x ,  x ,  31, 2015)
+EXT(OES_shader_multisample_interpolation    , OES_sample_variables                   ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_single_precision                    , dummy_true                             ,  x ,  x , ES1,  x , 2003)
 EXT(OES_standard_derivatives                , OES_standard_derivatives               ,  x ,  x ,  x , ES2, 2005)
 EXT(OES_stencil1                            , dummy_false                            ,  x ,  x ,  x ,  x , 2005)
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 88efd3ee642..6829c33254c 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -408,6 +408,15 @@ static const int extra_ARB_gpu_shader5_or_oes_geometry_shader[] = {
    EXTRA_END
 };
 
+/* Gates the FRAGMENT_INTERPOLATION_OFFSET queries in get_hash_params.py.
+ * Like the other extra_* lists, it must be terminated with EXTRA_END.
+ */
+static const int extra_ARB_gpu_shader5_or_OES_sample_variables[] = {
+   EXT(ARB_gpu_shader5),
+   EXT(OES_sample_variables),
+   EXTRA_END
+};
+
 EXTRA_EXT(ARB_texture_cube_map);
 EXTRA_EXT(EXT_texture_array);
 EXTRA_EXT(NV_fog_distance);
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 62968fc0300..7998d0366d3 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -503,6 +503,14 @@ descriptor=[
   [ "MAX_COMBINED_SHADER_OUTPUT_RESOURCES", "CONTEXT_INT(Const.MaxCombinedShaderOutputResources), extra_ARB_shader_image_load_store_shader_storage_buffer_object_es31" ],
 ]},
 
+# Enums in OpenGL Core profile and ES 3.0
+{ "apis": ["GL_CORE", "GLES3"], "params": [
+  # GL_ARB_gpu_shader5 / GL_OES_shader_multisample_interpolation
+  [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
+  [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
+  [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
+]},
+
 # Enums in OpenGL Core profile and ES 3.1
 { "apis": ["GL_CORE", "GLES31"], "params": [
 # GL_ARB_draw_indirect / GLES 3.1
@@ -882,9 +890,6 @@ descriptor=[
 
 # GL_ARB_gpu_shader5
   [ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5" ],
-  [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
-  [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
-  [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5" ],
 
 # GL_ARB_tessellation_shader
   [ "PATCH_VERTICES", "CONTEXT_INT(TessCtrlProgram.patch_vertices), extra_ARB_tessellation_shader" ],

From 2c7f5fe2960362b266aeb8e1ed0ebea762131df5 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Tue, 16 Feb 2016 01:27:27 -0500
Subject: [PATCH 199/238] st/mesa: add ES sample-shading support

We require the full ARB_gpu_shader5 for now, but in the future some
other CAP could get exposed to indicate that only the multisample-related
behavior of ARB_gpu_shader5 is available.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt                           | 6 +++---
 src/mesa/state_tracker/st_extensions.c | 6 ++++++
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index f6248daa1b6..489b8219351 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -262,11 +262,11 @@ GLES3.2, GLSL ES 3.2
   GL_OES_geometry_shader                                started (Marta)
   GL_OES_gpu_shader5                                    DONE (all drivers that support GL_ARB_gpu_shader5)
   GL_OES_primitive_bounding box                         not started
-  GL_OES_sample_shading                                 not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
-  GL_OES_sample_variables                               not started (based on parts of GL_ARB_sample_shading, which is done for some drivers)
+  GL_OES_sample_shading                                 DONE (nvc0, r600, radeonsi)
+  GL_OES_sample_variables                               DONE (nvc0, r600, radeonsi)
   GL_OES_shader_image_atomic                            DONE (all drivers that support GL_ARB_shader_image_load_store)
   GL_OES_shader_io_blocks                               not started (based on parts of GLSL 1.50, which is done)
-  GL_OES_shader_multisample_interpolation               not started (based on parts of GL_ARB_gpu_shader5, which is done)
+  GL_OES_shader_multisample_interpolation               DONE (nvc0, r600, radeonsi)
   GL_OES_tessellation_shader                            not started (based on GL_ARB_tessellation_shader, which is done for some drivers)
   GL_OES_texture_border_clamp                           DONE (all drivers)
   GL_OES_texture_buffer                                 DONE (core only)
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 36a12010c23..0a25770aa02 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -921,6 +921,12 @@ void st_init_extensions(struct pipe_screen *screen,
       extensions->ARB_sync = GL_TRUE;
    }
 
+   /* Needs PIPE_CAP_SAMPLE_SHADING + all the sample-related bits of
+    * ARB_gpu_shader5. This enables all the per-sample shading ES extensions.
+    */
+   extensions->OES_sample_variables = extensions->ARB_sample_shading &&
+      extensions->ARB_gpu_shader5;
+
    /* Maximum sample count. */
    {
       enum pipe_format color_formats[] = {

From 571f538a622d9a7050015b58b7b1ac240f289dcb Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 30 Mar 2016 22:03:06 -0400
Subject: [PATCH 200/238] mesa: remove duplicate
 MAX_GEOMETRY_SHADER_INVOCATIONS entry

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/mesa/main/get_hash_params.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 7998d0366d3..a0cc4f8e842 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -888,9 +888,6 @@ descriptor=[
   [ "VIEWPORT_BOUNDS_RANGE", "CONTEXT_FLOAT2(Const.ViewportBounds), extra_ARB_viewport_array" ],
   [ "VIEWPORT_INDEX_PROVOKING_VERTEX", "CONTEXT_ENUM(Const.LayerAndVPIndexProvokingVertex), extra_ARB_viewport_array" ],
 
-# GL_ARB_gpu_shader5
-  [ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5" ],
-
 # GL_ARB_tessellation_shader
   [ "PATCH_VERTICES", "CONTEXT_INT(TessCtrlProgram.patch_vertices), extra_ARB_tessellation_shader" ],
   [ "PATCH_DEFAULT_OUTER_LEVEL", "CONTEXT_FLOAT4(TessCtrlProgram.patch_default_outer_level), extra_ARB_tessellation_shader" ],

From ebdb5345480957c4fc3068fab17926be28d7dcd4 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Mon, 15 Feb 2016 20:34:52 -0500
Subject: [PATCH 201/238] mesa: add GL_OES_copy_image support

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt                            |  2 +-
 src/mapi/glapi/gen/es_EXT.xml           | 22 +++++++
 src/mesa/main/copyimage.c               | 27 +++++++-
 src/mesa/main/extensions_table.h        |  1 +
 src/mesa/main/mtypes.h                  |  1 +
 src/mesa/main/tests/dispatch_sanity.cpp |  3 +
 src/mesa/main/textureview.c             | 86 +++++++++++++++++++++++++
 src/mesa/state_tracker/st_extensions.c  | 11 ++++
 8 files changed, 151 insertions(+), 2 deletions(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index 489b8219351..c76b1e28586 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -256,7 +256,7 @@ GLES3.2, GLSL ES 3.2
   GL_KHR_debug                                          DONE (all drivers)
   GL_KHR_robustness                                     not started (90% done with the ARB variant)
   GL_KHR_texture_compression_astc_ldr                   DONE (i965/gen9+)
-  GL_OES_copy_image                                     not started (based on GL_ARB_copy_image, which is done for some drivers)
+  GL_OES_copy_image                                     DONE (core only)
   GL_OES_draw_buffers_indexed                           not started
   GL_OES_draw_elements_base_vertex                      DONE (all drivers)
   GL_OES_geometry_shader                                started (Marta)
diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index 8e51c058321..149acd5b10c 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -1013,6 +1013,28 @@
 
 </category>
 
+<category name="GL_OES_copy_image" number="208">
+
+    <function name="CopyImageSubDataOES" alias="CopyImageSubData" es2="3.0">
+        <param name="srcName" type="GLuint"/>
+        <param name="srcTarget" type="GLenum"/>
+        <param name="srcLevel" type="GLint"/>
+        <param name="srcX" type="GLint"/>
+        <param name="srcY" type="GLint"/>
+        <param name="srcZ" type="GLint"/>
+        <param name="dstName" type="GLuint"/>
+        <param name="dstTarget" type="GLenum"/>
+        <param name="dstLevel" type="GLint"/>
+        <param name="dstX" type="GLint"/>
+        <param name="dstY" type="GLint"/>
+        <param name="dstZ" type="GLint"/>
+        <param name="srcWidth" type="GLsizei"/>
+        <param name="srcHeight" type="GLsizei"/>
+        <param name="srcDepth" type="GLsizei"/>
+    </function>
+
+</category>
+
 <!-- 175. GL_OES_geometry_shader -->
 <category name="GL_OES_geometry_shader" number="210">
     <enum name="GEOMETRY_SHADER_OES"                             value="0x8DD9"/>
diff --git a/src/mesa/main/copyimage.c b/src/mesa/main/copyimage.c
index d571d221bce..a0f1c691220 100644
--- a/src/mesa/main/copyimage.c
+++ b/src/mesa/main/copyimage.c
@@ -25,6 +25,7 @@
  *    Jason Ekstrand <jason.ekstrand@intel.com>
  */
 
+#include "context.h"
 #include "glheader.h"
 #include "errors.h"
 #include "enums.h"
@@ -360,8 +361,32 @@ compressed_format_compatible(const struct gl_context *ctx,
       case GL_COMPRESSED_SIGNED_RED_RGTC1:
          compressedClass = BLOCK_CLASS_64_BITS;
          break;
+      case GL_COMPRESSED_RGBA8_ETC2_EAC:
+      case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC:
+      case GL_COMPRESSED_RG11_EAC:
+      case GL_COMPRESSED_SIGNED_RG11_EAC:
+         if (_mesa_is_gles(ctx))
+            compressedClass = BLOCK_CLASS_128_BITS;
+         else
+            return false;
+         break;
+      case GL_COMPRESSED_RGB8_ETC2:
+      case GL_COMPRESSED_SRGB8_ETC2:
+      case GL_COMPRESSED_R11_EAC:
+      case GL_COMPRESSED_SIGNED_R11_EAC:
+      case GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+      case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+         if (_mesa_is_gles(ctx))
+            compressedClass = BLOCK_CLASS_64_BITS;
+         else
+            return false;
+         break;
       default:
-         return false;
+         if (_mesa_is_gles(ctx) && _mesa_is_astc_format(compressedFormat))
+            compressedClass = BLOCK_CLASS_128_BITS;
+         else
+            return false;
+         break;
    }
 
    switch (otherFormat) {
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 84401fd501b..717efabfec9 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -309,6 +309,7 @@ EXT(OES_blend_subtract                      , dummy_true
 EXT(OES_byte_coordinates                    , dummy_true                             ,  x ,  x , ES1,  x , 2002)
 EXT(OES_compressed_ETC1_RGB8_texture        , OES_compressed_ETC1_RGB8_texture       ,  x ,  x , ES1, ES2, 2005)
 EXT(OES_compressed_paletted_texture         , dummy_true                             ,  x ,  x , ES1,  x , 2003)
+EXT(OES_copy_image                          , OES_copy_image                         ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_depth24                             , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
 EXT(OES_depth32                             , dummy_false                            ,  x ,  x ,  x ,  x , 2005)
 EXT(OES_depth_texture                       , ARB_depth_texture                      ,  x ,  x ,  x , ES2, 2006)
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index be8c21e08db..f2cb4cb107b 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -3911,6 +3911,7 @@ struct gl_extensions
    GLboolean EXT_transform_feedback;
    GLboolean EXT_timer_query;
    GLboolean EXT_vertex_array_bgra;
+   GLboolean OES_copy_image;
    GLboolean OES_sample_variables;
    GLboolean OES_standard_derivatives;
    GLboolean OES_texture_buffer;
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index 06e7ec1a195..c3aa7106b33 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -2457,6 +2457,9 @@ const struct function gles3_functions_possible[] = {
    /* GL_OES_sample_shading */
    { "glMinSampleShadingOES", 30, -1 },
 
+   /* GL_OES_copy_image */
+   { "glCopyImageSubDataOES", 30, -1 },
+
    { NULL, 0, -1 }
 };
 
diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c
index 419fbebf2f0..4b3b3245c2e 100644
--- a/src/mesa/main/textureview.c
+++ b/src/mesa/main/textureview.c
@@ -82,6 +82,39 @@
     |                       | COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT              |
     ---------------------------------------------------------------------------
  */
+
+#define VIEW_CLASS_GLES(x)             (GL_VIEW_CLASS_BPTC_FLOAT + 1 + x)
+#define VIEW_CLASS_EAC_R11             VIEW_CLASS_GLES(0)
+#define VIEW_CLASS_EAC_RG11            VIEW_CLASS_GLES(1)
+#define VIEW_CLASS_ETC2_RGB            VIEW_CLASS_GLES(2)
+#define VIEW_CLASS_ETC2_RGBA           VIEW_CLASS_GLES(3)
+#define VIEW_CLASS_ETC2_EAC_RGBA       VIEW_CLASS_GLES(4)
+#define VIEW_CLASS_ASTC_4x4_RGBA       VIEW_CLASS_GLES(5)
+#define VIEW_CLASS_ASTC_5x4_RGBA       VIEW_CLASS_GLES(6)
+#define VIEW_CLASS_ASTC_5x5_RGBA       VIEW_CLASS_GLES(7)
+#define VIEW_CLASS_ASTC_6x5_RGBA       VIEW_CLASS_GLES(8)
+#define VIEW_CLASS_ASTC_6x6_RGBA       VIEW_CLASS_GLES(9)
+#define VIEW_CLASS_ASTC_8x5_RGBA       VIEW_CLASS_GLES(10)
+#define VIEW_CLASS_ASTC_8x6_RGBA       VIEW_CLASS_GLES(11)
+#define VIEW_CLASS_ASTC_8x8_RGBA       VIEW_CLASS_GLES(12)
+#define VIEW_CLASS_ASTC_10x5_RGBA      VIEW_CLASS_GLES(13)
+#define VIEW_CLASS_ASTC_10x6_RGBA      VIEW_CLASS_GLES(14)
+#define VIEW_CLASS_ASTC_10x8_RGBA      VIEW_CLASS_GLES(15)
+#define VIEW_CLASS_ASTC_10x10_RGBA     VIEW_CLASS_GLES(16)
+#define VIEW_CLASS_ASTC_12x10_RGBA     VIEW_CLASS_GLES(17)
+#define VIEW_CLASS_ASTC_12x12_RGBA     VIEW_CLASS_GLES(18)
+#define VIEW_CLASS_ASTC_3x3x3_RGBA     VIEW_CLASS_GLES(19)
+#define VIEW_CLASS_ASTC_4x3x3_RGBA     VIEW_CLASS_GLES(20)
+#define VIEW_CLASS_ASTC_4x4x3_RGBA     VIEW_CLASS_GLES(21)
+#define VIEW_CLASS_ASTC_4x4x4_RGBA     VIEW_CLASS_GLES(22)
+#define VIEW_CLASS_ASTC_5x4x4_RGBA     VIEW_CLASS_GLES(23)
+#define VIEW_CLASS_ASTC_5x5x4_RGBA     VIEW_CLASS_GLES(24)
+#define VIEW_CLASS_ASTC_5x5x5_RGBA     VIEW_CLASS_GLES(25)
+#define VIEW_CLASS_ASTC_6x5x5_RGBA     VIEW_CLASS_GLES(26)
+#define VIEW_CLASS_ASTC_6x6x5_RGBA     VIEW_CLASS_GLES(27)
+#define VIEW_CLASS_ASTC_6x6x6_RGBA     VIEW_CLASS_GLES(28)
+
+
 struct internal_format_class_info {
    GLenum view_class;
    GLenum internal_format;
@@ -162,6 +195,41 @@ static const struct internal_format_class_info s3tc_compatible_internal_formats[
    {GL_VIEW_CLASS_S3TC_DXT5_RGBA, GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT},
 };
 
+static const struct internal_format_class_info gles_etc2_compatible_internal_formats[] = {
+   {VIEW_CLASS_EAC_R11, GL_COMPRESSED_R11_EAC},
+   {VIEW_CLASS_EAC_R11, GL_COMPRESSED_SIGNED_R11_EAC},
+   {VIEW_CLASS_EAC_RG11, GL_COMPRESSED_RG11_EAC},
+   {VIEW_CLASS_EAC_RG11, GL_COMPRESSED_SIGNED_RG11_EAC},
+   {VIEW_CLASS_ETC2_RGB, GL_COMPRESSED_RGB8_ETC2},
+   {VIEW_CLASS_ETC2_RGB, GL_COMPRESSED_SRGB8_ETC2},
+   {VIEW_CLASS_ETC2_RGBA, GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2},
+   {VIEW_CLASS_ETC2_RGBA, GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2},
+   {VIEW_CLASS_ETC2_EAC_RGBA, GL_COMPRESSED_RGBA8_ETC2_EAC},
+   {VIEW_CLASS_ETC2_EAC_RGBA, GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC},
+};
+
+static const struct internal_format_class_info gles_astc_compatible_internal_formats[] = {
+#define ASTC_FMT(size) \
+   {VIEW_CLASS_ASTC_##size## _RGBA, GL_COMPRESSED_RGBA_ASTC_##size##_KHR}, \
+   {VIEW_CLASS_ASTC_##size##_RGBA, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_##size##_KHR}
+
+   ASTC_FMT(4x4),
+   ASTC_FMT(5x4),
+   ASTC_FMT(5x5),
+   ASTC_FMT(6x5),
+   ASTC_FMT(6x6),
+   ASTC_FMT(8x5),
+   ASTC_FMT(8x6),
+   ASTC_FMT(8x8),
+   ASTC_FMT(10x5),
+   ASTC_FMT(10x6),
+   ASTC_FMT(10x8),
+   ASTC_FMT(10x10),
+   ASTC_FMT(12x10),
+   ASTC_FMT(12x12),
+#undef ASTC_FMT
+};
+
 GLenum
 _mesa_texture_view_lookup_view_class(const struct gl_context *ctx, GLenum internalformat)
 {
@@ -180,6 +248,24 @@ _mesa_texture_view_lookup_view_class(const struct gl_context *ctx, GLenum intern
             return s3tc_compatible_internal_formats[i].view_class;
       }
    }
+
+   if (_mesa_is_gles3(ctx)) {
+      for (i = 0; i < ARRAY_SIZE(gles_etc2_compatible_internal_formats); i++) {
+         if (gles_etc2_compatible_internal_formats[i].internal_format
+             == internalformat)
+            return gles_etc2_compatible_internal_formats[i].view_class;
+      }
+
+      if (ctx->Extensions.KHR_texture_compression_astc_ldr) {
+         for (i = 0; i < ARRAY_SIZE(gles_astc_compatible_internal_formats); i++) {
+            if (gles_astc_compatible_internal_formats[i].internal_format
+                == internalformat)
+               return gles_astc_compatible_internal_formats[i].view_class;
+         }
+      }
+
+      /* FINISHME: Add 3D OES formats when supported */
+   }
    return GL_FALSE;
 }
 
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 0a25770aa02..8748ab5c876 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -927,6 +927,17 @@ void st_init_extensions(struct pipe_screen *screen,
    extensions->OES_sample_variables = extensions->ARB_sample_shading &&
       extensions->ARB_gpu_shader5;
 
+   /* If we don't have native ETC2 support, we don't keep track of the
+    * original ETC2 data. That data is necessary to be able to copy images
+    * between compatible view classes.
+    */
+   if (extensions->ARB_copy_image && screen->is_format_supported(
+             screen, PIPE_FORMAT_ETC2_RGB8,
+             PIPE_TEXTURE_2D, 0,
+             PIPE_BIND_SAMPLER_VIEW)) {
+      extensions->OES_copy_image = GL_TRUE;
+   }
+
    /* Maximum sample count. */
    {
       enum pipe_format color_formats[] = {

From a94d8d51d7e426485a72e6cfd6185dee9df5e070 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Wed, 17 Feb 2016 13:27:14 -0500
Subject: [PATCH 202/238] mesa: add GL_EXT_copy_image support

The extension is identical to GL_OES_copy_image. But dEQP has tests that
want the EXT variant.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/mapi/glapi/gen/es_EXT.xml    | 22 ++++++++++++++++++++++
 src/mesa/main/extensions_table.h |  1 +
 2 files changed, 23 insertions(+)

diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index 149acd5b10c..471f6b6e565 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -915,6 +915,28 @@
 
 </category>
 
+<category name="GL_EXT_copy_image" number="208">
+
+    <function name="CopyImageSubDataEXT" alias="CopyImageSubData" es2="3.0">
+        <param name="srcName" type="GLuint"/>
+        <param name="srcTarget" type="GLenum"/>
+        <param name="srcLevel" type="GLint"/>
+        <param name="srcX" type="GLint"/>
+        <param name="srcY" type="GLint"/>
+        <param name="srcZ" type="GLint"/>
+        <param name="dstName" type="GLuint"/>
+        <param name="dstTarget" type="GLenum"/>
+        <param name="dstLevel" type="GLint"/>
+        <param name="dstX" type="GLint"/>
+        <param name="dstY" type="GLint"/>
+        <param name="dstZ" type="GLint"/>
+        <param name="srcWidth" type="GLsizei"/>
+        <param name="srcHeight" type="GLsizei"/>
+        <param name="srcDepth" type="GLsizei"/>
+    </function>
+
+</category>
+
 <category name="GL_OES_texture_buffer" number="216">
 
     <function name="TexBufferOES" es2="3.1" alias="TexBuffer">
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 717efabfec9..1b003609eee 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -186,6 +186,7 @@ EXT(EXT_blend_subtract                      , dummy_true
 EXT(EXT_buffer_storage                      , ARB_buffer_storage                     ,  x ,  x ,  x ,  31, 2015)
 EXT(EXT_color_buffer_float                  , dummy_true                             ,  x ,  x , ES1,  30, 2013)
 EXT(EXT_compiled_vertex_array               , dummy_true                             , GLL,  x ,  x ,  x , 1996)
+EXT(EXT_copy_image                          , OES_copy_image                         ,  x ,  x ,  x ,  30, 2014)
 EXT(EXT_copy_texture                        , dummy_true                             , GLL,  x ,  x ,  x , 1995)
 EXT(EXT_depth_bounds_test                   , EXT_depth_bounds_test                  , GLL, GLC,  x ,  x , 2002)
 EXT(EXT_discard_framebuffer                 , dummy_true                             ,  x ,  x , ES1, ES2, 2009)

From f96a403bc3e1ef45f92621e9ace48cf757db4059 Mon Sep 17 00:00:00 2001
From: Pierre Moreau <pierre.morrow@free.fr>
Date: Sat, 19 Mar 2016 14:04:54 +0100
Subject: [PATCH 203/238] nv50/ir: Check for valid insn instead of def size

This fixes a null pointer dereference during the register allocation pass,
if a function had arguments.

Function arguments get a definition from the function itself, a definition
which is therefore not linked to any instruction. If a value ends up having
a definition but no linked instruction, the register allocation pass doesn't
need to consider whether that value is generated by an instruction that
can only handle "short" registers (on nv50).

Signed-off-by: Pierre Moreau <pierre.morrow@free.fr>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index d877c253a17..500ab8915de 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -853,7 +853,7 @@ isShortRegOp(Instruction *insn)
 static bool
 isShortRegVal(LValue *lval)
 {
-   if (lval->defs.size() == 0)
+   if (lval->getInsn() == NULL)
       return false;
    for (Value::DefCIterator def = lval->defs.begin();
         def != lval->defs.end(); ++def)
@@ -1467,7 +1467,7 @@ GCRA::allocateRegisters(ArrayList& insns)
          nodes[i].init(regs, lval);
          RIG.insert(&nodes[i]);
 
-         if (lval->inFile(FILE_GPR) && lval->defs.size() > 0 &&
+         if (lval->inFile(FILE_GPR) && lval->getInsn() != NULL &&
              prog->getTarget()->getChipset() < 0xc0) {
             Instruction *insn = lval->getInsn();
             if (insn->op == OP_MAD || insn->op == OP_SAD)

From 9d7cd439880d9334d21ed099efa15ccf8b709748 Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 30 Mar 2016 09:55:56 -0600
Subject: [PATCH 204/238] tgsi: skip texture query opcodes when examining
 texture targets
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Should fix the assertion in piglit
spec@arb_gpu_shader5@texturegather@fs-r-none-shadow-2d when the
TXQ instruction specifies a 2D target but the sampler view was
declared as SHADOW2D.

Reviewed-by: Michel Dänzer <michel.daenzer@amd.com>
Tested-by: Michel Dänzer <michel.daenzer@amd.com>
---
 src/gallium/auxiliary/tgsi/tgsi_scan.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index 76a6fef8b44..d90fb1d68df 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -54,6 +54,20 @@ is_memory_file(unsigned file)
 }
 
 
+/**
+ * Is the opcode a "true" texture instruction which samples from a
+ * texture map?
+ */
+static bool
+is_texture_inst(unsigned opcode)
+{
+   return (opcode != TGSI_OPCODE_TXQ &&
+           opcode != TGSI_OPCODE_TXQS &&
+           opcode != TGSI_OPCODE_TXQ_LZ &&
+           opcode != TGSI_OPCODE_LODQ &&
+           tgsi_get_opcode_info(opcode)->is_tex);
+}
+
 static void
 scan_instruction(struct tgsi_shader_info *info,
                  const struct tgsi_full_instruction *fullinst,
@@ -189,7 +203,7 @@ scan_instruction(struct tgsi_shader_info *info,
          assert(index < Elements(info->is_msaa_sampler));
          assert(index < PIPE_MAX_SAMPLERS);
 
-         if (tgsi_get_opcode_info(fullinst->Instruction.Opcode)->is_tex) {
+         if (is_texture_inst(fullinst->Instruction.Opcode)) {
             const unsigned target = fullinst->Texture.Texture;
             assert(target < TGSI_TEXTURE_UNKNOWN);
             /* for texture instructions, check that the texture instruction

From 9076e049340db0c55f57abb4ee9bbaffba61d45d Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Wed, 30 Mar 2016 16:54:08 -0600
Subject: [PATCH 205/238] tgsi: split tgsi_util_get_texture_coord_dim()
 function into two

It was kind of overloaded, returning two different things.  Now get
the index of the shadow reference src register with a new
tgsi_util_get_shadow_ref_src_index() function.

To verify the new code, I added some temp/debug code which looped
over all TGSI_TEXTURE_x values, calling the old function and new and
checking that the returned indexes matched.

Also tested piglit "shadow" tests with softpipe/llvmpipe.
No testing of ilo and radeonsi changes.

Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/tgsi/tgsi_exec.c        |  5 +-
 src/gallium/auxiliary/tgsi/tgsi_util.c        | 69 +++++++++----------
 src/gallium/auxiliary/tgsi/tgsi_util.h        |  5 +-
 .../drivers/ilo/shader/ilo_shader_fs.c        |  4 +-
 .../drivers/ilo/shader/ilo_shader_vs.c        |  3 +-
 src/gallium/drivers/radeonsi/si_shader.c      |  7 +-
 6 files changed, 46 insertions(+), 47 deletions(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 53d5937b2df..e7f080eb123 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -2054,7 +2054,8 @@ exec_tex(struct tgsi_exec_machine *mach,
    assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
    assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
 
-   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture, &shadow_ref);
+   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
+   shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
 
    assert(dim <= 4);
    if (shadow_ref >= 0)
@@ -2153,7 +2154,7 @@ exec_lodq(struct tgsi_exec_machine *mach,
    union tgsi_exec_channel r[2];
 
    unit = fetch_sampler_unit(mach, inst, 1);
-   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture, NULL);
+   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
    assert(dim <= Elements(coords));
    /* fetch coordinates */
    for (i = 0; i < dim; i++) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index 5fff3f0787f..fbe29626a7f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -375,10 +375,8 @@ tgsi_util_get_src_from_ind(const struct tgsi_ind_register *reg)
  * sample index.
  */
 int
-tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample)
+tgsi_util_get_texture_coord_dim(unsigned tgsi_tex)
 {
-   int dim;
-
    /*
     * Depending on the texture target, (src0.xyzw, src1.x) is interpreted
     * differently:
@@ -407,8 +405,7 @@ tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample)
    case TGSI_TEXTURE_BUFFER:
    case TGSI_TEXTURE_1D:
    case TGSI_TEXTURE_SHADOW1D:
-      dim = 1;
-      break;
+      return 1;
    case TGSI_TEXTURE_2D:
    case TGSI_TEXTURE_RECT:
    case TGSI_TEXTURE_1D_ARRAY:
@@ -416,52 +413,48 @@ tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample)
    case TGSI_TEXTURE_SHADOWRECT:
    case TGSI_TEXTURE_SHADOW1D_ARRAY:
    case TGSI_TEXTURE_2D_MSAA:
-      dim = 2;
-      break;
+      return 2;
    case TGSI_TEXTURE_3D:
    case TGSI_TEXTURE_CUBE:
    case TGSI_TEXTURE_2D_ARRAY:
    case TGSI_TEXTURE_SHADOWCUBE:
    case TGSI_TEXTURE_SHADOW2D_ARRAY:
    case TGSI_TEXTURE_2D_ARRAY_MSAA:
-      dim = 3;
-      break;
+      return 3;
    case TGSI_TEXTURE_CUBE_ARRAY:
    case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
-      dim = 4;
-      break;
+      return 4;
    default:
       assert(!"unknown texture target");
-      dim = 0;
-      break;
+      return 0;
    }
+}
 
-   if (shadow_or_sample) {
-      switch (tgsi_tex) {
-      case TGSI_TEXTURE_SHADOW1D:
-         /* there is a gap */
-         *shadow_or_sample = 2;
-         break;
-      case TGSI_TEXTURE_SHADOW2D:
-      case TGSI_TEXTURE_SHADOWRECT:
-      case TGSI_TEXTURE_SHADOWCUBE:
-      case TGSI_TEXTURE_SHADOW1D_ARRAY:
-      case TGSI_TEXTURE_SHADOW2D_ARRAY:
-      case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
-         *shadow_or_sample = dim;
-         break;
-      case TGSI_TEXTURE_2D_MSAA:
-      case TGSI_TEXTURE_2D_ARRAY_MSAA:
-         *shadow_or_sample = 3;
-         break;
-      default:
-         /* no shadow nor sample */
-         *shadow_or_sample = -1;
-         break;
-      }
+
+/**
+ * Given a TGSI_TEXTURE_x target, return the src register index for the
+ * shadow reference coordinate or MSAA sample index, or -1 if there is none.
+ */
+int
+tgsi_util_get_shadow_ref_src_index(unsigned tgsi_tex)
+{
+   switch (tgsi_tex) {
+   case TGSI_TEXTURE_SHADOW1D:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+   case TGSI_TEXTURE_SHADOW1D_ARRAY:
+      return 2;
+   case TGSI_TEXTURE_SHADOWCUBE:
+   case TGSI_TEXTURE_SHADOW2D_ARRAY:
+   case TGSI_TEXTURE_2D_MSAA:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      return 3;
+   case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+      return 4;
+   default:
+      /* no shadow nor sample */
+      return -1;
    }
-
-   return dim;
 }
 
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h b/src/gallium/auxiliary/tgsi/tgsi_util.h
index 6175d95fcd6..3a049ee5667 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.h
@@ -80,7 +80,10 @@ struct tgsi_src_register
 tgsi_util_get_src_from_ind(const struct tgsi_ind_register *reg);
 
 int
-tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample);
+tgsi_util_get_texture_coord_dim(unsigned tgsi_tex);
+
+int
+tgsi_util_get_shadow_ref_src_index(unsigned tgsi_tex);
 
 boolean
 tgsi_is_shadow_target(unsigned target);
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
index f46126e8427..6c8f1b5222e 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
@@ -740,7 +740,9 @@ fs_prepare_tgsi_sampling(struct fs_compile_context *fcc,
       break;
    }
 
-   num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target, &ref_pos);
+   num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target);
+   ref_pos = tgsi_util_get_shadow_ref_src_index(inst->tex.target);
+
    tsrc_transpose(inst->src[0], coords);
    bias_or_lod = tsrc_null();
    ref_or_si = tsrc_null();
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
index 0df0afc706b..2b46d44f5be 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
@@ -407,7 +407,8 @@ vs_prepare_tgsi_sampling(struct vs_compile_context *vcc,
    num_derivs = 0;
    sampler_src = 1;
 
-   num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target, &ref_pos);
+   num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target);
+   ref_pos = tgsi_util_get_shadow_ref_src_index(inst->tex.target);
 
    /* extract the parameters */
    switch (inst->opcode) {
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 9eb531f8d80..4176e9f1ce5 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2874,8 +2874,7 @@ static LLVMValueRef image_fetch_coords(
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
 	unsigned target = inst->Memory.Texture;
-	int sample;
-	unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &sample);
+	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
 	LLVMValueRef coords[4];
 	LLVMValueRef tmp;
 	int chan;
@@ -3387,8 +3386,8 @@ static void tex_fetch_args(
 	unsigned target = inst->Texture.Texture;
 	LLVMValueRef coords[5], derivs[6];
 	LLVMValueRef address[16];
-	int ref_pos;
-	unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &ref_pos);
+	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
+	int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
 	unsigned count = 0;
 	unsigned chan;
 	unsigned num_deriv_channels = 0;

From 05902a668611094ec876929fc2dfe1172043e4b9 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Thu, 31 Mar 2016 10:54:17 +0200
Subject: [PATCH 206/238] tgsi: fix out of bounds access in exec_atomop()

The number of channels must be 4 for all RGBA components.

Fixes: 22d129601 ("tgsi: add support for image operations to tgsi_exec. (v2.1)")
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/tgsi/tgsi_exec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index e7f080eb123..72d8c5a7247 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -3853,7 +3853,7 @@ static void
 exec_atomop(struct tgsi_exec_machine *mach,
             const struct tgsi_full_instruction *inst)
 {
-   union tgsi_exec_channel r[3], sample_r;
+   union tgsi_exec_channel r[4], sample_r;
    union tgsi_exec_channel value[4], value2[4];
    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
    float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];

From d22eca5f90cc86105e2a42e39acc061353c21129 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Thu, 31 Mar 2016 10:54:18 +0200
Subject: [PATCH 207/238] tgsi: silence compiler warning in
 fetch_sampler_unit()

The unit variable can be used uninitialized.

Fixes: 24e77cb09 ("tgsi: handle indirect sampler arrays. (v2)")
Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Brian Paul <brianp@vmware.com>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 src/gallium/auxiliary/tgsi/tgsi_exec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 72d8c5a7247..a595bbbc6d3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1996,7 +1996,7 @@ fetch_sampler_unit(struct tgsi_exec_machine *mach,
                    const struct tgsi_full_instruction *inst,
                    uint sampler)
 {
-   uint unit;
+   uint unit = 0;
    int i;
    if (inst->Src[sampler].Register.Indirect) {
       const struct tgsi_full_src_register *reg = &inst->Src[sampler];

From 8bb9c6ff7f2398e1a497a9b5413736f58205843c Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Mon, 28 Mar 2016 17:27:36 -0700
Subject: [PATCH 208/238] ptn: Silence unused parameter warning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The KIL instruction doesn't have a destination, so ptn_kil never uses
dest.

program/prog_to_nir.c: In function ‘ptn_kil’:
program/prog_to_nir.c:547:38: warning: unused parameter ‘dest’ [-Wunused-parameter]
 ptn_kil(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src)
                                      ^

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/program/prog_to_nir.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index 16b79c94c84..ce25f6d14f6 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -545,7 +545,7 @@ ptn_lrp(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src)
 }
 
 static void
-ptn_kil(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src)
+ptn_kil(nir_builder *b, nir_ssa_def **src)
 {
    nir_ssa_def *cmp = b->shader->options->native_integers ?
       nir_bany_inequal4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0)), nir_imm_int(b, 0)) :
@@ -830,7 +830,7 @@ ptn_emit_instruction(struct ptn_compile *c, struct prog_instruction *prog_inst)
       break;
 
    case OPCODE_KIL:
-      ptn_kil(b, dest, src);
+      ptn_kil(b, src);
       break;
 
    case OPCODE_CMP:

From cdea12bf035117f7cae5db0d52f3050d81c50c37 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Thu, 24 Mar 2016 15:48:55 -0700
Subject: [PATCH 209/238] ptn: Fix all users of ptn_swizzle

None of the callers actually wanted what it did.  In ptn_xpd, you only
ever want a vec3 swizzle.  In ptn_tex, you want a swizzle that matches
the number of required texture coordinates.

shader-db results:

G45:
total instructions in shared programs: 4011240 -> 4010911 (-0.01%)
instructions in affected programs: 59232 -> 58903 (-0.56%)
helped: 114
HURT: 0

total cycles in shared programs: 84314194 -> 84313220 (-0.00%)
cycles in affected programs: 779150 -> 778176 (-0.13%)
helped: 110
HURT: 13

Ironlake:
total instructions in shared programs: 6397262 -> 6396605 (-0.01%)
instructions in affected programs: 117402 -> 116745 (-0.56%)
helped: 227
HURT: 0

total cycles in shared programs: 128889798 -> 128888524 (-0.00%)
cycles in affected programs: 1214644 -> 1213370 (-0.10%)
helped: 179
HURT: 44

Sandy Bridge:
total instructions in shared programs: 8467391 -> 8467384 (-0.00%)
instructions in affected programs: 3107 -> 3100 (-0.23%)
helped: 10
HURT: 6

total cycles in shared programs: 117580120 -> 117573448 (-0.01%)
cycles in affected programs: 103158 -> 96486 (-6.47%)
helped: 84
HURT: 11

Ivy Bridge:
total instructions in shared programs: 7774255 -> 7774258 (0.00%)
instructions in affected programs: 1677 -> 1680 (0.18%)
helped: 8
HURT: 6

total cycles in shared programs: 65743828 -> 65739190 (-0.01%)
cycles in affected programs: 89312 -> 84674 (-5.19%)
helped: 78
HURT: 23

Haswell:
total instructions in shared programs: 7107172 -> 7107150 (-0.00%)
instructions in affected programs: 2048 -> 2026 (-1.07%)
helped: 16
HURT: 0

total cycles in shared programs: 64653636 -> 64647486 (-0.01%)
cycles in affected programs: 86836 -> 80686 (-7.08%)
helped: 85
HURT: 17

Broadwell and Skylake:
total instructions in shared programs: 8447529 -> 8447507 (-0.00%)
instructions in affected programs: 2038 -> 2016 (-1.08%)
helped: 16
HURT: 0

total cycles in shared programs: 66418670 -> 66413416 (-0.01%)
cycles in affected programs: 90110 -> 84856 (-5.83%)
helped: 83
HURT: 20

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/mesa/program/prog_to_nir.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index ce25f6d14f6..a6119ae4e7c 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -59,7 +59,6 @@ struct ptn_compile {
 
 #define SWIZ(X, Y, Z, W) \
    (unsigned[4]){ SWIZZLE_##X, SWIZZLE_##Y, SWIZZLE_##Z, SWIZZLE_##W }
-#define ptn_swizzle(b, src, x, y, z, w) nir_swizzle(b, src, SWIZ(x, y, z, w), 4, true)
 #define ptn_channel(b, src, ch) nir_swizzle(b, src, SWIZ(ch, ch, ch, ch), 1, true)
 
 static nir_ssa_def *
@@ -491,11 +490,11 @@ ptn_xpd(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src)
    ptn_move_dest_masked(b, dest,
                         nir_fsub(b,
                                  nir_fmul(b,
-                                          ptn_swizzle(b, src[0], Y, Z, X, X),
-                                          ptn_swizzle(b, src[1], Z, X, Y, X)),
+                                          nir_swizzle(b, src[0], SWIZ(Y, Z, X, W), 3, true),
+                                          nir_swizzle(b, src[1], SWIZ(Z, X, Y, W), 3, true)),
                                  nir_fmul(b,
-                                          ptn_swizzle(b, src[1], Y, Z, X, X),
-                                          ptn_swizzle(b, src[0], Z, X, Y, X))),
+                                          nir_swizzle(b, src[1], SWIZ(Y, Z, X, W), 3, true),
+                                          nir_swizzle(b, src[0], SWIZ(Z, X, Y, W), 3, true))),
                         WRITEMASK_XYZ);
    ptn_move_dest_masked(b, dest, nir_imm_float(b, 1.0), WRITEMASK_W);
 }
@@ -642,7 +641,8 @@ ptn_tex(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src,
    unsigned src_number = 0;
 
    instr->src[src_number].src =
-      nir_src_for_ssa(ptn_swizzle(b, src[0], X, Y, Z, W));
+      nir_src_for_ssa(nir_swizzle(b, src[0], SWIZ(X, Y, Z, W),
+                                  instr->coord_components, true));
    instr->src[src_number].src_type = nir_tex_src_coord;
    src_number++;
 

From 08ff5f4d1f04ea426be679018c2c38da6b6b9a65 Mon Sep 17 00:00:00 2001
From: Ian Romanick <ian.d.romanick@intel.com>
Date: Tue, 29 Mar 2016 15:25:04 -0700
Subject: [PATCH 210/238] nir: Simplify a bcsel to logical-or

Oddly, this did not affect the shader where I first noticed the pattern.
That particular shader doesn't get its if-statement converted to a bcsel
because there are two assignments in the else-statement.  This led to me
submitting https://bugs.freedesktop.org/show_bug.cgi?id=94747.

shader-db results:

Sandy Bridge
total instructions in shared programs: 8467384 -> 8467069 (-0.00%)
instructions in affected programs: 36594 -> 36279 (-0.86%)
helped: 46
HURT: 0

total cycles in shared programs: 117573448 -> 117568518 (-0.00%)
cycles in affected programs: 339114 -> 334184 (-1.45%)
helped: 46
HURT: 0

Ivy Bridge / Haswell / Broadwell / Skylake:
total instructions in shared programs: 7774258 -> 7773999 (-0.00%)
instructions in affected programs: 30874 -> 30615 (-0.84%)
helped: 46
HURT: 0

total cycles in shared programs: 65739190 -> 65734530 (-0.01%)
cycles in affected programs: 180380 -> 175720 (-2.58%)
helped: 45
HURT: 1

No change on G45 or Ironlake.

I also tried these expressions, but none of them affected any shaders in
shader-db:

   (('bcsel', a, 'a@bool', 'b@bool'), ('ior', a, b)),
   (('bcsel', a, 'b@bool', False),    ('iand', a, b)),
   (('bcsel', a, 'b@bool', 'a@bool'), ('iand', a, b)),

Signed-off-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/compiler/nir/nir_opt_algebraic.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 2fac9adafe6..e72b4a791cb 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -126,6 +126,7 @@ optimizations = [
    (('bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
    (('bcsel', ('inot', 'a@bool'), b, c), ('bcsel', a, c, b)),
    (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
+   (('bcsel', a, True, 'b@bool'), ('ior', a, b)),
    (('fmin', a, a), a),
    (('fmax', a, a), a),
    (('imin', a, a), a),

From 58d4751fa0c5a38069879e9f72047b75f8351d93 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Tue, 29 Mar 2016 09:58:11 -0700
Subject: [PATCH 211/238] i965: Fix textureSize() depth value for 1 layer
 surfaces on Gen4-6.

According to the Sandybridge PRM's description of the resinfo message,
the .z value returned will be Depth == 0 ? 0 : Depth + 1.  The earlier
PRMs have the same table.

This means we return 0 for array textures with a single slice, when
we ought to return 1.  Just override it to max(depth, 1).

Fixes 10 dEQP-GLES3.functional tests on Sandybridge:
shaders.texture_functions.texturesize.sampler2darray_fixed_vertex
shaders.texture_functions.texturesize.sampler2darray_fixed_fragment
shaders.texture_functions.texturesize.sampler2darray_float_vertex
shaders.texture_functions.texturesize.sampler2darray_float_fragment
shaders.texture_functions.texturesize.isampler2darray_vertex
shaders.texture_functions.texturesize.isampler2darray_fragment
shaders.texture_functions.texturesize.usampler2darray_vertex
shaders.texture_functions.texturesize.usampler2darray_fragment
shaders.texture_functions.texturesize.sampler2darrayshadow_vertex
shaders.texture_functions.texturesize.sampler2darrayshadow_fragment

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp   | 10 ++++++++--
 src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp | 14 ++++++++++----
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index dc61d096efc..4fbcf2bd105 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -228,10 +228,16 @@ fs_visitor::emit_texture(ir_texture_opcode op,
    }
 
    /* fixup #layers for cube map arrays */
-   if (op == ir_txs && is_cube_array) {
+   if (op == ir_txs && (devinfo->gen < 7 || is_cube_array)) {
       fs_reg depth = offset(dst, bld, 2);
       fs_reg fixed_depth = vgrf(glsl_type::int_type);
-      bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, brw_imm_d(6));
+
+      if (is_cube_array) {
+         bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, brw_imm_d(6));
+      } else if (devinfo->gen < 7) {
+         /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
+         bld.emit_minmax(fixed_depth, depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
+      }
 
       fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
       int components = inst->regs_written / (inst->exec_size / 8);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index d30330a379f..2ab141fdf21 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1056,10 +1056,16 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
     * spec requires layers.
     */
-   if (op == ir_txs && is_cube_array) {
-      emit_math(SHADER_OPCODE_INT_QUOTIENT,
-                writemask(inst->dst, WRITEMASK_Z),
-                src_reg(inst->dst), brw_imm_d(6));
+   if (op == ir_txs) {
+      if (is_cube_array) {
+         emit_math(SHADER_OPCODE_INT_QUOTIENT,
+                   writemask(inst->dst, WRITEMASK_Z),
+                   src_reg(inst->dst), brw_imm_d(6));
+      } else if (devinfo->gen < 7) {
+         /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
+         emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
+                     src_reg(inst->dst), brw_imm_d(1));
+      }
    }
 
    if (devinfo->gen == 6 && op == ir_tg4) {

From a57320a9ba4f453c567716bf8270c0ac629ad0d2 Mon Sep 17 00:00:00 2001
From: Kenneth Graunke <kenneth@whitecape.org>
Date: Thu, 31 Mar 2016 00:53:21 -0700
Subject: [PATCH 212/238] i965: Use brw->urb.min_vs_entries instead of 32
 for BLORP.

Haswell GT2 and GT3 have a minimum of 64 entries.  Hardcoding 32
is not legal.

v2: Delete stale comment (caught by Alejandro).

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Topi Pohjolainen <topi.pohjolainen@intel.com>
---
 src/mesa/drivers/dri/i965/gen7_blorp.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index 89b73ca7519..eae1e30e150 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -55,11 +55,8 @@ gen7_blorp_emit_urb_config(struct brw_context *brw)
                                  0 /* gs_size */,
                                  urb_size / 2 /* fs_size */);
 
-   /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, Dword
-    * 1.15:0 "VS Number of URB Entries".
-    */
    gen7_emit_urb_state(brw,
-                       32 /* num_vs_entries */,
+                       brw->urb.min_vs_entries /* num_vs_entries */,
                        2 /* vs_size */,
                        2 /* vs_start */,
                        0 /* num_hs_entries */,

From e0e16830873b945a24880ae515466bf7f9165f42 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 31 Mar 2016 00:48:01 -0400
Subject: [PATCH 213/238] mesa: add GL_OES/EXT_draw_buffers_indexed support

This is the same ext as ARB_draw_buffers_blend (plus some core
functionality that already exists). Add the alias entrypoints.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
Reviewed-by: Dave Airlie <airlied@redhat.com>
---
 docs/GL3.txt                            |   2 +-
 docs/relnotes/11.3.0.html               |   1 +
 src/mapi/glapi/gen/es_EXT.xml           | 106 ++++++++++++++++++++++++
 src/mesa/main/extensions_table.h        |   2 +
 src/mesa/main/tests/dispatch_sanity.cpp |  10 +++
 5 files changed, 120 insertions(+), 1 deletion(-)

diff --git a/docs/GL3.txt b/docs/GL3.txt
index c76b1e28586..5b6dc89e250 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -257,7 +257,7 @@ GLES3.2, GLSL ES 3.2
   GL_KHR_robustness                                     not started (90% done with the ARB variant)
   GL_KHR_texture_compression_astc_ldr                   DONE (i965/gen9+)
   GL_OES_copy_image                                     DONE (core only)
-  GL_OES_draw_buffers_indexed                           not started
+  GL_OES_draw_buffers_indexed                           DONE (all drivers that support GL_ARB_draw_buffers_blend)
   GL_OES_draw_elements_base_vertex                      DONE (all drivers)
   GL_OES_geometry_shader                                started (Marta)
   GL_OES_gpu_shader5                                    DONE (all drivers that support GL_ARB_gpu_shader5)
diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
index 4e23959e314..8aa9444c54c 100644
--- a/docs/relnotes/11.3.0.html
+++ b/docs/relnotes/11.3.0.html
@@ -49,6 +49,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_shader_image_load_store on radeonsi, softpipe</li>
 <li>GL_ARB_shader_image_size on radeonsi</li>
 <li>GL_ATI_fragment_shader on all Gallium drivers</li>
+<li>GL_OES_draw_buffers_indexed and GL_EXT_draw_buffers_indexed on all drivers that support GL_ARB_draw_buffers_blend</li>
 <li>GL_OES_texture_border_clamp and GL_EXT_texture_border_clamp on all drivers that support GL_ARB_texture_border_clamp</li>
 <li>GL_OES_shader_image_atomic on all drivers that support GL_ARB_shader_image_load_store</li>
 </ul>
diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index 471f6b6e565..3b2c15ebf5c 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -823,6 +823,59 @@
     </function>
 </category>
 
+<category name="GL_EXT_draw_buffers_indexed" number="176">
+
+  <function name="BlendFunciEXT" alias="BlendFunciARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="sfactor" type="GLenum"/>
+    <param name="dfactor" type="GLenum"/>
+  </function>
+
+  <function name="BlendFuncSeparateiEXT" alias="BlendFuncSeparateiARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="sfactorRGB" type="GLenum"/>
+    <param name="dfactorRGB" type="GLenum"/>
+    <param name="sfactorAlpha" type="GLenum"/>
+    <param name="dfactorAlpha" type="GLenum"/>
+  </function>
+
+  <function name="BlendEquationiEXT" alias="BlendEquationiARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="mode" type="GLenum"/>
+  </function>
+
+  <function name="BlendEquationSeparateiEXT" alias="BlendEquationSeparateiARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="modeRGB" type="GLenum"/>
+    <param name="modeA" type="GLenum"/>
+  </function>
+
+  <function name="ColorMaskiEXT" alias="ColorMaski" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="r" type="GLboolean"/>
+    <param name="g" type="GLboolean"/>
+    <param name="b" type="GLboolean"/>
+    <param name="a" type="GLboolean"/>
+  </function>
+
+  <function name="EnableiEXT" alias="Enablei" es2="3.0">
+    <param name="target" type="GLenum"/>
+    <param name="index" type="GLuint"/>
+  </function>
+
+  <function name="DisableiEXT" alias="Disablei" es2="3.0">
+    <param name="target" type="GLenum"/>
+    <param name="index" type="GLuint"/>
+  </function>
+
+  <function name="IsEnablediEXT" alias="IsEnabledi" es2="3.0">
+    <param name="target" type="GLenum"/>
+    <param name="index" type="GLuint"/>
+    <return type="GLboolean"/>
+  </function>
+
+</category>
+
 <category name="GL_EXT_texture_border_clamp" number="182">
 
     <!-- The *TexParameter* functions are added in EXT_texture_integer -->
@@ -937,6 +990,59 @@
 
 </category>
 
+<category name="GL_OES_draw_buffers_indexed" number="209">
+
+  <function name="BlendFunciOES" alias="BlendFunciARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="sfactor" type="GLenum"/>
+    <param name="dfactor" type="GLenum"/>
+  </function>
+
+  <function name="BlendFuncSeparateiOES" alias="BlendFuncSeparateiARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="sfactorRGB" type="GLenum"/>
+    <param name="dfactorRGB" type="GLenum"/>
+    <param name="sfactorAlpha" type="GLenum"/>
+    <param name="dfactorAlpha" type="GLenum"/>
+  </function>
+
+  <function name="BlendEquationiOES" alias="BlendEquationiARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="mode" type="GLenum"/>
+  </function>
+
+  <function name="BlendEquationSeparateiOES" alias="BlendEquationSeparateiARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="modeRGB" type="GLenum"/>
+    <param name="modeA" type="GLenum"/>
+  </function>
+
+  <function name="ColorMaskiOES" alias="ColorMaski" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="r" type="GLboolean"/>
+    <param name="g" type="GLboolean"/>
+    <param name="b" type="GLboolean"/>
+    <param name="a" type="GLboolean"/>
+  </function>
+
+  <function name="EnableiOES" alias="Enablei" es2="3.0">
+    <param name="target" type="GLenum"/>
+    <param name="index" type="GLuint"/>
+  </function>
+
+  <function name="DisableiOES" alias="Disablei" es2="3.0">
+    <param name="target" type="GLenum"/>
+    <param name="index" type="GLuint"/>
+  </function>
+
+  <function name="IsEnablediOES" alias="IsEnabledi" es2="3.0">
+    <param name="target" type="GLenum"/>
+    <param name="index" type="GLuint"/>
+    <return type="GLboolean"/>
+  </function>
+
+</category>
+
 <category name="GL_OES_texture_buffer" number="216">
 
     <function name="TexBufferOES" es2="3.1" alias="TexBuffer">
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 1b003609eee..7c36b1e1d0b 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -192,6 +192,7 @@ EXT(EXT_depth_bounds_test                   , EXT_depth_bounds_test
 EXT(EXT_discard_framebuffer                 , dummy_true                             ,  x ,  x , ES1, ES2, 2009)
 EXT(EXT_draw_buffers                        , dummy_true                             ,  x ,  x ,  x , ES2, 2012)
 EXT(EXT_draw_buffers2                       , EXT_draw_buffers2                      , GLL, GLC,  x ,  x , 2006)
+EXT(EXT_draw_buffers_indexed                , ARB_draw_buffers_blend                 ,  x ,  x ,  x ,  30, 2014)
 EXT(EXT_draw_elements_base_vertex           , ARB_draw_elements_base_vertex          ,  x ,  x ,  x , ES2, 2014)
 EXT(EXT_draw_instanced                      , ARB_draw_instanced                     , GLL, GLC,  x ,  x , 2006)
 EXT(EXT_draw_range_elements                 , dummy_true                             , GLL,  x ,  x ,  x , 1997)
@@ -315,6 +316,7 @@ EXT(OES_depth24                             , dummy_true
 EXT(OES_depth32                             , dummy_false                            ,  x ,  x ,  x ,  x , 2005)
 EXT(OES_depth_texture                       , ARB_depth_texture                      ,  x ,  x ,  x , ES2, 2006)
 EXT(OES_depth_texture_cube_map              , OES_depth_texture_cube_map             ,  x ,  x ,  x , ES2, 2012)
+EXT(OES_draw_buffers_indexed                , ARB_draw_buffers_blend                 ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_draw_elements_base_vertex           , ARB_draw_elements_base_vertex          ,  x ,  x ,  x , ES2, 2014)
 EXT(OES_draw_texture                        , OES_draw_texture                       ,  x ,  x , ES1,  x , 2004)
 EXT(OES_element_index_uint                  , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index c3aa7106b33..9f278be47ca 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -2460,6 +2460,16 @@ const struct function gles3_functions_possible[] = {
    /* GL_OES_copy_image */
    { "glCopyImageSubDataOES", 30, -1 },
 
+   /* GL_OES_draw_buffers_indexed */
+   { "glBlendFunciOES", 30, -1 },
+   { "glBlendFuncSeparateiOES", 30, -1 },
+   { "glBlendEquationiOES", 30, -1 },
+   { "glBlendEquationSeparateiOES", 30, -1 },
+   { "glColorMaskiOES", 30, -1 },
+   { "glEnableiOES", 30, -1 },
+   { "glDisableiOES", 30, -1 },
+   { "glIsEnablediOES", 30, -1 },
+
    { NULL, 0, -1 }
 };
 

From df03be196abc34bdf29ff1119099e1477a991ad3 Mon Sep 17 00:00:00 2001
From: Ilia Mirkin <imirkin@alum.mit.edu>
Date: Thu, 31 Mar 2016 21:52:13 -0400
Subject: [PATCH 214/238] nv50,nvc0: add PIPE_BIND_LINEAR support to
 is_format_supported

vdpau has recently come to rely on this, so make sure to check it
properly.

Signed-off-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nv50/nv50_screen.c | 9 +++++++++
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 9 +++++++++
 2 files changed, 18 insertions(+)

diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 5836bb23764..57e28992727 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -67,9 +67,18 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
       break;
    }
 
+   if (bindings & PIPE_BIND_LINEAR)
+      if (util_format_is_depth_or_stencil(format) ||
+          (target != PIPE_TEXTURE_1D &&
+           target != PIPE_TEXTURE_2D &&
+           target != PIPE_TEXTURE_RECT) ||
+          sample_count > 1)
+         return false;
+
    /* transfers & shared are always supported */
    bindings &= ~(PIPE_BIND_TRANSFER_READ |
                  PIPE_BIND_TRANSFER_WRITE |
+                 PIPE_BIND_LINEAR |
                  PIPE_BIND_SHARED);
 
    return (( nv50_format_table[format].usage |
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 553c001cd2b..8d7d4ef6fb8 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -57,9 +57,18 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen,
       if (util_format_get_blocksizebits(format) == 3 * 32)
          return false;
 
+   if (bindings & PIPE_BIND_LINEAR)
+      if (util_format_is_depth_or_stencil(format) ||
+          (target != PIPE_TEXTURE_1D &&
+           target != PIPE_TEXTURE_2D &&
+           target != PIPE_TEXTURE_RECT) ||
+          sample_count > 1)
+         return false;
+
    /* transfers & shared are always supported */
    bindings &= ~(PIPE_BIND_TRANSFER_READ |
                  PIPE_BIND_TRANSFER_WRITE |
+                 PIPE_BIND_LINEAR |
                  PIPE_BIND_SHARED);
 
    return (( nvc0_format_table[format].usage |

From cd7d631c71bb1342a6607a193918ccb3289c0bbf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Pi=C3=B1eiro?= <apinheiro@igalia.com>
Date: Fri, 1 Apr 2016 09:11:15 +0200
Subject: [PATCH 215/238] glsl: do not raise uninitialized variable warnings on
 builtins/reserved GL variables

Needed because not all the built-in variables are marked as system
values, so they still have the mode ir_var_auto. Right now this avoids
a spurious warning when gl_GlobalInvocationID and
gl_LocalInvocationIndex are used.

v2: use is_gl_identifier instead of filtering for some names (Ilia
    Mirkin)

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
---
 src/compiler/glsl/ast_to_hir.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index a0312319161..3fe90079420 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -1905,7 +1905,8 @@ ast_expression::do_hir(exec_list *instructions,
 
          if ((var->data.mode == ir_var_auto || var->data.mode == ir_var_shader_out)
              && !this->is_lhs
-             && result->variable_referenced()->data.assigned != true) {
+             && result->variable_referenced()->data.assigned != true
+             && !is_gl_identifier(var->name)) {
             _mesa_glsl_warning(&loc, state, "`%s' used uninitialized",
                                this->primary_expression.identifier);
          }

From cdf7c6b83dad7eb6a7600af61403315b02dcf13f Mon Sep 17 00:00:00 2001
From: Jose Fonseca <jfonseca@vmware.com>
Date: Thu, 31 Mar 2016 23:37:34 +0100
Subject: [PATCH 216/238] gallivm: Use vector selects on LLVM 3.3+.

This is an old patch I had around.

Vector selects seem to work well from LLVM 3.3.  Using them should
improve code quality, as it might make constant propagation pass more
effective.

Tested lp_test_*

Reviewed-by: Roland Scheidegger <sroland@vmware.com>
---
 src/gallium/auxiliary/gallivm/lp_bld_logic.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 19d30d0d63c..5b0b6c6b234 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -314,11 +314,13 @@ lp_build_select(struct lp_build_context *bld,
       mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), "");
       res = LLVMBuildSelect(builder, mask, a, b, "");
    }
-   else if (0) {
+   else if (HAVE_LLVM >= 0x0303) {
       /* Generate a vector select.
        *
-       * XXX: Using vector selects would avoid emitting intrinsics, but they aren't
-       * properly supported yet.
+       * Using vector selects would avoid emitting intrinsics, but they weren't
+       * properly supported for a long time.
+       *
+       * LLVM 3.3 appears to reliably support it.
        *
        * LLVM 3.1 supports it, but it yields buggy code (e.g. lp_blend_test).
        *

From 2d9e0f24e1a13648a9bceb03dbfb438e03c81fd7 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 2 Feb 2016 14:45:07 -0600
Subject: [PATCH 217/238] Android: fix x86 gallium builds

Builds with gallium enabled fail on x86 with a linker error:

external/mesa3d/src/mesa/vbo/vbo_exec_array.c:127: error: undefined reference to '_mesa_uint_array_min_max'

The problem is sse_minmax.c is not included in the libmesa_st_mesa
library. Since the SSE4.1 files are needed for both libmesa_st_mesa
and libmesa_dricore, move SSE4.1 files into a separate static library
that can be used by both.

Cc: "11.1 11.2" <mesa-stable@lists.freedesktop.org>
Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
---
 src/mesa/Android.libmesa_dricore.mk |  7 ++---
 src/mesa/Android.libmesa_sse41.mk   | 44 +++++++++++++++++++++++++++++
 src/mesa/Android.libmesa_st_mesa.mk |  4 ++-
 src/mesa/Android.mk                 |  1 +
 src/mesa/Makefile.sources           |  4 +++
 5 files changed, 55 insertions(+), 5 deletions(-)
 create mode 100644 src/mesa/Android.libmesa_sse41.mk

diff --git a/src/mesa/Android.libmesa_dricore.mk b/src/mesa/Android.libmesa_dricore.mk
index a3e6c6d55ae..d7647a76bd0 100644
--- a/src/mesa/Android.libmesa_dricore.mk
+++ b/src/mesa/Android.libmesa_dricore.mk
@@ -48,9 +48,8 @@ endif # x86
 endif # MESA_ENABLE_ASM
 
 ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
-LOCAL_SRC_FILES += \
-	main/streaming-load-memcpy.c \
-	main/sse_minmax.c
+LOCAL_WHOLE_STATIC_LIBRARIES := \
+	libmesa_sse41
 LOCAL_CFLAGS := \
 	-msse4.1 \
        -DUSE_SSE41
@@ -63,7 +62,7 @@ LOCAL_C_INCLUDES := \
 	$(MESA_TOP)/src/gallium/include \
 	$(MESA_TOP)/src/gallium/auxiliary
 
-LOCAL_WHOLE_STATIC_LIBRARIES := \
+LOCAL_WHOLE_STATIC_LIBRARIES += \
 	libmesa_program
 
 include $(LOCAL_PATH)/Android.gen.mk
diff --git a/src/mesa/Android.libmesa_sse41.mk b/src/mesa/Android.libmesa_sse41.mk
new file mode 100644
index 00000000000..8562da60193
--- /dev/null
+++ b/src/mesa/Android.libmesa_sse41.mk
@@ -0,0 +1,44 @@
+# Copyright 2012 Intel Corporation
+# Copyright (C) 2010-2011 Chia-I Wu <olvaffe@gmail.com>
+# Copyright (C) 2010-2011 LunarG Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
+
+LOCAL_PATH := $(call my-dir)
+
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_sse41
+
+LOCAL_SRC_FILES += \
+	$(X86_SSE41_FILES)
+
+LOCAL_C_INCLUDES := \
+	$(MESA_TOP)/src/mapi \
+	$(MESA_TOP)/src/gallium/include \
+	$(MESA_TOP)/src/gallium/auxiliary
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
+endif
diff --git a/src/mesa/Android.libmesa_st_mesa.mk b/src/mesa/Android.libmesa_st_mesa.mk
index 9fd9460a5ba..bbd39562785 100644
--- a/src/mesa/Android.libmesa_st_mesa.mk
+++ b/src/mesa/Android.libmesa_st_mesa.mk
@@ -47,6 +47,8 @@ endif # x86
 endif # MESA_ENABLE_ASM
 
 ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
+LOCAL_WHOLE_STATIC_LIBRARIES := \
+	libmesa_sse41
 LOCAL_CFLAGS := \
        -DUSE_SSE41
 endif
@@ -58,7 +60,7 @@ LOCAL_C_INCLUDES := \
 	$(MESA_TOP)/src/gallium/auxiliary \
 	$(MESA_TOP)/src/gallium/include
 
-LOCAL_WHOLE_STATIC_LIBRARIES := \
+LOCAL_WHOLE_STATIC_LIBRARIES += \
 	libmesa_program
 
 include $(LOCAL_PATH)/Android.gen.mk
diff --git a/src/mesa/Android.mk b/src/mesa/Android.mk
index 20f781948be..9a1aef8b28e 100644
--- a/src/mesa/Android.mk
+++ b/src/mesa/Android.mk
@@ -24,5 +24,6 @@ include $(LOCAL_PATH)/Android.mesa_gen_matypes.mk
 include $(LOCAL_PATH)/Android.libmesa_glsl_utils.mk
 include $(LOCAL_PATH)/Android.libmesa_dricore.mk
 include $(LOCAL_PATH)/Android.libmesa_st_mesa.mk
+include $(LOCAL_PATH)/Android.libmesa_sse41.mk
 
 include $(LOCAL_PATH)/program/Android.mk
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index 54601a956fd..7425f01273d 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -587,6 +587,10 @@ X86_64_FILES =		\
 	x86-64/x86-64.h	\
 	x86-64/xform4.S
 
+X86_SSE41_FILES = \
+	main/streaming-load-memcpy.c \
+	main/sse_minmax.c
+
 SPARC_FILES =			\
 	sparc/sparc.h		\
 	sparc/sparc_clip.S	\

From 8975527f58afd4af77966c6e46b485fc04008779 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 2 Feb 2016 14:23:07 -0600
Subject: [PATCH 218/238] egl: Add EGL_FRAMEBUFFER_TARGET_ANDROID attribute

This is used by Android to select an eglconfig compatible with HWComposer.

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Rob Herring <robh@kernel.org>
[Emil Velikov: add the _eglIsConfigAttribValid check]
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
---
 src/egl/main/eglapi.c     | 1 +
 src/egl/main/eglconfig.c  | 7 ++++++-
 src/egl/main/eglconfig.h  | 2 ++
 src/egl/main/egldisplay.h | 1 +
 4 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index dd145a1195e..6c395bdf311 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -381,6 +381,7 @@ _eglCreateExtensionsString(_EGLDisplay *dpy)
    char *exts = dpy->ExtensionsString;
 
    /* Please keep these sorted alphabetically. */
+   _EGL_CHECK_EXTENSION(ANDROID_framebuffer_target);
    _EGL_CHECK_EXTENSION(ANDROID_image_native_buffer);
 
    _EGL_CHECK_EXTENSION(CHROMIUM_sync_control);
diff --git a/src/egl/main/eglconfig.c b/src/egl/main/eglconfig.c
index d79c0e15422..7d2791ca340 100644
--- a/src/egl/main/eglconfig.c
+++ b/src/egl/main/eglconfig.c
@@ -245,7 +245,10 @@ static const struct {
    /* extensions */
    { EGL_Y_INVERTED_NOK,            ATTRIB_TYPE_BOOLEAN,
                                     ATTRIB_CRITERION_EXACT,
-                                    EGL_DONT_CARE }
+                                    EGL_DONT_CARE },
+   { EGL_FRAMEBUFFER_TARGET_ANDROID, ATTRIB_TYPE_BOOLEAN,
+                                    ATTRIB_CRITERION_EXACT,
+                                    EGL_DONT_CARE },
 };
 
 
@@ -488,6 +491,8 @@ _eglIsConfigAttribValid(_EGLConfig *conf, EGLint attr)
    switch (attr) {
    case EGL_Y_INVERTED_NOK:
       return conf->Display->Extensions.NOK_texture_from_pixmap;
+   case EGL_FRAMEBUFFER_TARGET_ANDROID:
+      return conf->Display->Extensions.ANDROID_framebuffer_target;
    default:
       break;
    }
diff --git a/src/egl/main/eglconfig.h b/src/egl/main/eglconfig.h
index 84cb2276b70..7bdb090c46c 100644
--- a/src/egl/main/eglconfig.h
+++ b/src/egl/main/eglconfig.h
@@ -86,6 +86,7 @@ struct _egl_config
 
    /* extensions */
    EGLint YInvertedNOK;
+   EGLint FramebufferTargetAndroid;
 };
 
 
@@ -133,6 +134,7 @@ _eglOffsetOfConfig(EGLint attr)
    ATTRIB_MAP(EGL_CONFORMANT,                Conformant);
    /* extensions */
    ATTRIB_MAP(EGL_Y_INVERTED_NOK,            YInvertedNOK);
+   ATTRIB_MAP(EGL_FRAMEBUFFER_TARGET_ANDROID, FramebufferTargetAndroid);
 #undef ATTRIB_MAP
    default:
       return -1;
diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h
index cec6d59e6a4..a468a312784 100644
--- a/src/egl/main/egldisplay.h
+++ b/src/egl/main/egldisplay.h
@@ -90,6 +90,7 @@ struct _egl_resource
 struct _egl_extensions
 {
    /* Please keep these sorted alphabetically. */
+   EGLBoolean ANDROID_framebuffer_target;
    EGLBoolean ANDROID_image_native_buffer;
 
    EGLBoolean CHROMIUM_sync_control;

From e21e81aa1885287e438970429d44abb8b3dabb96 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 2 Feb 2016 14:23:08 -0600
Subject: [PATCH 219/238] egl: Add EGL_RECORDABLE_ANDROID attribute

This is used by Android to select an eglconfig compatible with screen
recording.

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Rob Herring <robh@kernel.org>
[Emil Velikov: add the _eglIsConfigAttribValid check]
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
---
 src/egl/main/eglapi.c     | 1 +
 src/egl/main/eglconfig.c  | 5 +++++
 src/egl/main/eglconfig.h  | 2 ++
 src/egl/main/egldisplay.h | 1 +
 4 files changed, 9 insertions(+)

diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index 6c395bdf311..8886759011a 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -383,6 +383,7 @@ _eglCreateExtensionsString(_EGLDisplay *dpy)
    /* Please keep these sorted alphabetically. */
    _EGL_CHECK_EXTENSION(ANDROID_framebuffer_target);
    _EGL_CHECK_EXTENSION(ANDROID_image_native_buffer);
+   _EGL_CHECK_EXTENSION(ANDROID_recordable);
 
    _EGL_CHECK_EXTENSION(CHROMIUM_sync_control);
 
diff --git a/src/egl/main/eglconfig.c b/src/egl/main/eglconfig.c
index 7d2791ca340..435d9245384 100644
--- a/src/egl/main/eglconfig.c
+++ b/src/egl/main/eglconfig.c
@@ -249,6 +249,9 @@ static const struct {
    { EGL_FRAMEBUFFER_TARGET_ANDROID, ATTRIB_TYPE_BOOLEAN,
                                     ATTRIB_CRITERION_EXACT,
                                     EGL_DONT_CARE },
+   { EGL_RECORDABLE_ANDROID,        ATTRIB_TYPE_BOOLEAN,
+                                    ATTRIB_CRITERION_EXACT,
+                                    EGL_DONT_CARE },
 };
 
 
@@ -493,6 +496,8 @@ _eglIsConfigAttribValid(_EGLConfig *conf, EGLint attr)
       return conf->Display->Extensions.NOK_texture_from_pixmap;
    case EGL_FRAMEBUFFER_TARGET_ANDROID:
       return conf->Display->Extensions.ANDROID_framebuffer_target;
+   case EGL_RECORDABLE_ANDROID:
+      return conf->Display->Extensions.ANDROID_recordable;
    default:
       break;
    }
diff --git a/src/egl/main/eglconfig.h b/src/egl/main/eglconfig.h
index 7bdb090c46c..22da697e83c 100644
--- a/src/egl/main/eglconfig.h
+++ b/src/egl/main/eglconfig.h
@@ -87,6 +87,7 @@ struct _egl_config
    /* extensions */
    EGLint YInvertedNOK;
    EGLint FramebufferTargetAndroid;
+   EGLint RecordableAndroid;
 };
 
 
@@ -135,6 +136,7 @@ _eglOffsetOfConfig(EGLint attr)
    /* extensions */
    ATTRIB_MAP(EGL_Y_INVERTED_NOK,            YInvertedNOK);
    ATTRIB_MAP(EGL_FRAMEBUFFER_TARGET_ANDROID, FramebufferTargetAndroid);
+   ATTRIB_MAP(EGL_RECORDABLE_ANDROID,        RecordableAndroid);
 #undef ATTRIB_MAP
    default:
       return -1;
diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h
index a468a312784..6bfc8589a42 100644
--- a/src/egl/main/egldisplay.h
+++ b/src/egl/main/egldisplay.h
@@ -92,6 +92,7 @@ struct _egl_extensions
    /* Please keep these sorted alphabetically. */
    EGLBoolean ANDROID_framebuffer_target;
    EGLBoolean ANDROID_image_native_buffer;
+   EGLBoolean ANDROID_recordable;
 
    EGLBoolean CHROMIUM_sync_control;
 

From 952720ccee0b4e97729e1972cf22b463641f1569 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Tue, 2 Feb 2016 14:23:11 -0600
Subject: [PATCH 220/238] egl: android: enable EGL_FRAMEBUFFER_TARGET_ANDROID
 and EGL_RECORDABLE_ANDROID

Set EGL_FRAMEBUFFER_TARGET_ANDROID and EGL_RECORDABLE_ANDROID config
attributes to true for Android. These are required in Marshmallow.

The definition of the EGL_RECORDABLE_ANDROID extension gives
implementations 2 options. Android implements the 2nd option, which is
that the encoder must support RGB input. The requested input format
is RGB888, so setting the attribute on all the native Android visual
formats should be sufficient.

Similarly, setting EGL_FRAMEBUFFER_TARGET_ANDROID for all configs with
an EGL_NATIVE_VISUAL_ID should be sufficient. Most likely, the HWC should
support the same set of formats the underlying DRM driver supports.

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-by: Emil Velikov <emil.velikov@collabora.com>
---
 src/egl/drivers/dri2/platform_android.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c
index 7d546650272..41840aa7b2a 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -537,6 +537,8 @@ droid_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *dpy)
    EGLint config_attrs[] = {
      EGL_NATIVE_VISUAL_ID,   0,
      EGL_NATIVE_VISUAL_TYPE, 0,
+     EGL_FRAMEBUFFER_TARGET_ANDROID, EGL_TRUE,
+     EGL_RECORDABLE_ANDROID, EGL_TRUE,
      EGL_NONE
    };
    int count, i, j;
@@ -714,7 +716,9 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *dpy)
       goto cleanup_screen;
    }
 
+   dpy->Extensions.ANDROID_framebuffer_target = EGL_TRUE;
    dpy->Extensions.ANDROID_image_native_buffer = EGL_TRUE;
+   dpy->Extensions.ANDROID_recordable = EGL_TRUE;
    dpy->Extensions.KHR_image_base = EGL_TRUE;
 
    /* Fill vtbl last to prevent accidentally calling virtual function during

From e09d04cd56eaca4db027c7faa3f92f4fb18b6751 Mon Sep 17 00:00:00 2001
From: Mauro Rossi <issor.oruam@gmail.com>
Date: Sun, 21 Feb 2016 20:57:47 +0100
Subject: [PATCH 221/238] radeonsi: use util_strchrnul() to fix android build
 error

Android Bionic does not support the strchrnul() string function, but the
gallium auxiliary header util/u_string.h provides util_strchrnul().

This change avoids the following build error:

external/mesa/src/gallium/drivers/radeonsi/si_shader.c:3863: error:
undefined reference to 'strchrnul'
collect2: error: ld returned 1 exit status

Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
---
 src/gallium/drivers/radeonsi/si_shader.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 4176e9f1ce5..56c575948ab 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -39,6 +39,7 @@
 #include "radeon/radeon_llvm_emit.h"
 #include "util/u_memory.h"
 #include "util/u_pstipple.h"
+#include "util/u_string.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_build.h"
 #include "tgsi/tgsi_util.h"
@@ -4995,7 +4996,7 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
 
 			line = binary->disasm_string;
 			while (*line) {
-				p = strchrnul(line, '\n');
+				p = util_strchrnul(line, '\n');
 				count = p - line;
 
 				if (count) {

From 58557b345c1382aeeef747060ba14d9edc6362de Mon Sep 17 00:00:00 2001
From: Brian Paul <brianp@vmware.com>
Date: Thu, 31 Mar 2016 10:30:07 -0600
Subject: [PATCH 222/238] docs: minor updates to license.html file

Mesa demos are no longer part of the main Mesa tree/tarball.
Add Gallium and GLX code to list of major components.

Reviewed-by: Jose Fonseca <jfonseca@vmware.com>
---
 docs/license.html | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/docs/license.html b/docs/license.html
index d56823f6dd1..28488325d63 100644
--- a/docs/license.html
+++ b/docs/license.html
@@ -46,10 +46,10 @@ library</em>. <br>
 
 <p>
 The Mesa distribution consists of several components.  Different copyrights
-and licenses apply to different components.  For example, some demo programs
-are copyrighted by SGI, some of the Mesa device drivers are copyrighted by
-their authors.  See below for a list of Mesa's main components and the license
-for each.
+and licenses apply to different components.
+For example, the GLX client code uses the SGI Free Software License B, and
+some of the Mesa device drivers are copyrighted by their authors.
+See below for a list of Mesa's main components and the license for each.
 </p>
 <p>
 The core Mesa library is licensed according to the terms of the MIT license.
@@ -97,13 +97,17 @@ and their respective licenses.
 <pre>
 Component         Location               License
 ------------------------------------------------------------------
-Main Mesa code    src/mesa/              Mesa (MIT)
+Main Mesa code    src/mesa/              MIT
 
 Device drivers    src/mesa/drivers/*     MIT, generally
 
+Gallium code      src/gallium/           MIT
+
 Ext headers       include/GL/glext.h     Khronos
                   include/GL/glxext.h
 
+GLX client code   src/glx/               SGI Free Software License B
+
 C11 thread        include/c11/threads*.h Boost (permissive)
 emulation
 </pre>

From 972054f5bfa3f0349a44db7cf508d611a0832e52 Mon Sep 17 00:00:00 2001
From: Rob Clark <robclark@freedesktop.org>
Date: Tue, 29 Mar 2016 10:49:03 -0400
Subject: [PATCH 223/238] compiler: random comment fixup

Just noticed this in passing..  gl_shader_stage already has tess so this
comment no longer applies.

Signed-off-by: Rob Clark <robclark@freedesktop.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/compiler/shader_enums.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/compiler/shader_enums.h b/src/compiler/shader_enums.h
index d4326c55a9d..07ae9ee2de7 100644
--- a/src/compiler/shader_enums.h
+++ b/src/compiler/shader_enums.h
@@ -31,7 +31,7 @@ extern "C" {
 #endif
 
 /**
- * Shader stages. Note that these will become 5 with tessellation.
+ * Shader stages.
  *
  * The order must match how shaders are ordered in the pipeline.
  * The GLSL linker assumes that if i<j, then the j-th shader is

From f72de6f3863049106288b7dd66efeb64c822fb17 Mon Sep 17 00:00:00 2001
From: Jose Fonseca <jfonseca@vmware.com>
Date: Fri, 1 Apr 2016 21:17:18 +0100
Subject: [PATCH 224/238] gallivm: Prevent disassembly debug output from being
 truncated.

Do this by using os_log_message directly, as _debug_vprintf truncates
messages to 4K.

Also cleanup the disassemble interface.

Spotted by Roland.

Trivial.
---
 src/gallium/auxiliary/gallivm/lp_bld_debug.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
index efaf2fa306a..11e9f92189f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -128,7 +128,7 @@ lp_debug_dump_value(LLVMValueRef value)
  * - http://blog.llvm.org/2010/04/intro-to-llvm-mc-project.html
  */
 static size_t
-disassemble(const void* func, std::stringstream &buffer)
+disassemble(const void* func, std::ostream &buffer)
 {
    const uint8_t *bytes = (const uint8_t *)func;
 
@@ -235,15 +235,16 @@ disassemble(const void* func, std::stringstream &buffer)
 
 
 extern "C" void
-lp_disassemble(LLVMValueRef func, const void *code) {
-   std::stringstream buffer;
+lp_disassemble(LLVMValueRef func, const void *code)
+{
+   std::ostringstream buffer;
    std::string s;
 
    buffer << LLVMGetValueName(func) << ":\n";
    disassemble(code, buffer);
    s = buffer.str();
-   _debug_printf("%s", s.c_str());
-   _debug_printf("\n");
+   os_log_message(s.c_str());
+   os_log_message("\n");
 }
 
 
@@ -259,7 +260,6 @@ extern "C" void
 lp_profile(LLVMValueRef func, const void *code)
 {
 #if defined(__linux__) && defined(PROFILE)
-   std::stringstream buffer;
    static std::ofstream perf_asm_file;
    static boolean first_time = TRUE;
    static FILE *perf_map_file = NULL;
@@ -283,9 +283,9 @@ lp_profile(LLVMValueRef func, const void *code)
    if (perf_map_file) {
       const char *symbol = LLVMGetValueName(func);
       unsigned long addr = (uintptr_t)code;
-      buffer << symbol << ":\n";
-      unsigned long size = disassemble(code, buffer);
-      perf_asm_file << buffer.rdbuf() << std::flush;
+      perf_asm_file << symbol << ":\n";
+      unsigned long size = disassemble(code, perf_asm_file);
+      perf_asm_file.flush();
       fprintf(perf_map_file, "%lx %lx %s\n", addr, size, symbol);
       fflush(perf_map_file);
    }

From debd9105122586ee00c0d20a73bb4e3191c50e70 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Wed, 24 Feb 2016 17:03:57 +0100
Subject: [PATCH 225/238] nvc0: bind driver cb for compute on c7[] for Kepler

Instead of using the screen->parm buffer object, which will be removed,
upload auxiliary constants to uniform_bo to be consistent with
what we already do for Fermi.

This breaks surfaces support (for compute only) but this will be
properly re-introduced later for ARB_shader_image_load_store.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/nvc0/nvc0_context.h       |  6 +++
 .../drivers/nouveau/nvc0/nvc0_program.c       | 11 +++--
 .../drivers/nouveau/nvc0/nve4_compute.c       | 40 ++++++++++++-------
 .../drivers/nouveau/nvc0/nve4_compute.h       | 25 ------------
 4 files changed, 37 insertions(+), 45 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 31e1272aeed..f4f2d0b9780 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -110,6 +110,12 @@
 /* 32 textures handles, at 1 32-bits integer each */
 #define NVC0_CB_AUX_TEX_INFO(i)     0x020 + (i) * 4
 #define NVC0_CB_AUX_TEX_SIZE        (32 * 4)
+/* 8 sets of 32-bits coordinate offsets */
+#define NVC0_CB_AUX_MS_INFO         0x0a0 /* CP */
+#define NVC0_CB_AUX_MS_SIZE         (8 * 2 * 4)
+/* block/grid size, at 3 32-bits integers each and gridid */
+#define NVC0_CB_AUX_GRID_INFO       0x0e0 /* CP */
+#define NVC0_CB_AUX_GRID_SIZE       (7 * 4)
 /* 8 user clip planes, at 4 32-bits floats each */
 #define NVC0_CB_AUX_UCP_INFO        0x100
 #define NVC0_CB_AUX_UCP_SIZE        (PIPE_MAX_CLIP_PLANES * 4 * 4)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index a3433f4a10a..d76b48fa4c9 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -540,17 +540,16 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
 
    if (prog->type == PIPE_SHADER_COMPUTE) {
       if (chipset >= NVISA_GK104_CHIPSET) {
-         info->io.auxCBSlot = 0;
-         info->io.texBindBase = NVE4_CP_INPUT_TEX(0);
-         info->io.suInfoBase = NVE4_CP_INPUT_SUF(0);
-         info->prop.cp.gridInfoBase = NVE4_CP_INPUT_GRID_INFO(0);
+         info->io.auxCBSlot = 7;
+         info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
+         info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO;
          info->io.bufInfoBase = 0; /* TODO */
       } else {
          info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
-         info->io.suInfoBase = 0; /* TODO */
       }
       info->io.msInfoCBSlot = 0;
-      info->io.msInfoBase = NVE4_CP_INPUT_MS_OFFSETS;
+      info->io.msInfoBase = NVC0_CB_AUX_MS_INFO;
+      info->io.suInfoBase = 0; /* TODO */
    } else {
       if (chipset >= NVISA_GK104_CHIPSET) {
          info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index b3d841461d6..cae4838be38 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -41,6 +41,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
    int i;
    int ret;
    uint32_t obj_class;
+   uint64_t address;
 
    switch (dev->chipset & ~0xf) {
    case 0x100:
@@ -65,7 +66,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
       return ret;
    }
 
-   ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, NVE4_CP_PARAM_SIZE, NULL,
+   ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, 1 << 12, NULL,
                         &screen->parm);
    if (ret)
       return ret;
@@ -128,15 +129,17 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
    }
 
    BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1);
-   PUSH_DATA (push, 0); /* does not interefere with 3D */
+   PUSH_DATA (push, 7); /* does not interfere with 3D */
 
    if (obj_class == NVF0_COMPUTE_CLASS)
       IMMED_NVC0(push, SUBC_CP(0x02c4), 1);
 
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
+
    /* MS sample coordinate offsets: these do not work with _ALT modes ! */
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
-   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO);
+   PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO);
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
    PUSH_DATA (push, 64);
    PUSH_DATA (push, 1);
@@ -159,7 +162,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
    PUSH_DATA (push, 3); /* 7 */
    PUSH_DATA (push, 1);
 
-#ifdef DEBUG
+#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
    PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
    PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
@@ -194,6 +197,9 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
    uint32_t mask;
    unsigned i;
    const unsigned t = 1;
+   uint64_t address;
+
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
 
    mask = nvc0->surfaces_dirty[t];
    while (mask) {
@@ -205,8 +211,8 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
        * directly instead of via binding points, so we have to supply them.
        */
       BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-      PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_SUF(i));
-      PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_SUF(i));
+      PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(i));
+      PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(i));
       BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
       PUSH_DATA (push, 64);
       PUSH_DATA (push, 1);
@@ -271,6 +277,7 @@ static void
 nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
    uint64_t address;
    const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE);
    unsigned i, n;
@@ -282,11 +289,11 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
    n = util_logbase2(dirty) + 1 - i;
    assert(n);
 
-   address = nvc0->screen->parm->offset + NVE4_CP_INPUT_TEX(i);
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
 
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, address);
-   PUSH_DATA (push, address);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i));
+   PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i));
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
    PUSH_DATA (push, n * 4);
    PUSH_DATA (push, 0x1);
@@ -334,6 +341,9 @@ nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
    struct nvc0_screen *screen = nvc0->screen;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nvc0_program *cp = nvc0->compprog;
+   uint64_t address;
+
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
 
    if (cp->parm_size) {
       BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
@@ -347,8 +357,8 @@ nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
       PUSH_DATAp(push, input, cp->parm_size / 4);
    }
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0));
-   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0));
+   PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO);
+   PUSH_DATA (push, address + NVC0_CB_AUX_GRID_INFO);
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
    PUSH_DATA (push, 7 * 4);
    PUSH_DATA (push, 0x1);
@@ -408,7 +418,9 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
       if (nvc0->constbuf[s][i].u.buf)
          nve4_cp_launch_desc_set_ctx_cb(desc, i + 1, &nvc0->constbuf[s][i]);
    }
-   nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE);
+   nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, 1 << 12);
+   nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
+                              NVC0_CB_AUX_INFO(5), 1 << 10);
 }
 
 static inline struct nve4_cp_launch_desc *
@@ -495,7 +507,7 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    const unsigned s = 5;
    unsigned i;
-   uint32_t commands[2][NVE4_CP_INPUT_TEX_MAX];
+   uint32_t commands[2][32];
    unsigned n[2] = { 0, 0 };
 
    for (i = 0; i < nvc0->num_textures[s]; ++i) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
index 84f8593b9b6..dcafbeda397 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
@@ -4,31 +4,6 @@
 
 #include "nvc0/nve4_compute.xml.h"
 
-/* Input space is implemented as c0[], to which we bind the screen->parm bo.
- */
-#define NVE4_CP_INPUT_USER           0x0000
-#define NVE4_CP_INPUT_USER_LIMIT     0x1000
-#define NVE4_CP_INPUT_GRID_INFO(i)  (0x1000 + (i) * 4)
-#define NVE4_CP_INPUT_NTID(i)       (0x1000 + (i) * 4)
-#define NVE4_CP_INPUT_NCTAID(i)     (0x100c + (i) * 4)
-#define NVE4_CP_INPUT_GRIDID         0x1018
-#define NVE4_CP_INPUT_TEX(i)        (0x1040 + (i) * 4)
-#define NVE4_CP_INPUT_TEX_STRIDE     4
-#define NVE4_CP_INPUT_TEX_MAX        32
-#define NVE4_CP_INPUT_MS_OFFSETS     0x10c0
-#define NVE4_CP_INPUT_SUF_STRIDE     64
-#define NVE4_CP_INPUT_SUF(i)        (0x1100 + (i) * NVE4_CP_INPUT_SUF_STRIDE)
-#define NVE4_CP_INPUT_SUF_MAX        32
-#define NVE4_CP_INPUT_TRAP_INFO_PTR  0x1900
-#define NVE4_CP_INPUT_TEMP_PTR       0x1908
-#define NVE4_CP_INPUT_MP_TEMP_SIZE   0x1910
-#define NVE4_CP_INPUT_WARP_TEMP_SIZE 0x1914
-#define NVE4_CP_INPUT_CSTACK_SIZE    0x1918
-#define NVE4_CP_INPUT_SIZE           0x1a00
-#define NVE4_CP_PARAM_TRAP_INFO      0x2000
-#define NVE4_CP_PARAM_TRAP_INFO_SZ  (1 << 16)
-#define NVE4_CP_PARAM_SIZE          (NVE4_CP_PARAM_TRAP_INFO + (1 << 16))
-
 struct nve4_cp_launch_desc
 {
    u32 unk0[8];

From 1828d90a0084a4bfce4f1bff8cac8a87d1dfcd40 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 11 Jan 2016 16:05:59 +0100
Subject: [PATCH 226/238] nvc0: bind shader buffers for compute on Kepler

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/nvc0/nvc0_program.c       |  4 +-
 .../drivers/nouveau/nvc0/nve4_compute.c       | 38 +++++++++++++++++++
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index d76b48fa4c9..9df99bd4356 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -543,12 +543,10 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
          info->io.auxCBSlot = 7;
          info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
          info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO;
-         info->io.bufInfoBase = 0; /* TODO */
-      } else {
-         info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
       }
       info->io.msInfoCBSlot = 0;
       info->io.msInfoBase = NVC0_CB_AUX_MS_INFO;
+      info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
       info->io.suInfoBase = 0; /* TODO */
    } else {
       if (chipset >= NVISA_GK104_CHIPSET) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index cae4838be38..ccf5aef5c64 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -308,6 +308,43 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
    nvc0->samplers_dirty[s] = 0;
 }
 
+static void
+nve4_compute_validate_buffers(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   uint64_t address;
+   const int s = 5;
+   int i;
+
+   address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
+
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(0));
+   PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(0));
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+   PUSH_DATA (push, 4 * NVC0_MAX_BUFFERS * 4);
+   PUSH_DATA (push, 0x1);
+   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4 * NVC0_MAX_BUFFERS);
+   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+
+   for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
+      if (nvc0->buffers[s][i].buffer) {
+         struct nv04_resource *res =
+            nv04_resource(nvc0->buffers[s][i].buffer);
+         PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset);
+         PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset);
+         PUSH_DATA (push, nvc0->buffers[s][i].buffer_size);
+         PUSH_DATA (push, 0);
+         BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR);
+      } else {
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 0);
+      }
+   }
+}
+
 static struct nvc0_state_validate
 validate_list_cp[] = {
    { nvc0_compprog_validate,              NVC0_NEW_CP_PROGRAM     },
@@ -317,6 +354,7 @@ validate_list_cp[] = {
                                           NVC0_NEW_CP_SAMPLERS    },
    { nve4_compute_validate_surfaces,      NVC0_NEW_CP_SURFACES    },
    { nvc0_compute_validate_globals,       NVC0_NEW_CP_GLOBALS     },
+   { nve4_compute_validate_buffers,       NVC0_NEW_CP_BUFFERS     },
 };
 
 static bool

From 12aa047c98e597a109b387e9b71cd87bff0dea0a Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 11 Jan 2016 21:22:58 +0100
Subject: [PATCH 227/238] nvc0: bind user uniforms for compute on Kepler

Uniform buffer objects will be stuck to the driver constant buffer,
like shader buffers, because the launch descriptor only allows 8 CBs.

Input kernel parameters for OpenCL are still uploaded to screen->parm
which is bound on c0, but this will be changed later with a new series.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/nvc0/nve4_compute.c       | 63 ++++++++++++++++---
 .../drivers/nouveau/nvc0/nve4_compute.h       | 19 +-----
 2 files changed, 55 insertions(+), 27 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index ccf5aef5c64..5f340926971 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -308,6 +308,42 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
    nvc0->samplers_dirty[s] = 0;
 }
 
+static void
+nve4_compute_validate_constbufs(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   const int s = 5;
+
+   while (nvc0->constbuf_dirty[s]) {
+      int i = ffs(nvc0->constbuf_dirty[s]) - 1;
+      nvc0->constbuf_dirty[s] &= ~(1 << i);
+
+      if (nvc0->constbuf[s][i].user) {
+         struct nouveau_bo *bo = nvc0->screen->uniform_bo;
+         const unsigned base = NVC0_CB_USR_INFO(s);
+         const unsigned size = nvc0->constbuf[s][0].size;
+         assert(i == 0); /* we really only want OpenGL uniforms here */
+         assert(nvc0->constbuf[s][0].u.data);
+
+         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+         PUSH_DATAh(push, bo->offset + base);
+         PUSH_DATA (push, bo->offset + base);
+         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+         PUSH_DATA (push, size);
+         PUSH_DATA (push, 0x1);
+         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (size / 4));
+         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+         PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4);
+      }
+      else {
+         /* TODO: will be updated in the next commit */
+      }
+   }
+
+   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
+   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
+}
+
 static void
 nve4_compute_validate_buffers(struct nvc0_context *nvc0)
 {
@@ -355,6 +391,7 @@ validate_list_cp[] = {
    { nve4_compute_validate_surfaces,      NVC0_NEW_CP_SURFACES    },
    { nvc0_compute_validate_globals,       NVC0_NEW_CP_GLOBALS     },
    { nve4_compute_validate_buffers,       NVC0_NEW_CP_BUFFERS     },
+   { nve4_compute_validate_constbufs,     NVC0_NEW_CP_CONSTBUF    },
 };
 
 static bool
@@ -372,7 +409,9 @@ nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
 }
 
 static void
-nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
+nve4_compute_upload_input(struct nvc0_context *nvc0,
+                          struct nve4_cp_launch_desc *desc,
+                          const void *input,
                           const uint *block_layout,
                           const uint *grid_layout)
 {
@@ -393,6 +432,11 @@ nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
       BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
       PUSH_DATAp(push, input, cp->parm_size / 4);
+
+      /* Bind user parameters coming from clover. */
+      /* TODO: This should be harmonized with uniform_bo. */
+      assert(!(desc->cb_mask & (1 << 0)));
+      nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, 1 << 12);
    }
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
    PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO);
@@ -429,7 +473,6 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
 {
    const struct nvc0_screen *screen = nvc0->screen;
    const struct nvc0_program *cp = nvc0->compprog;
-   unsigned i;
 
    nve4_cp_launch_desc_init_default(desc);
 
@@ -451,12 +494,13 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
    desc->gpr_alloc = cp->num_gprs;
    desc->bar_alloc = cp->num_barriers;
 
-   for (i = 0; i < 7; ++i) {
-      const unsigned s = 5;
-      if (nvc0->constbuf[s][i].u.buf)
-         nve4_cp_launch_desc_set_ctx_cb(desc, i + 1, &nvc0->constbuf[s][i]);
+   // Only bind OpenGL uniforms and the driver constant buffer through the
+   // launch descriptor because UBOs are stuck to the driver cb to avoid the
+   // limitation of 8 CBs.
+   if (nvc0->constbuf[5][0].user) {
+      nve4_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
+                                 NVC0_CB_USR_INFO(5), 1 << 16);
    }
-   nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, 1 << 12);
    nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
                               NVC0_CB_AUX_INFO(5), 1 << 10);
 }
@@ -500,13 +544,14 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
 
    nve4_compute_setup_launch_desc(nvc0, desc, info->pc,
                                   info->block, info->grid);
+
+   nve4_compute_upload_input(nvc0, desc, info->input, info->block, info->grid);
+
 #ifdef DEBUG
    if (debug_get_num_option("NV50_PROG_DEBUG", 0))
       nve4_compute_dump_launch_desc(desc);
 #endif
 
-   nve4_compute_upload_input(nvc0, info->input, info->block, info->grid);
-
    /* upload descriptor and flush */
 #if 0
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
index dcafbeda397..b98c65d4a09 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
@@ -56,7 +56,7 @@ static inline void
 nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
                            unsigned index,
                            struct nouveau_bo *bo,
-                           uint32_t base, uint16_t size)
+                           uint32_t base, uint32_t size)
 {
    uint64_t address = bo->offset + base;
 
@@ -70,23 +70,6 @@ nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
    desc->cb_mask |= 1 << index;
 }
 
-static inline void
-nve4_cp_launch_desc_set_ctx_cb(struct nve4_cp_launch_desc *desc,
-                               unsigned index,
-                               const struct nvc0_constbuf *cb)
-{
-   assert(index < 8);
-
-   if (!cb->u.buf) {
-      desc->cb_mask &= ~(1 << index);
-   } else {
-      const struct nv04_resource *buf = nv04_resource(cb->u.buf);
-      assert(!cb->user);
-      nve4_cp_launch_desc_set_cb(desc, index,
-                                 buf->bo, buf->offset + cb->offset, cb->size);
-   }
-}
-
 struct nve4_mp_trap_info {
    u32 lock;
    u32 pc;

From e2e8085fac13a7af33feaf11a9c085467d257490 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Thu, 31 Mar 2016 00:50:23 +0200
Subject: [PATCH 228/238] nvc0: store ubo info to the driver constbuf on Kepler

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/codegen/nv50_ir_driver.h  |  1 +
 .../drivers/nouveau/nvc0/nvc0_context.h       |  3 +++
 .../drivers/nouveau/nvc0/nvc0_program.c       |  1 +
 .../drivers/nouveau/nvc0/nve4_compute.c       | 26 ++++++++++++++++++-
 4 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index 54c53c98325..c7f8567cadb 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -181,6 +181,7 @@ struct nv50_ir_prog_info
       uint16_t sampleInfoBase;   /* base address for sample positions */
       uint8_t msInfoCBSlot;      /* cX[] used for multisample info */
       uint16_t msInfoBase;       /* base address for multisample info */
+      uint16_t uboInfoBase;      /* base address for compute UBOs (gk104+) */
    } io;
 
    /* driver callback to assign input/output locations */
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index f4f2d0b9780..91dffa116e1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -119,6 +119,9 @@
 /* 8 user clip planes, at 4 32-bits floats each */
 #define NVC0_CB_AUX_UCP_INFO        0x100
 #define NVC0_CB_AUX_UCP_SIZE        (PIPE_MAX_CLIP_PLANES * 4 * 4)
+/* 13 ubos, at 4 32-bits integer each */
+#define NVC0_CB_AUX_UBO_INFO(i)     0x100 + (i) * 4 * 4 /* CP */
+#define NVC0_CB_AUX_UBO_SIZE        ((NVC0_MAX_PIPE_CONSTBUFS - 1) * 4 * 4)
 /* 8 sets of 32-bits integer pairs sample offsets */
 #define NVC0_CB_AUX_SAMPLE_INFO     0x180 /* FP */
 #define NVC0_CB_AUX_SAMPLE_SIZE     (8 * 4 * 2)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index 9df99bd4356..db02fa2df5c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -543,6 +543,7 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
          info->io.auxCBSlot = 7;
          info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
          info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO;
+         info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO(0);
       }
       info->io.msInfoCBSlot = 0;
       info->io.msInfoBase = NVC0_CB_AUX_MS_INFO;
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index 5f340926971..b1450f80023 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -336,7 +336,31 @@ nve4_compute_validate_constbufs(struct nvc0_context *nvc0)
          PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4);
       }
       else {
-         /* TODO: will be updated in the next commit */
+         struct nv04_resource *res =
+            nv04_resource(nvc0->constbuf[s][i].u.buf);
+         if (res) {
+            uint64_t address
+               = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
+
+            assert(i > 0); /* we really only want uniform buffer objects */
+
+            BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+            PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+            PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+            BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+            PUSH_DATA (push, 4 * 4);
+            PUSH_DATA (push, 0x1);
+            BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
+            PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+
+            PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
+            PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
+            PUSH_DATA (push, nvc0->constbuf[5][i].size);
+            PUSH_DATA (push, 0);
+            BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
+
+            res->cb_bindings[s] |= 1 << i;
+         }
       }
    }
 

From 7797d5f7d9b367f96200093cbe166c4478eae65e Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 22 Feb 2016 21:44:25 +0100
Subject: [PATCH 229/238] nvc0: reduce likelihood of collision for real buffers
 on Kepler

Reduce likelihood of collision with real buffers by placing the
hole at the top of the 4G area. This fixes some indirect draw+compute
tests with large buffers.

Suggested by Ilia Mirkin.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nve4_compute.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index b1450f80023..04ede3e51e1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -96,9 +96,9 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
     *  accessible. We cannot prevent that at the moment, so expect failure.
     */
    BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1);
-   PUSH_DATA (push, 1 << 24);
+   PUSH_DATA (push, 0xff << 24);
    BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1);
-   PUSH_DATA (push, 2 << 24);
+   PUSH_DATA (push, 0xfe << 24);
 
    BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2);
    PUSH_DATAh(push, screen->text->offset);

From 3b246a71d7fe12c4b0670a9dadf566ea3eca1128 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Thu, 14 Jan 2016 18:24:53 +0100
Subject: [PATCH 230/238] nvc0: add indirect compute support on Kepler

The grid size is stored as three 32-bit integers in the indirect
buffer, but the launch descriptor packs griddim_y and griddim_z into a
single 32-bit integer as (z << 16) | y. To make it work, the 16 high
bits of griddim_y are overwritten by griddim_z.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../drivers/nouveau/nvc0/nve4_compute.c       | 111 ++++++++++++------
 1 file changed, 77 insertions(+), 34 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index 04ede3e51e1..4d069df983e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -435,9 +435,7 @@ nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
 static void
 nve4_compute_upload_input(struct nvc0_context *nvc0,
                           struct nve4_cp_launch_desc *desc,
-                          const void *input,
-                          const uint *block_layout,
-                          const uint *grid_layout)
+                          const struct pipe_grid_info *info)
 {
    struct nvc0_screen *screen = nvc0->screen;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
@@ -455,7 +453,7 @@ nve4_compute_upload_input(struct nvc0_context *nvc0,
       PUSH_DATA (push, 0x1);
       BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
-      PUSH_DATAp(push, input, cp->parm_size / 4);
+      PUSH_DATAp(push, info->input, cp->parm_size / 4);
 
       /* Bind user parameters coming from clover. */
       /* TODO: This should be harmonized with uniform_bo. */
@@ -468,10 +466,25 @@ nve4_compute_upload_input(struct nvc0_context *nvc0,
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
    PUSH_DATA (push, 7 * 4);
    PUSH_DATA (push, 0x1);
-   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
-   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
-   PUSH_DATAp(push, block_layout, 3);
-   PUSH_DATAp(push, grid_layout, 3);
+
+   if (unlikely(info->indirect)) {
+      struct nv04_resource *res = nv04_resource(info->indirect);
+      uint32_t offset = res->offset + info->indirect_offset;
+
+      nouveau_pushbuf_space(push, 16, 0, 1);
+      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+      PUSH_DATAp(push, info->block, 3);
+      nouveau_pushbuf_data(push, res->bo, offset,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
+   } else {
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+      PUSH_DATAp(push, info->block, 3);
+      PUSH_DATAp(push, info->grid, 3);
+   }
    PUSH_DATA (push, 0);
 
    BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
@@ -491,23 +504,21 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
 static void
 nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
                                struct nve4_cp_launch_desc *desc,
-                               uint32_t label,
-                               const uint *block_layout,
-                               const uint *grid_layout)
+                               const struct pipe_grid_info *info)
 {
    const struct nvc0_screen *screen = nvc0->screen;
    const struct nvc0_program *cp = nvc0->compprog;
 
    nve4_cp_launch_desc_init_default(desc);
 
-   desc->entry = nvc0_program_symbol_offset(cp, label);
+   desc->entry = nvc0_program_symbol_offset(cp, info->pc);
 
-   desc->griddim_x = grid_layout[0];
-   desc->griddim_y = grid_layout[1];
-   desc->griddim_z = grid_layout[2];
-   desc->blockdim_x = block_layout[0];
-   desc->blockdim_y = block_layout[1];
-   desc->blockdim_z = block_layout[2];
+   desc->griddim_x = info->grid[0];
+   desc->griddim_y = info->grid[1];
+   desc->griddim_z = info->grid[2];
+   desc->blockdim_x = info->block[0];
+   desc->blockdim_y = info->block[1];
+   desc->blockdim_z = info->block[2];
 
    desc->shared_size = align(cp->cp.smem_size, 0x100);
    desc->local_size_p = align(cp->cp.lmem_size, 0x10);
@@ -566,30 +577,62 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
    if (ret)
       goto out;
 
-   nve4_compute_setup_launch_desc(nvc0, desc, info->pc,
-                                  info->block, info->grid);
+   nve4_compute_setup_launch_desc(nvc0, desc, info);
 
-   nve4_compute_upload_input(nvc0, desc, info->input, info->block, info->grid);
+   nve4_compute_upload_input(nvc0, desc, info);
 
 #ifdef DEBUG
    if (debug_get_num_option("NV50_PROG_DEBUG", 0))
       nve4_compute_dump_launch_desc(desc);
 #endif
 
+   if (unlikely(info->indirect)) {
+      struct nv04_resource *res = nv04_resource(info->indirect);
+      uint32_t offset = res->offset + info->indirect_offset;
+
+      /* upload the descriptor */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr);
+      PUSH_DATA (push, desc_gpuaddr);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 256);
+      PUSH_DATA (push, 1);
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
+
+      /* overwrite griddim_x and griddim_y as two 32-bits integers even
+       * if griddim_y must be a 16-bits integer */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr + 48);
+      PUSH_DATA (push, desc_gpuaddr + 48);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 8);
+      PUSH_DATA (push, 1);
+
+      nouveau_pushbuf_space(push, 16, 0, 1);
+      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      nouveau_pushbuf_data(push, res->bo, offset,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 2 * 4);
+
+      /* overwrite the 16 high bits of griddim_y with griddim_z because
+       * we need (z << 16) | y */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr + 54);
+      PUSH_DATA (push, desc_gpuaddr + 54);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 4);
+      PUSH_DATA (push, 1);
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (4 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      nouveau_pushbuf_data(push, res->bo, offset + 8,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 1 * 4);
+   }
+
    /* upload descriptor and flush */
-#if 0
-   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, desc_gpuaddr);
-   PUSH_DATA (push, desc_gpuaddr);
-   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
-   PUSH_DATA (push, 256);
-   PUSH_DATA (push, 1);
-   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
-   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
-   PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
-   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
-   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);
-#endif
    BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
    PUSH_DATA (push, desc_gpuaddr >> 8);
    BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);

From 4f58b78c309db372d408912ca87e88d319b895da Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Thu, 31 Mar 2016 00:50:39 +0200
Subject: [PATCH 231/238] nvc0/ir: add support for compute UBOs on Kepler

Make sure to avoid out of bounds access in presence of indirect
array indexing by loading the size from the driver constant buffer.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 55 ++++++++++++++++++-
 .../nouveau/codegen/nv50_ir_lowering_nvc0.h   |  3 +
 2 files changed, 57 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 850147b62e9..da58ced4d7c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1321,6 +1321,24 @@ NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
    return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
 }
 
+inline Value *
+NVC0LoweringPass::loadUboInfo32(Value *ptr, uint32_t off)
+{
+   return loadResInfo32(ptr, off, prog->driver->io.uboInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
+{
+   return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
+{
+   return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
+}
+
 inline Value *
 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
 {
@@ -1711,7 +1729,42 @@ NVC0LoweringPass::handleLDST(Instruction *i)
          assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
       }
    } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
-      if (i->src(0).isIndirect(1)) {
+      if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
+          prog->getType() == Program::TYPE_COMPUTE) {
+         // The launch descriptor only allows to set up 8 CBs, but OpenGL
+         // requires at least 12 UBOs. To bypass this limitation, we store the
+         // addrs into the driver constbuf and we directly load from the global
+         // memory.
+         int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
+         Value *ind = i->getIndirect(0, 1);
+         Value *ptr = loadUboInfo64(ind, fileIndex * 16);
+
+         // TODO: clamp the offset to the maximum number of const buf.
+         if (i->src(0).isIndirect(1)) {
+            Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
+            Value *length = loadUboLength32(ind, fileIndex * 16);
+            Value *pred = new_LValue(func, FILE_PREDICATE);
+            if (i->src(0).isIndirect(0)) {
+               bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
+               bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
+            }
+            i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
+            i->setIndirect(0, 1, NULL);
+            i->setIndirect(0, 0, ptr);
+            bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
+            i->setPredicate(CC_NOT_P, pred);
+            if (i->defExists(0)) {
+               bld.mkMov(i->getDef(0), bld.mkImm(0));
+            }
+         } else if (fileIndex >= 0) {
+            if (i->src(0).isIndirect(0)) {
+               bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
+            }
+            i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
+            i->setIndirect(0, 1, NULL);
+            i->setIndirect(0, 0, ptr);
+         }
+      } else if (i->src(0).isIndirect(1)) {
          Value *ptr;
          if (i->src(0).isIndirect(0))
             ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index be81d29eb0a..aa192494d9f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -127,6 +127,9 @@ private:
    Value *loadBufInfo32(Value *ptr, uint32_t off);
    Value *loadBufInfo64(Value *ptr, uint32_t off);
    Value *loadBufLength32(Value *ptr, uint32_t off);
+   Value *loadUboInfo32(Value *ptr, uint32_t off);
+   Value *loadUboInfo64(Value *ptr, uint32_t off);
+   Value *loadUboLength32(Value *ptr, uint32_t off);
    Value *loadMsInfo32(Value *ptr, uint32_t off);
    Value *loadTexHandle(Value *ptr, unsigned int slot);
 

From 275019d7db033286e41eb4983ac50d3d3d335586 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 22 Feb 2016 23:20:30 +0100
Subject: [PATCH 232/238] nvc0/ir: fix wrong pred emission for ld lock on GK104

This fixes 84b9b8f (nvc0/ir: add missing emission of locked load
predicate).

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 8b9328b6296..d61109f0040 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1858,7 +1858,10 @@ CodeEmitterNVC0::emitLOAD(const Instruction *i)
    if (i->src(0).getFile() == FILE_MEMORY_SHARED) {
       if (i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
          assert(i->defExists(1));
-         defId(i->def(1), 32 + 18);
+         if (targ->getChipset() >= NVISA_GK104_CHIPSET)
+            defId(i->def(1), 8);
+         else
+            defId(i->def(1), 32 + 18);
       }
    }
 

From 543fb95473e404b7212eea3f00a23dd0d23758d5 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Wed, 10 Feb 2016 22:37:42 +0100
Subject: [PATCH 233/238] nvc0/ir: add atomics support on shared memory for
 Kepler

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp | 108 +++++++++++++++++-
 .../nouveau/codegen/nv50_ir_lowering_nvc0.h   |   1 +
 2 files changed, 108 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index da58ced4d7c..fb2aec62144 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1080,6 +1080,108 @@ NVC0LoweringPass::handleSUQ(Instruction *suq)
    return true;
 }
 
+void
+NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
+{
+   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
+
+   BasicBlock *currBB = atom->bb;
+   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
+   BasicBlock *joinBB = atom->bb->splitAfter(atom);
+   BasicBlock *setAndUnlockBB = new BasicBlock(func);
+   BasicBlock *failLockBB = new BasicBlock(func);
+
+   bld.setPosition(currBB, true);
+   assert(!currBB->joinAt);
+   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
+
+   CmpInstruction *pred =
+      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
+                TYPE_U32, bld.mkImm(0), bld.mkImm(1));
+
+   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
+   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);
+
+   bld.setPosition(tryLockBB, true);
+
+   Instruction *ld =
+      bld.mkLoad(TYPE_U32, atom->getDef(0),
+                 bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U32, 0), NULL);
+   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
+   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
+
+   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
+   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
+   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
+   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);
+
+   tryLockBB->cfg.detach(&joinBB->cfg);
+   bld.remove(atom);
+
+   bld.setPosition(setAndUnlockBB, true);
+   Value *stVal;
+   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
+      // Read the old value, and write the new one.
+      stVal = atom->getSrc(1);
+   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
+      CmpInstruction *set =
+         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
+                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
+
+      bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
+                TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
+   } else {
+      operation op;
+
+      switch (atom->subOp) {
+      case NV50_IR_SUBOP_ATOM_ADD:
+         op = OP_ADD;
+         break;
+      case NV50_IR_SUBOP_ATOM_AND:
+         op = OP_AND;
+         break;
+      case NV50_IR_SUBOP_ATOM_OR:
+         op = OP_OR;
+         break;
+      case NV50_IR_SUBOP_ATOM_XOR:
+         op = OP_XOR;
+         break;
+      case NV50_IR_SUBOP_ATOM_MIN:
+         op = OP_MIN;
+         break;
+      case NV50_IR_SUBOP_ATOM_MAX:
+         op = OP_MAX;
+         break;
+      default:
+         assert(0);
+         return;
+      }
+
+      stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
+                         atom->getSrc(1));
+   }
+
+   Instruction *st =
+      bld.mkStore(OP_STORE, TYPE_U32,
+                  bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U32, 0),
+                  NULL, stVal);
+   st->setDef(0, pred->getDef(0));
+   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
+
+   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
+   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
+
+   // Keep retrying the lock until the store has been performed.
+   bld.setPosition(failLockBB, true);
+   bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
+   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
+   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
+   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);
+
+   bld.setPosition(joinBB, false);
+   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
+}
+
 void
 NVC0LoweringPass::handleSharedATOM(Instruction *atom)
 {
@@ -1186,7 +1288,11 @@ NVC0LoweringPass::handleATOM(Instruction *atom)
       sv = SV_LBASE;
       break;
    case FILE_MEMORY_SHARED:
-      handleSharedATOM(atom);
+      if (targ->getChipset() >= NVISA_GK104_CHIPSET) {
+         handleSharedATOMNVE4(atom);
+      } else {
+         handleSharedATOM(atom);
+      }
       return true;
    default:
       assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index aa192494d9f..d5c2cb5e7e1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -106,6 +106,7 @@ protected:
    bool handleCasExch(Instruction *, bool needCctl);
    void handleSurfaceOpNVE4(TexInstruction *);
    void handleSharedATOM(Instruction *);
+   void handleSharedATOMNVE4(Instruction *);
    void handleLDST(Instruction *);
 
    void checkPredicate(Instruction *);

From 839a469166b9c0b9959620eda85a6481f9efa15f Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 7 Mar 2016 18:56:21 +0100
Subject: [PATCH 234/238] nvc0/ir: do not lower shared+atomics on GM107+

For Maxwell, the ATOMS instruction can be used to perform atomic
operations on shared memory instead of this load/store lowering pass.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 .../nouveau/codegen/nv50_ir_lowering_nvc0.cpp   | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index fb2aec62144..ce83618d681 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -1288,11 +1288,12 @@ NVC0LoweringPass::handleATOM(Instruction *atom)
       sv = SV_LBASE;
       break;
    case FILE_MEMORY_SHARED:
-      if (targ->getChipset() >= NVISA_GK104_CHIPSET) {
-         handleSharedATOMNVE4(atom);
-      } else {
+      // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
+      // operations on shared memory. For Maxwell, ATOMS is enough.
+      if (targ->getChipset() < NVISA_GK104_CHIPSET)
          handleSharedATOM(atom);
-      }
+      else if (targ->getChipset() < NVISA_GM107_CHIPSET)
+         handleSharedATOMNVE4(atom);
       return true;
    default:
       assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
@@ -1320,9 +1321,11 @@ NVC0LoweringPass::handleATOM(Instruction *atom)
 bool
 NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
 {
-   if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
-      // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
-      return false;
+   if (targ->getChipset() < NVISA_GM107_CHIPSET) {
+      if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
+         // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
+         return false;
+      }
    }
 
    if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&

From 71f327aa21d095f848c2162247476612eca1ed73 Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 8 Feb 2016 18:20:02 +0100
Subject: [PATCH 235/238] nvc0: bump the maximum number of UBOs for compute on
 Kepler

The maximum number of uniform blocks (MAX_COMPUTE_UNIFORM_BLOCKS)
per compute program must be at least 12.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 2 --
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.h | 1 -
 2 files changed, 3 deletions(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 8d7d4ef6fb8..ac7f57bac08 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -320,8 +320,6 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
       return 65536;
    case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
-      if (shader == PIPE_SHADER_COMPUTE && class_3d >= NVE4_3D_CLASS)
-         return NVE4_MAX_PIPE_CONSTBUFS_COMPUTE;
       return NVC0_MAX_PIPE_CONSTBUFS;
    case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
       return shader != PIPE_SHADER_FRAGMENT;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 46b692df2e3..0f782207f13 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -16,7 +16,6 @@
 
 /* doesn't count reserved slots (for auxiliary constants, immediates, etc.) */
 #define NVC0_MAX_PIPE_CONSTBUFS         14
-#define NVE4_MAX_PIPE_CONSTBUFS_COMPUTE  7
 
 #define NVC0_MAX_SURFACE_SLOTS 16
 

From 60e1c6a7fce82e48a36e3a5605222009c47f59bb Mon Sep 17 00:00:00 2001
From: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Date: Mon, 8 Feb 2016 18:19:47 +0100
Subject: [PATCH 236/238] nvc0: enable compute shaders on GK104 and GM107+

Compute support on GK110 is still unstable for unknown reasons, but
this can be fixed later, as compute stays disabled there unless the
NVF0_COMPUTE envvar is set.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset@gmail.com>
Reviewed-by: Ilia Mirkin <imirkin@alum.mit.edu>
---
 src/gallium/drivers/nouveau/nvc0/nvc0_screen.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index ac7f57bac08..590dac972a7 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -291,7 +291,8 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_PREFERRED_IR:
       return PIPE_SHADER_IR_TGSI;
    case PIPE_SHADER_CAP_SUPPORTED_IRS:
-      if (class_3d >= NVE4_3D_CLASS)
+      if (class_3d == NVF0_3D_CLASS &&
+          !debug_get_bool_option("NVF0_COMPUTE", false))
          return 0;
       return 1 << PIPE_SHADER_IR_TGSI;
    case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:

From de60e250f5095a9237727a3188eb0c092a4e6a05 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 13:58:17 -0700
Subject: [PATCH 237/238] nir: Add an opcode for stomping a 32-bit value to
 16-bit precision

This correlates directly to the SPIR-V opcode OpQuantizeToF16

Reviewed-by: Rob Clark <robdclark@gmail.com>
---
 src/compiler/nir/nir_opcodes.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 553f924afc5..bc9845036d3 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -180,6 +180,7 @@ unop("ffloor", tfloat, "bit_size == 64 ? floor(src0) : floorf(src0)")
 unop("ffract", tfloat, "src0 - (bit_size == 64 ? floor(src0) : floorf(src0))")
 unop("fround_even", tfloat, "bit_size == 64 ? _mesa_roundeven(src0) : _mesa_roundevenf(src0)")
 
+unop("fquantize2f16", tfloat, "(fabs(src0) < ldexpf(1.0, -14)) ? copysignf(0.0f, src0) : _mesa_half_to_float(_mesa_float_to_half(src0))")
 
 # Trigonometric operations.
 

From 14c46954c910efb1db94a068a866c7259deaa9d9 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Fri, 25 Mar 2016 13:57:15 -0700
Subject: [PATCH 238/238] i965: Add an implementation of nir_op_fquantize2f16

Reviewed-by: Matt Turner <mattst88@gmail.com>
---
 src/mesa/drivers/dri/i965/brw_fs_nir.cpp   | 28 ++++++++++++++++++++++
 src/mesa/drivers/dri/i965/brw_vec4_nir.cpp | 25 +++++++++++++++++++
 2 files changed, 53 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 4de559941ce..7839428c52e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -993,6 +993,34 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_fquantize2f16: {
+      fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
+      fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
+      fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
+
+      /* The destination stride must be at least as big as the source stride. */
+      tmp16.type = BRW_REGISTER_TYPE_W;
+      tmp16.stride = 2;
+
+      /* Check for denormal */
+      fs_reg abs_src0 = op[0];
+      abs_src0.abs = true;
+      bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
+              BRW_CONDITIONAL_L);
+      /* Get the appropriately signed zero */
+      bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
+              retype(op[0], BRW_REGISTER_TYPE_UD),
+              brw_imm_ud(0x80000000));
+      /* Do the actual F32 -> F16 -> F32 conversion */
+      bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
+      bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
+      /* Select that or zero based on normal status */
+      inst = bld.SEL(result, zero, tmp32);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->saturate = instr->dest.saturate;
+      break;
+   }
+
    case nir_op_fmin:
    case nir_op_imin:
    case nir_op_umin:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index eef3940b643..ee6929b16a2 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -1188,6 +1188,31 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_fquantize2f16: {
+      /* See also vec4_visitor::emit_pack_half_2x16() */
+      src_reg tmp16 = src_reg(this, glsl_type::uvec4_type);
+      src_reg tmp32 = src_reg(this, glsl_type::vec4_type);
+      src_reg zero = src_reg(this, glsl_type::vec4_type);
+
+      /* Check for denormal */
+      src_reg abs_src0 = op[0];
+      abs_src0.abs = true;
+      emit(CMP(dst_null_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
+               BRW_CONDITIONAL_L));
+      /* Get the appropriately signed zero */
+      emit(AND(retype(dst_reg(zero), BRW_REGISTER_TYPE_UD),
+               retype(op[0], BRW_REGISTER_TYPE_UD),
+               brw_imm_ud(0x80000000)));
+      /* Do the actual F32 -> F16 -> F32 conversion */
+      emit(F32TO16(dst_reg(tmp16), op[0]));
+      emit(F16TO32(dst_reg(tmp32), tmp16));
+      /* Select that or zero based on normal status */
+      inst = emit(BRW_OPCODE_SEL, dst, zero, tmp32);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->saturate = instr->dest.saturate;
+      break;
+   }
+
    case nir_op_fmin:
    case nir_op_imin:
    case nir_op_umin: