From c23e2a662a84f7f2704c99f393bf65fc4e93a5ef Mon Sep 17 00:00:00 2001
From: Felix DeGrood <felix.j.degrood@intel.com>
Date: Wed, 19 May 2021 11:51:20 -0700
Subject: [PATCH] intel/compiler: tileY friendly LID order for CS

Compute shaders that access tileY resources (textures) benefit
from Y-local accesses. The easiest way to implement this is to walk
local ids in Y-major fashion instead of X-major fashion. Y-major
local ids will reduce partial writes and increase cache locality
for tileY accesses, since the cachelines of tileY resources progress
in the Y direction.

Improves performance on TGL:
  Borderlands3.dxvk-g2  +1.5%

Y-major can introduce a performance drop on CS that use a mixture
of buffers and images. This should be fixed in the next commit.

Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/10733>
---
 .../compiler/brw_nir_lower_cs_intrinsics.c    | 26 ++++++++++++++-----
 1 file changed, 20 insertions(+), 6 deletions(-)

diff --git a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
index b77d6fe49bb..fd574603cbe 100644
--- a/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
+++ b/src/intel/compiler/brw_nir_lower_cs_intrinsics.c
@@ -111,14 +111,28 @@ lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
             nir_ssa_def *id_x, *id_y, *id_z;
             switch (state->nir->info.cs.derivative_group) {
             case DERIVATIVE_GROUP_NONE:
-               /* If not using derivatives, just set the local invocation
-                * index linearly, and calculate local invocation ID from that.
-                */
-               id_x = nir_umod(b, linear, size_x);
-               id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
+               if (nir->info.num_images == 0 &&
+                   nir->info.num_textures == 0) {
+                  /* X-major lid order. Optimal for linear accesses only,
+                   * which are usually buffers. X,Y ordering will look like:
+                   * (0,0) (1,0) (2,0) ... (size_x-1,0) (0,1) (1,1) ...
+                   */
+                  id_x = nir_umod(b, linear, size_x);
+                  id_y = nir_umod(b, nir_udiv(b, linear, size_x), size_y);
+                  local_index = linear;
+               } else {
+                  /* Y-major lid order. Optimal for tileY accesses only,
+                   * which are usually images. X,Y ordering will look like:
+                   * (0,0) (0,1) (0,2) ... (0,size_y-1) (1,0) (1,1) ...
+                   */
+                  id_y = nir_umod(b, linear, size_y);
+                  id_x = nir_umod(b, nir_udiv(b, linear, size_y), size_x);
+                  local_index = nir_iadd(b, nir_iadd(b, id_x,
+                                                        nir_imul(b, id_y, size_x)),
+                                                        nir_imul(b, id_z, size_xy));
+               }
                id_z = nir_udiv(b, linear, size_xy);
                local_id = nir_vec3(b, id_x, id_y, id_z);
-               local_index = linear;
                break;
             case DERIVATIVE_GROUP_LINEAR:
                /* For linear, just set the local invocation index linearly,