From 6642749458b6a2e0800bb5e606dcf15e4db0c479 Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason@jlekstrand.net>
Date: Tue, 13 Jul 2021 18:21:43 -0500
Subject: [PATCH] intel/dev: Add a max_cs_workgroup_threads field

This is distinct form max_cs_threads because it also encodes
restrictions about the way we use GPGPU/COMPUTE_WALKER.  This gets rid
of the MIN2(64, devinfo->max_cs_threads) we have scattered all over the
driver and puts it in a central place.

Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/11861>
---
 src/gallium/drivers/crocus/crocus_screen.c |  3 +--
 src/gallium/drivers/iris/iris_screen.c     |  4 +---
 src/intel/compiler/brw_fs.cpp              |  5 ++---
 src/intel/dev/intel_device_info.c          | 19 +++++++++++++++++++
 src/intel/dev/intel_device_info.h          | 11 +++++++++++
 src/intel/vulkan/anv_device.c              |  6 ++----
 src/mesa/drivers/dri/i965/brw_context.c    |  8 +-------
 7 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/src/gallium/drivers/crocus/crocus_screen.c b/src/gallium/drivers/crocus/crocus_screen.c
index c1e0cf3a294..2647fb923ee 100644
--- a/src/gallium/drivers/crocus/crocus_screen.c
+++ b/src/gallium/drivers/crocus/crocus_screen.c
@@ -545,8 +545,7 @@ crocus_get_compute_param(struct pipe_screen *pscreen,
    struct crocus_screen *screen = (struct crocus_screen *)pscreen;
    const struct intel_device_info *devinfo = &screen->devinfo;
 
-   const unsigned max_threads = MIN2(64, devinfo->max_cs_threads);
-   const uint32_t max_invocations = 32 * max_threads;
+   const uint32_t max_invocations = 32 * devinfo->max_cs_workgroup_threads;
 
    if (devinfo->ver < 7)
       return 0;
diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c
index f9df596dade..7b540fa0651 100644
--- a/src/gallium/drivers/iris/iris_screen.c
+++ b/src/gallium/drivers/iris/iris_screen.c
@@ -516,9 +516,7 @@ iris_get_compute_param(struct pipe_screen *pscreen,
    struct iris_screen *screen = (struct iris_screen *)pscreen;
    const struct intel_device_info *devinfo = &screen->devinfo;
 
-   /* Limit max_threads to 64 for the GPGPU_WALKER command. */
-   const unsigned max_threads = MIN2(64, devinfo->max_cs_threads);
-   const uint32_t max_invocations = 32 * max_threads;
+   const uint32_t max_invocations = 32 * devinfo->max_cs_workgroup_threads;
 
 #define RET(x) do {                  \
    if (ret)                          \
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 7dbebfd3c83..6052d3760ef 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -10099,7 +10099,7 @@ brw_compile_cs(const struct brw_compiler *compiler,
                                       prog_data->local_size[2];
 
       /* Limit max_threads to 64 for the GPGPU_WALKER command */
-      const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads);
+      const uint32_t max_threads = compiler->devinfo->max_cs_workgroup_threads;
       min_dispatch_width = util_next_power_of_two(
          MAX2(8, DIV_ROUND_UP(local_workgroup_size, max_threads)));
       assert(min_dispatch_width <= 32);
@@ -10316,8 +10316,7 @@ brw_cs_simd_size_for_group_size(const struct intel_device_info *devinfo,
    if ((INTEL_DEBUG & DEBUG_DO32) && (mask & simd32))
       return 32;
 
-   /* Limit max_threads to 64 for the GPGPU_WALKER command */
-   const uint32_t max_threads = MIN2(64, devinfo->max_cs_threads);
+   const uint32_t max_threads = devinfo->max_cs_workgroup_threads;
 
    if ((mask & simd8) && group_size <= 8 * max_threads) {
       /* Prefer SIMD16 if can do without spilling.  Matches logic in
diff --git a/src/intel/dev/intel_device_info.c b/src/intel/dev/intel_device_info.c
index a1d3b2c2b8e..4a7cea5a718 100644
--- a/src/intel/dev/intel_device_info.c
+++ b/src/intel/dev/intel_device_info.c
@@ -1228,6 +1228,21 @@ getparam(int fd, uint32_t param, int *value)
    return true;
 }
 
+static void
+update_cs_workgroup_threads(struct intel_device_info *devinfo)
+{
+   /* GPGPU_WALKER::ThreadWidthCounterMaximum is U6-1 so the most threads we
+    * can program is 64 without going up to a rectangular group. This only
+    * impacts Haswell and TGL which have higher thread counts.
+    *
+    * INTERFACE_DESCRIPTOR_DATA::NumberofThreadsinGPGPUThreadGroup on Xe-HP+
+    * is 10 bits so we have no such restrictions.
+    */
+   devinfo->max_cs_workgroup_threads =
+      devinfo->verx10 >= 125 ? devinfo->max_cs_threads :
+                               MIN2(devinfo->max_cs_threads, 64);
+}
+
 bool
 intel_get_device_info_from_pci_id(int pci_id,
                                   struct intel_device_info *devinfo)
@@ -1302,6 +1317,8 @@ intel_get_device_info_from_pci_id(int pci_id,
    if (devinfo->verx10 == 0)
       devinfo->verx10 = devinfo->ver * 10;
 
+   update_cs_workgroup_threads(devinfo);
+
    devinfo->chipset_id = pci_id;
    return true;
 }
@@ -1434,6 +1451,8 @@ fixup_chv_device_info(struct intel_device_info *devinfo)
    if (max_cs_threads > devinfo->max_cs_threads)
       devinfo->max_cs_threads = max_cs_threads;
 
+   update_cs_workgroup_threads(devinfo);
+
    /* Braswell is even more annoying.  Its marketing name isn't determinable
     * from the PCI ID and is also dependent on fusing.
     */
diff --git a/src/intel/dev/intel_device_info.h b/src/intel/dev/intel_device_info.h
index 12a62b7690b..30330f8b5b1 100644
--- a/src/intel/dev/intel_device_info.h
+++ b/src/intel/dev/intel_device_info.h
@@ -214,6 +214,17 @@ struct intel_device_info
     */
    unsigned max_cs_threads;
 
+   /**
+    * Maximum number of threads per workgroup supported by the GPGPU_WALKER or
+    * COMPUTE_WALKER command.
+    *
+    * This may be smaller than max_cs_threads as it takes into account added
+    * restrictions on the GPGPU/COMPUTE_WALKER commands.  While max_cs_threads
+    * expresses the total parallelism of the GPU, this expresses the maximum
+    * number of threads we can dispatch in a single workgroup.
+    */
+   unsigned max_cs_workgroup_threads;
+
    struct {
       /**
        * Fixed size of the URB.
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index deb9c89c56c..55541fe936c 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -1899,8 +1899,7 @@ void anv_GetPhysicalDeviceProperties(
       pdevice->has_bindless_images && pdevice->has_a64_buffer_access
       ? UINT32_MAX : MAX_BINDING_TABLE_SIZE - MAX_RTS - 1;
 
-   /* Limit max_threads to 64 for the GPGPU_WALKER command */
-   const uint32_t max_workgroup_size = 32 * MIN2(64, devinfo->max_cs_threads);
+   const uint32_t max_workgroup_size = 32 * devinfo->max_cs_workgroup_threads;
 
    VkSampleCountFlags sample_counts =
       isl_device_get_sample_counts(&pdevice->isl_dev);
@@ -2537,8 +2536,7 @@ void anv_GetPhysicalDeviceProperties2(
          STATIC_ASSERT(8 <= BRW_SUBGROUP_SIZE && BRW_SUBGROUP_SIZE <= 32);
          props->minSubgroupSize = 8;
          props->maxSubgroupSize = 32;
-         /* Limit max_threads to 64 for the GPGPU_WALKER command. */
-         props->maxComputeWorkgroupSubgroups = MIN2(64, pdevice->info.max_cs_threads);
+         props->maxComputeWorkgroupSubgroups = pdevice->info.max_cs_workgroup_threads;
          props->requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT;
          break;
       }
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 86478c20eab..0d4c8317ea0 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -840,14 +840,8 @@ brw_initialize_cs_context_constants(struct brw_context *brw)
 
    /* Maximum number of scalar compute shader invocations that can be run in
     * parallel in the same subslice assuming SIMD32 dispatch.
-    *
-    * We don't advertise more than 64 threads, because we are limited to 64 by
-    * our usage of thread_width_max in the gpgpu walker command. This only
-    * currently impacts Haswell, which otherwise might be able to advertise 70
-    * threads. With SIMD32 and 64 threads, Haswell still provides twice the
-    * required the number of invocation needed for ARB_compute_shader.
     */
-   const unsigned max_threads = MIN2(64, devinfo->max_cs_threads);
+   const unsigned max_threads = devinfo->max_cs_workgroup_threads;
    const uint32_t max_invocations = 32 * max_threads;
    ctx->Const.MaxComputeWorkGroupSize[0] = max_invocations;
    ctx->Const.MaxComputeWorkGroupSize[1] = max_invocations;