From 6642749458b6a2e0800bb5e606dcf15e4db0c479 Mon Sep 17 00:00:00 2001 From: Jason Ekstrand Date: Tue, 13 Jul 2021 18:21:43 -0500 Subject: [PATCH] intel/dev: Add a max_cs_workgroup_threads field This is distinct form max_cs_threads because it also encodes restrictions about the way we use GPGPU/COMPUTE_WALKER. This gets rid of the MIN2(64, devinfo->max_cs_threads) we have scattered all over the driver and puts it in a central place. Reviewed-by: Caio Marcelo de Oliveira Filho Part-of: --- src/gallium/drivers/crocus/crocus_screen.c | 3 +-- src/gallium/drivers/iris/iris_screen.c | 4 +--- src/intel/compiler/brw_fs.cpp | 5 ++--- src/intel/dev/intel_device_info.c | 19 +++++++++++++++++++ src/intel/dev/intel_device_info.h | 11 +++++++++++ src/intel/vulkan/anv_device.c | 6 ++---- src/mesa/drivers/dri/i965/brw_context.c | 8 +------- 7 files changed, 37 insertions(+), 19 deletions(-) diff --git a/src/gallium/drivers/crocus/crocus_screen.c b/src/gallium/drivers/crocus/crocus_screen.c index c1e0cf3a294..2647fb923ee 100644 --- a/src/gallium/drivers/crocus/crocus_screen.c +++ b/src/gallium/drivers/crocus/crocus_screen.c @@ -545,8 +545,7 @@ crocus_get_compute_param(struct pipe_screen *pscreen, struct crocus_screen *screen = (struct crocus_screen *)pscreen; const struct intel_device_info *devinfo = &screen->devinfo; - const unsigned max_threads = MIN2(64, devinfo->max_cs_threads); - const uint32_t max_invocations = 32 * max_threads; + const uint32_t max_invocations = 32 * devinfo->max_cs_workgroup_threads; if (devinfo->ver < 7) return 0; diff --git a/src/gallium/drivers/iris/iris_screen.c b/src/gallium/drivers/iris/iris_screen.c index f9df596dade..7b540fa0651 100644 --- a/src/gallium/drivers/iris/iris_screen.c +++ b/src/gallium/drivers/iris/iris_screen.c @@ -516,9 +516,7 @@ iris_get_compute_param(struct pipe_screen *pscreen, struct iris_screen *screen = (struct iris_screen *)pscreen; const struct intel_device_info *devinfo = &screen->devinfo; - /* Limit max_threads to 64 for the GPGPU_WALKER command. */ - const unsigned max_threads = MIN2(64, devinfo->max_cs_threads); - const uint32_t max_invocations = 32 * max_threads; + const uint32_t max_invocations = 32 * devinfo->max_cs_workgroup_threads; #define RET(x) do { \ if (ret) \ diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 7dbebfd3c83..6052d3760ef 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -10099,7 +10099,7 @@ brw_compile_cs(const struct brw_compiler *compiler, prog_data->local_size[2]; /* Limit max_threads to 64 for the GPGPU_WALKER command */ - const uint32_t max_threads = MIN2(64, compiler->devinfo->max_cs_threads); + const uint32_t max_threads = compiler->devinfo->max_cs_workgroup_threads; min_dispatch_width = util_next_power_of_two( MAX2(8, DIV_ROUND_UP(local_workgroup_size, max_threads))); assert(min_dispatch_width <= 32); @@ -10316,8 +10316,7 @@ brw_cs_simd_size_for_group_size(const struct intel_device_info *devinfo, if ((INTEL_DEBUG & DEBUG_DO32) && (mask & simd32)) return 32; - /* Limit max_threads to 64 for the GPGPU_WALKER command */ - const uint32_t max_threads = MIN2(64, devinfo->max_cs_threads); + const uint32_t max_threads = devinfo->max_cs_workgroup_threads; if ((mask & simd8) && group_size <= 8 * max_threads) { /* Prefer SIMD16 if can do without spilling. Matches logic in diff --git a/src/intel/dev/intel_device_info.c b/src/intel/dev/intel_device_info.c index a1d3b2c2b8e..4a7cea5a718 100644 --- a/src/intel/dev/intel_device_info.c +++ b/src/intel/dev/intel_device_info.c @@ -1228,6 +1228,21 @@ getparam(int fd, uint32_t param, int *value) return true; } +static void +update_cs_workgroup_threads(struct intel_device_info *devinfo) +{ + /* GPGPU_WALKER::ThreadWidthCounterMaximum is U6-1 so the most threads we + * can program is 64 without going up to a rectangular group. This only + * impacts Haswell and TGL which have higher thread counts. + * + * INTERFACE_DESCRIPTOR_DATA::NumberofThreadsinGPGPUThreadGroup on Xe-HP+ + * is 10 bits so we have no such restrictions. + */ + devinfo->max_cs_workgroup_threads = + devinfo->verx10 >= 125 ? devinfo->max_cs_threads : + MIN2(devinfo->max_cs_threads, 64); +} + bool intel_get_device_info_from_pci_id(int pci_id, struct intel_device_info *devinfo) @@ -1302,6 +1317,8 @@ intel_get_device_info_from_pci_id(int pci_id, if (devinfo->verx10 == 0) devinfo->verx10 = devinfo->ver * 10; + update_cs_workgroup_threads(devinfo); + devinfo->chipset_id = pci_id; return true; } @@ -1434,6 +1451,8 @@ fixup_chv_device_info(struct intel_device_info *devinfo) if (max_cs_threads > devinfo->max_cs_threads) devinfo->max_cs_threads = max_cs_threads; + update_cs_workgroup_threads(devinfo); + /* Braswell is even more annoying. Its marketing name isn't determinable * from the PCI ID and is also dependent on fusing. */ diff --git a/src/intel/dev/intel_device_info.h b/src/intel/dev/intel_device_info.h index 12a62b7690b..30330f8b5b1 100644 --- a/src/intel/dev/intel_device_info.h +++ b/src/intel/dev/intel_device_info.h @@ -214,6 +214,17 @@ struct intel_device_info */ unsigned max_cs_threads; + /** + * Maximum number of threads per workgroup supported by the GPGPU_WALKER or + * COMPUTE_WALKER command. + * + * This may be smaller than max_cs_threads as it takes into account added + * restrictions on the GPGPU/COMPUTE_WALKER commands. While max_cs_threads + * expresses the total parallelism of the GPU, this expresses the maximum + * number of threads we can dispatch in a single workgroup. + */ + unsigned max_cs_workgroup_threads; + struct { /** * Fixed size of the URB. diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c index deb9c89c56c..55541fe936c 100644 --- a/src/intel/vulkan/anv_device.c +++ b/src/intel/vulkan/anv_device.c @@ -1899,8 +1899,7 @@ void anv_GetPhysicalDeviceProperties( pdevice->has_bindless_images && pdevice->has_a64_buffer_access ? UINT32_MAX : MAX_BINDING_TABLE_SIZE - MAX_RTS - 1; - /* Limit max_threads to 64 for the GPGPU_WALKER command */ - const uint32_t max_workgroup_size = 32 * MIN2(64, devinfo->max_cs_threads); + const uint32_t max_workgroup_size = 32 * devinfo->max_cs_workgroup_threads; VkSampleCountFlags sample_counts = isl_device_get_sample_counts(&pdevice->isl_dev); @@ -2537,8 +2536,7 @@ void anv_GetPhysicalDeviceProperties2( STATIC_ASSERT(8 <= BRW_SUBGROUP_SIZE && BRW_SUBGROUP_SIZE <= 32); props->minSubgroupSize = 8; props->maxSubgroupSize = 32; - /* Limit max_threads to 64 for the GPGPU_WALKER command. */ - props->maxComputeWorkgroupSubgroups = MIN2(64, pdevice->info.max_cs_threads); + props->maxComputeWorkgroupSubgroups = pdevice->info.max_cs_workgroup_threads; props->requiredSubgroupSizeStages = VK_SHADER_STAGE_COMPUTE_BIT; break; } diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 86478c20eab..0d4c8317ea0 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -840,14 +840,8 @@ brw_initialize_cs_context_constants(struct brw_context *brw) /* Maximum number of scalar compute shader invocations that can be run in * parallel in the same subslice assuming SIMD32 dispatch. - * - * We don't advertise more than 64 threads, because we are limited to 64 by - * our usage of thread_width_max in the gpgpu walker command. This only - * currently impacts Haswell, which otherwise might be able to advertise 70 - * threads. With SIMD32 and 64 threads, Haswell still provides twice the - * required the number of invocation needed for ARB_compute_shader. */ - const unsigned max_threads = MIN2(64, devinfo->max_cs_threads); + const unsigned max_threads = devinfo->max_cs_workgroup_threads; const uint32_t max_invocations = 32 * max_threads; ctx->Const.MaxComputeWorkGroupSize[0] = max_invocations; ctx->Const.MaxComputeWorkGroupSize[1] = max_invocations;