diff --git a/src/gallium/drivers/crocus/crocus_program.c b/src/gallium/drivers/crocus/crocus_program.c
index 8204f94b9dc..ef1a95aa6da 100644
--- a/src/gallium/drivers/crocus/crocus_program.c
+++ b/src/gallium/drivers/crocus/crocus_program.c
@@ -49,7 +49,6 @@
 #include "nir/tgsi_to_nir.h"

 #define KEY_INIT_NO_ID() \
-   .base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM, \
    .base.tex.swizzles[0 ... BRW_MAX_SAMPLERS - 1] = 0x688, \
    .base.tex.compressed_multisample_layout_mask = ~0
 #define KEY_INIT() \
diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c
index 02a748afa63..1e9cbc4f755 100644
--- a/src/gallium/drivers/iris/iris_program.c
+++ b/src/gallium/drivers/iris/iris_program.c
@@ -54,7 +54,6 @@
 #define BRW_KEY_INIT(gen, prog_id, limit_trig_input) \
    .base.program_string_id = prog_id, \
    .base.limit_trig_input_range = limit_trig_input, \
-   .base.subgroup_size_type = BRW_SUBGROUP_SIZE_UNIFORM, \
    .base.tex.swizzles[0 ... BRW_MAX_SAMPLERS - 1] = 0x688, \
    .base.tex.compressed_multisample_layout_mask = ~0, \
    .base.tex.msaa_16 = (gen >= 9 ? ~0 : 0)
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
index 8fc90815df7..fec0a0a5bdc 100644
--- a/src/intel/compiler/brw_compiler.h
+++ b/src/intel/compiler/brw_compiler.h
@@ -230,26 +230,9 @@ struct brw_sampler_prog_key_data {
    float scale_factors[BRW_MAX_SAMPLERS];
 };

-/** An enum representing what kind of input gl_SubgroupSize is. */
-enum PACKED brw_subgroup_size_type
-{
-   BRW_SUBGROUP_SIZE_API_CONSTANT,  /**< Default Vulkan behavior */
-   BRW_SUBGROUP_SIZE_UNIFORM,       /**< OpenGL behavior */
-   BRW_SUBGROUP_SIZE_VARYING,       /**< VK_EXT_subgroup_size_control */
-
-   /* These enums are specifically chosen so that the value of the enum is
-    * also the subgroup size.  If any new values are added, they must respect
-    * this invariant.
-    */
-   BRW_SUBGROUP_SIZE_REQUIRE_8  = 8,  /**< VK_EXT_subgroup_size_control */
-   BRW_SUBGROUP_SIZE_REQUIRE_16 = 16, /**< VK_EXT_subgroup_size_control */
-   BRW_SUBGROUP_SIZE_REQUIRE_32 = 32, /**< VK_EXT_subgroup_size_control */
-};
-
 struct brw_base_prog_key {
    unsigned program_string_id;

-   enum brw_subgroup_size_type subgroup_size_type;
    bool robust_buffer_access;

    /**
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index d139b1caa62..27e3d5ab9fb 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -7759,7 +7759,7 @@ brw_compile_cs(const struct brw_compiler *compiler,
    }

    const unsigned required_dispatch_width =
-      brw_required_dispatch_width(&nir->info, key->base.subgroup_size_type);
+      brw_required_dispatch_width(&nir->info);

    fs_visitor *v[3]     = {0};
    const char *error[3] = {0};
diff --git a/src/intel/compiler/brw_kernel.c b/src/intel/compiler/brw_kernel.c
index f3d112e816b..29ae5875d32 100644
--- a/src/intel/compiler/brw_kernel.c
+++ b/src/intel/compiler/brw_kernel.c
@@ -439,9 +439,7 @@ brw_kernel_from_spirv(struct brw_compiler *compiler,
    NIR_PASS_V(nir, brw_nir_lower_cs_intrinsics);
    NIR_PASS_V(nir, lower_kernel_intrinsics);

-   struct brw_cs_prog_key key = {
-      .base.subgroup_size_type = BRW_SUBGROUP_SIZE_VARYING,
-   };
+   struct brw_cs_prog_key key = { };

    memset(&kernel->prog_data, 0, sizeof(kernel->prog_data));
    kernel->prog_data.base.nr_params = DIV_ROUND_UP(nir->num_uniforms, 4);
diff --git a/src/intel/compiler/brw_mesh.cpp b/src/intel/compiler/brw_mesh.cpp
index fb78ca1c1ec..d9828923c9e 100644
--- a/src/intel/compiler/brw_mesh.cpp
+++ b/src/intel/compiler/brw_mesh.cpp
@@ -217,7 +217,7 @@ brw_compile_task(const struct brw_compiler *compiler,
    NIR_PASS_V(nir, brw_nir_lower_tue_outputs, &prog_data->map);

    const unsigned required_dispatch_width =
-      brw_required_dispatch_width(&nir->info, key->base.subgroup_size_type);
+      brw_required_dispatch_width(&nir->info);

    fs_visitor *v[3]     = {0};
    const char *error[3] = {0};
@@ -715,7 +715,7 @@ brw_compile_mesh(const struct brw_compiler *compiler,
    NIR_PASS_V(nir, brw_nir_lower_mue_outputs, &prog_data->map);

    const unsigned required_dispatch_width =
-      brw_required_dispatch_width(&nir->info, key->base.subgroup_size_type);
+      brw_required_dispatch_width(&nir->info);

    fs_visitor *v[3]     = {0};
    const char *error[3] = {0};
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
index 9ceb627cbb8..e7b053bb908 100644
--- a/src/intel/compiler/brw_nir.c
+++ b/src/intel/compiler/brw_nir.c
@@ -1353,16 +1353,14 @@ brw_nir_apply_sampler_key(nir_shader *nir,
 }

 static unsigned
-get_subgroup_size(gl_shader_stage stage,
-                  const struct brw_base_prog_key *key,
-                  unsigned max_subgroup_size)
+get_subgroup_size(const struct shader_info *info, unsigned max_subgroup_size)
 {
-   switch (key->subgroup_size_type) {
-   case BRW_SUBGROUP_SIZE_API_CONSTANT:
+   switch (info->subgroup_size) {
+   case SUBGROUP_SIZE_API_CONSTANT:
       /* We have to use the global constant size. */
       return BRW_SUBGROUP_SIZE;

-   case BRW_SUBGROUP_SIZE_UNIFORM:
+   case SUBGROUP_SIZE_UNIFORM:
       /* It has to be uniform across all invocations but can vary per stage
        * if we want.  This gives us a bit more freedom.
        *
@@ -1373,7 +1371,7 @@ get_subgroup_size(gl_shader_stage stage,
        */
       return max_subgroup_size;

-   case BRW_SUBGROUP_SIZE_VARYING:
+   case SUBGROUP_SIZE_VARYING:
      /* The subgroup size is allowed to be fully varying.  For geometry
       * stages, we know it's always 8 which is max_subgroup_size so we can
       * return that.  For compute, brw_nir_apply_key is called once per
@@ -1384,16 +1382,21 @@ get_subgroup_size(gl_shader_stage stage,
       * that's a risk the client took when it asked for a varying subgroup
       * size.
       */
-      return stage == MESA_SHADER_FRAGMENT ? 0 : max_subgroup_size;
+      return info->stage == MESA_SHADER_FRAGMENT ? 0 : max_subgroup_size;

-   case BRW_SUBGROUP_SIZE_REQUIRE_8:
-   case BRW_SUBGROUP_SIZE_REQUIRE_16:
-   case BRW_SUBGROUP_SIZE_REQUIRE_32:
-      assert(gl_shader_stage_uses_workgroup(stage));
+   case SUBGROUP_SIZE_REQUIRE_8:
+   case SUBGROUP_SIZE_REQUIRE_16:
+   case SUBGROUP_SIZE_REQUIRE_32:
+      assert(gl_shader_stage_uses_workgroup(info->stage));
       /* These enum values are expressly chosen to be equal to the subgroup
        * size that they require.
        */
-      return key->subgroup_size_type;
+      return info->subgroup_size;
+
+   case SUBGROUP_SIZE_FULL_SUBGROUPS:
+   case SUBGROUP_SIZE_REQUIRE_64:
+   case SUBGROUP_SIZE_REQUIRE_128:
+      break;
    }

    unreachable("Invalid subgroup size type");
@@ -1411,8 +1414,7 @@ brw_nir_apply_key(nir_shader *nir,
    OPT(brw_nir_apply_sampler_key, compiler, &key->tex);

    const nir_lower_subgroups_options subgroups_options = {
-      .subgroup_size = get_subgroup_size(nir->info.stage, key,
-                                         max_subgroup_size),
+      .subgroup_size = get_subgroup_size(&nir->info, max_subgroup_size),
       .ballot_bit_size = 32,
       .ballot_components = 1,
       .lower_subgroup_masks = true,
diff --git a/src/intel/compiler/brw_private.h b/src/intel/compiler/brw_private.h
index c4334ce3ff7..0cddef9e051 100644
--- a/src/intel/compiler/brw_private.h
+++ b/src/intel/compiler/brw_private.h
@@ -31,8 +31,7 @@
 extern "C" {
 #endif

-unsigned brw_required_dispatch_width(const struct shader_info *info,
-                                     enum brw_subgroup_size_type subgroup_size_type);
+unsigned brw_required_dispatch_width(const struct shader_info *info);

 bool brw_simd_should_compile(void *mem_ctx,
                              unsigned simd,
diff --git a/src/intel/compiler/brw_simd_selection.c b/src/intel/compiler/brw_simd_selection.c
index 2ab9bfeed89..48055eb9b42 100644
--- a/src/intel/compiler/brw_simd_selection.c
+++ b/src/intel/compiler/brw_simd_selection.c
@@ -28,26 +28,17 @@
 #include "util/ralloc.h"

 unsigned
-brw_required_dispatch_width(const struct shader_info *info,
-                            enum brw_subgroup_size_type subgroup_size_type)
+brw_required_dispatch_width(const struct shader_info *info)
 {
-   unsigned required = 0;
-
-   if ((int)subgroup_size_type >= (int)BRW_SUBGROUP_SIZE_REQUIRE_8) {
+   if ((int)info->subgroup_size >= (int)SUBGROUP_SIZE_REQUIRE_8) {
       assert(gl_shader_stage_uses_workgroup(info->stage));
       /* These enum values are expressly chosen to be equal to the subgroup
        * size that they require.
        */
-      required = (unsigned)subgroup_size_type;
+      return (unsigned)info->subgroup_size;
+   } else {
+      return 0;
    }
-
-   if (gl_shader_stage_is_compute(info->stage) &&
-       info->subgroup_size >= SUBGROUP_SIZE_REQUIRE_8) {
-      assert(required == 0 || required == info->subgroup_size);
-      required = info->subgroup_size;
-   }
-
-   return required;
 }

 static inline bool
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 758afb54057..0feecbd73bf 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -315,11 +315,9 @@ populate_sampler_prog_key(const struct intel_device_info *devinfo,

 static void
 populate_base_prog_key(const struct anv_device *device,
-                       enum brw_subgroup_size_type subgroup_size_type,
                        bool robust_buffer_acccess,
                        struct brw_base_prog_key *key)
 {
-   key->subgroup_size_type = subgroup_size_type;
    key->robust_buffer_access = robust_buffer_acccess;
    key->limit_trig_input_range =
       device->physical->instance->limit_trig_input_range;
@@ -329,14 +327,12 @@ populate_base_prog_key(const struct anv_device *device,

 static void
 populate_vs_prog_key(const struct anv_device *device,
-                     enum brw_subgroup_size_type subgroup_size_type,
                      bool robust_buffer_acccess,
                      struct brw_vs_prog_key *key)
 {
    memset(key, 0, sizeof(*key));

-   populate_base_prog_key(device, subgroup_size_type,
-                          robust_buffer_acccess, &key->base);
+   populate_base_prog_key(device, robust_buffer_acccess, &key->base);

    /* XXX: Handle vertex input work-arounds */
@@ -345,41 +341,35 @@ populate_vs_prog_key(const struct anv_device *device,

 static void
 populate_tcs_prog_key(const struct anv_device *device,
-                      enum brw_subgroup_size_type subgroup_size_type,
                       bool robust_buffer_acccess,
                       unsigned input_vertices,
                       struct brw_tcs_prog_key *key)
 {
    memset(key, 0, sizeof(*key));

-   populate_base_prog_key(device, subgroup_size_type,
-                          robust_buffer_acccess, &key->base);
+   populate_base_prog_key(device, robust_buffer_acccess, &key->base);

    key->input_vertices = input_vertices;
 }

 static void
 populate_tes_prog_key(const struct anv_device *device,
-                      enum brw_subgroup_size_type subgroup_size_type,
                       bool robust_buffer_acccess,
                       struct brw_tes_prog_key *key)
 {
    memset(key, 0, sizeof(*key));

-   populate_base_prog_key(device, subgroup_size_type,
-                          robust_buffer_acccess, &key->base);
+   populate_base_prog_key(device, robust_buffer_acccess, &key->base);
 }

 static void
 populate_gs_prog_key(const struct anv_device *device,
-                     enum brw_subgroup_size_type subgroup_size_type,
                      bool robust_buffer_acccess,
                      struct brw_gs_prog_key *key)
 {
    memset(key, 0, sizeof(*key));

-   populate_base_prog_key(device, subgroup_size_type,
-                          robust_buffer_acccess, &key->base);
+   populate_base_prog_key(device, robust_buffer_acccess, &key->base);
 }

 static bool
@@ -439,29 +429,26 @@ pipeline_has_coarse_pixel(const struct anv_graphics_pipeline *pipeline,

 static void
 populate_task_prog_key(const struct anv_device *device,
-                       enum brw_subgroup_size_type subgroup_size_type,
                        bool robust_buffer_access,
                        struct brw_task_prog_key *key)
 {
    memset(key, 0, sizeof(*key));

-   populate_base_prog_key(device, subgroup_size_type, robust_buffer_access, &key->base);
+   populate_base_prog_key(device, robust_buffer_access, &key->base);
 }

 static void
 populate_mesh_prog_key(const struct anv_device *device,
-                       enum brw_subgroup_size_type subgroup_size_type,
                        bool robust_buffer_access,
                        struct brw_mesh_prog_key *key)
 {
    memset(key, 0, sizeof(*key));

-   populate_base_prog_key(device, subgroup_size_type, robust_buffer_access, &key->base);
+   populate_base_prog_key(device, robust_buffer_access, &key->base);
 }

 static void
 populate_wm_prog_key(const struct anv_graphics_pipeline *pipeline,
-                     VkPipelineShaderStageCreateFlags flags,
                      bool robust_buffer_acccess,
                      const VkPipelineMultisampleStateCreateInfo *ms_info,
                      const VkPipelineFragmentShadingRateStateCreateInfoKHR *fsr_info,
@@ -472,7 +459,7 @@ populate_wm_prog_key(const struct anv_graphics_pipeline *pipeline,

    memset(key, 0, sizeof(*key));

-   populate_base_prog_key(device, flags, robust_buffer_acccess, &key->base);
+   populate_base_prog_key(device, robust_buffer_acccess, &key->base);

    /* We set this to 0 here and set to the actual value before we call
     * brw_compile_fs.
@@ -520,25 +507,22 @@

 static void
 populate_cs_prog_key(const struct anv_device *device,
-                     enum brw_subgroup_size_type subgroup_size_type,
                      bool robust_buffer_acccess,
                      struct brw_cs_prog_key *key)
 {
    memset(key, 0, sizeof(*key));

-   populate_base_prog_key(device, subgroup_size_type,
-                          robust_buffer_acccess, &key->base);
+   populate_base_prog_key(device, robust_buffer_acccess, &key->base);
 }

 static void
 populate_bs_prog_key(const struct anv_device *device,
-                     VkPipelineShaderStageCreateFlags flags,
                      bool robust_buffer_access,
                      struct brw_bs_prog_key *key)
 {
    memset(key, 0, sizeof(*key));

-   populate_base_prog_key(device, flags, robust_buffer_access, &key->base);
+   populate_base_prog_key(device, robust_buffer_access, &key->base);
 }

 struct anv_pipeline_stage {
@@ -1323,45 +1307,6 @@ anv_pipeline_add_executables(struct anv_pipeline *pipeline,
    pipeline->ray_queries = MAX2(pipeline->ray_queries, bin->prog_data->ray_queries);
 }

-static enum brw_subgroup_size_type
-anv_subgroup_size_type(gl_shader_stage stage,
-                       const struct vk_shader_module *module,
-                       VkPipelineShaderStageCreateFlags flags,
-                       const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *rss_info)
-{
-   enum brw_subgroup_size_type subgroup_size_type;
-
-   const bool allow_varying =
-      flags & VK_PIPELINE_SHADER_STAGE_CREATE_ALLOW_VARYING_SUBGROUP_SIZE_BIT_EXT ||
-      vk_shader_module_spirv_version(module) >= 0x10600;
-
-   if (rss_info) {
-      assert(gl_shader_stage_uses_workgroup(stage));
-      /* These enum values are expressly chosen to be equal to the subgroup
-       * size that they require.
-       */
-      assert(rss_info->requiredSubgroupSize == 8 ||
-             rss_info->requiredSubgroupSize == 16 ||
-             rss_info->requiredSubgroupSize == 32);
-      subgroup_size_type = rss_info->requiredSubgroupSize;
-   } else if (allow_varying) {
-      subgroup_size_type = BRW_SUBGROUP_SIZE_VARYING;
-   } else if (flags & VK_PIPELINE_SHADER_STAGE_CREATE_REQUIRE_FULL_SUBGROUPS_BIT_EXT) {
-      assert(stage == MESA_SHADER_COMPUTE);
-      /* If the client expressly requests full subgroups and they don't
-       * specify a subgroup size neither allow varying subgroups, we need to
-       * pick one.  So we specify the API value of 32.  Performance will
-       * likely be terrible in this case but there's nothing we can do about
-       * that.  The client should have chosen a size.
-       */
-      subgroup_size_type = BRW_SUBGROUP_SIZE_REQUIRE_32;
-   } else {
-      subgroup_size_type = BRW_SUBGROUP_SIZE_API_CONSTANT;
-   }
-
-   return subgroup_size_type;
-}
-
 static void
 anv_pipeline_init_from_cached_graphics(struct anv_graphics_pipeline *pipeline)
 {
@@ -1404,7 +1349,6 @@ anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline,
    VkResult result;
    for (uint32_t i = 0; i < info->stageCount; i++) {
       const VkPipelineShaderStageCreateInfo *sinfo = &info->pStages[i];
-      VK_FROM_HANDLE(vk_shader_module, module, sinfo->module);
       gl_shader_stage stage = vk_to_mesa_shader_stage(sinfo->stage);

       int64_t stage_start = os_time_get_nano();
@@ -1413,33 +1357,26 @@ anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline,
       stages[stage].info = sinfo;

       vk_pipeline_hash_shader_stage(&info->pStages[i], stages[stage].shader_sha1);

-      const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT *rss_info =
-         vk_find_struct_const(sinfo->pNext,
-                              PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT);
-
-      enum brw_subgroup_size_type subgroup_size_type =
-         anv_subgroup_size_type(stage, module, sinfo->flags, rss_info);
-
       const struct anv_device *device = pipeline->base.device;
       switch (stage) {
       case MESA_SHADER_VERTEX:
-         populate_vs_prog_key(device, subgroup_size_type,
+         populate_vs_prog_key(device,
                               pipeline->base.device->robust_buffer_access,
                               &stages[stage].key.vs);
          break;
       case MESA_SHADER_TESS_CTRL:
-         populate_tcs_prog_key(device, subgroup_size_type,
+         populate_tcs_prog_key(device,
                                pipeline->base.device->robust_buffer_access,
                                info->pTessellationState->patchControlPoints,
                                &stages[stage].key.tcs);
          break;
       case MESA_SHADER_TESS_EVAL:
-         populate_tes_prog_key(device, subgroup_size_type,
+         populate_tes_prog_key(device,
                                pipeline->base.device->robust_buffer_access,
                                &stages[stage].key.tes);
          break;
       case MESA_SHADER_GEOMETRY:
-         populate_gs_prog_key(device, subgroup_size_type,
+         populate_gs_prog_key(device,
                               pipeline->base.device->robust_buffer_access,
                               &stages[stage].key.gs);
          break;
@@ -1447,7 +1384,7 @@ anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline,
          const bool raster_enabled =
            !info->pRasterizationState->rasterizerDiscardEnable ||
            dynamic_states & ANV_CMD_DIRTY_DYNAMIC_RASTERIZER_DISCARD_ENABLE;
-         populate_wm_prog_key(pipeline, subgroup_size_type,
+         populate_wm_prog_key(pipeline,
                               pipeline->base.device->robust_buffer_access,
                               raster_enabled ? info->pMultisampleState : NULL,
                               vk_find_struct_const(info->pNext,
@@ -1457,12 +1394,12 @@ anv_pipeline_compile_graphics(struct anv_graphics_pipeline *pipeline,
          break;
       }
       case MESA_SHADER_TASK:
-         populate_task_prog_key(device, subgroup_size_type,
+         populate_task_prog_key(device,
                                 pipeline->base.device->robust_buffer_access,
                                 &stages[stage].key.task);
          break;
       case MESA_SHADER_MESH:
-         populate_mesh_prog_key(device, subgroup_size_type,
+         populate_mesh_prog_key(device,
                                 pipeline->base.device->robust_buffer_access,
                                 &stages[stage].key.mesh);
          break;
@@ -1849,7 +1786,6 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
                         const VkComputePipelineCreateInfo *info)
 {
    const VkPipelineShaderStageCreateInfo *sinfo = &info->stage;
-   VK_FROM_HANDLE(vk_shader_module, module, sinfo->module);
    assert(sinfo->stage == VK_SHADER_STAGE_COMPUTE_BIT);

    VkPipelineCreationFeedbackEXT pipeline_feedback = {
@@ -1874,16 +1810,7 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,

    struct anv_shader_bin *bin = NULL;

-   const VkPipelineShaderStageRequiredSubgroupSizeCreateInfo *rss_info =
-      vk_find_struct_const(info->stage.pNext,
-                           PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO);
-
-   const enum brw_subgroup_size_type subgroup_size_type =
-      anv_subgroup_size_type(MESA_SHADER_COMPUTE, module, info->stage.flags, rss_info);
-
-   populate_cs_prog_key(device, subgroup_size_type,
-                        device->robust_buffer_access,
-                        &stage.key.cs);
+   populate_cs_prog_key(device, device->robust_buffer_access, &stage.key.cs);

    ANV_FROM_HANDLE(anv_pipeline_layout, layout, info->layout);
@@ -1939,10 +1866,19 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
        */
       if (device->physical->instance->assume_full_subgroups &&
           stage.nir->info.cs.uses_wide_subgroup_intrinsics &&
-          subgroup_size_type == BRW_SUBGROUP_SIZE_API_CONSTANT &&
+          stage.nir->info.subgroup_size == SUBGROUP_SIZE_API_CONSTANT &&
           local_size &&
           local_size % BRW_SUBGROUP_SIZE == 0)
-         stage.key.base.subgroup_size_type = BRW_SUBGROUP_SIZE_REQUIRE_32;
+         stage.nir->info.subgroup_size = SUBGROUP_SIZE_FULL_SUBGROUPS;
+
+      /* If the client requests that we dispatch full subgroups but doesn't
+       * allow us to pick a subgroup size, we have to smash it to the API
+       * value of 32.  Performance will likely be terrible in this case but
+       * there's nothing we can do about that.  The client should have chosen
+       * a size.
+       */
+      if (stage.nir->info.subgroup_size == SUBGROUP_SIZE_FULL_SUBGROUPS)
+         stage.nir->info.subgroup_size = BRW_SUBGROUP_SIZE;

       stage.num_stats = 1;
@@ -2693,7 +2629,7 @@ anv_pipeline_init_ray_tracing_stages(struct anv_ray_tracing_pipeline *pipeline,
          },
       };

-      populate_bs_prog_key(pipeline->base.device, sinfo->flags,
+      populate_bs_prog_key(pipeline->base.device,
                            pipeline->base.device->robust_buffer_access,
                            &stages[i].key.bs);
@@ -2997,10 +2933,6 @@ anv_device_init_rt_shaders(struct anv_device *device)
       struct brw_cs_prog_key key;
    } trampoline_key = {
       .name = "rt-trampoline",
-      .key = {
-         /* TODO: Other subgroup sizes? */
-         .base.subgroup_size_type = BRW_SUBGROUP_SIZE_REQUIRE_8,
-      },
    };

    device->rt_trampoline =
       anv_device_search_for_kernel(device, device->internal_cache,
@@ -3012,6 +2944,8 @@ anv_device_init_rt_shaders(struct anv_device *device)
       nir_shader *trampoline_nir =
          brw_nir_create_raygen_trampoline(device->physical->compiler, tmp_ctx);

+      trampoline_nir->info.subgroup_size = SUBGROUP_SIZE_REQUIRE_8;
+
       struct anv_pipeline_bind_map bind_map = {
          .surface_count = 0,
          .sampler_count = 0,