diff --git a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
index 02e8b80353e..e2fefef1736 100644
--- a/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
+++ b/src/intel/vulkan/anv_nir_apply_pipeline_layout.c
@@ -1895,6 +1895,45 @@ lower_ray_query_globals(nir_builder *b, nir_intrinsic_instr *intrin,
    return true;
 }
 
+static bool
+lower_num_workgroups(nir_builder *b, nir_intrinsic_instr *intrin,
+                     struct apply_pipeline_layout_state *state)
+{
+   /* For those stages, HW will generate values through payload registers. */
+   if (gl_shader_stage_is_mesh(b->shader->info.stage))
+      return false;
+
+   b->cursor = nir_instr_remove(&intrin->instr);
+   nir_def *num_workgroups;
+   /* On Gfx12.5+ we use the inline register to push the values, on prior
+    * generations we use push constants.
+    */
+   if (state->pdevice->info.verx10 >= 125) {
+      num_workgroups =
+         nir_load_inline_data_intel(
+            b, 3, 32,
+            .base = ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET);
+   } else {
+      num_workgroups =
+         anv_load_driver_uniform(b, 3, cs.num_work_groups[0]);
+   }
+
+   nir_def *num_workgroups_indirect;
+   nir_push_if(b, nir_ieq_imm(b, nir_channel(b, num_workgroups, 0), UINT32_MAX));
+   {
+      nir_def *addr = nir_pack_64_2x32_split(b,
+                                             nir_channel(b, num_workgroups, 1),
+                                             nir_channel(b, num_workgroups, 2));
+      num_workgroups_indirect = nir_load_global_constant(b, addr, 4, 3, 32);
+   }
+   nir_pop_if(b, NULL);
+
+   num_workgroups = nir_if_phi(b, num_workgroups_indirect, num_workgroups);
+   nir_def_rewrite_uses(&intrin->def, num_workgroups);
+
+   return true;
+}
+
 static bool
 apply_pipeline_layout(nir_builder *b, nir_instr *instr, void *_state)
 {
@@ -1930,6 +1969,8 @@ apply_pipeline_layout(nir_builder *b, nir_instr *instr, void *_state)
       return lower_base_workgroup_id(b, intrin, state);
    case nir_intrinsic_load_ray_query_global_intel:
       return lower_ray_query_globals(b, intrin, state);
+   case nir_intrinsic_load_num_workgroups:
+      return lower_num_workgroups(b, intrin, state);
    default:
       return false;
    }
@@ -2434,7 +2475,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader,
    nir_opt_dce(shader);
 
    nir_shader_instructions_pass(shader, apply_pipeline_layout,
-                                nir_metadata_control_flow,
+                                nir_metadata_none,
                                 &state);
 
    ralloc_free(mem_ctx);
diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c
index 73bb0cd4735..79d60b5ce1f 100644
--- a/src/intel/vulkan/anv_nir_compute_push_layout.c
+++ b/src/intel/vulkan/anv_nir_compute_push_layout.c
@@ -62,6 +62,16 @@ anv_nir_compute_push_layout(nir_shader *nir,
             unsigned range = nir_intrinsic_range(intrin);
             push_start = MIN2(push_start, base);
             push_end = MAX2(push_end, base + range);
+            /* We need to retain this information to update the push
+             * constant on vkCmdDispatch*().
+             */
+            if (nir->info.stage == MESA_SHADER_COMPUTE &&
+                base >= anv_drv_const_offset(cs.num_work_groups[0]) &&
+                base < (anv_drv_const_offset(cs.num_work_groups[2]) + 4)) {
+               struct brw_cs_prog_data *cs_prog_data =
+                  container_of(prog_data, struct brw_cs_prog_data, base);
+               cs_prog_data->uses_num_work_groups = true;
+            }
             break;
          }
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 9fe244fc5b6..3a827361ad0 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -1700,9 +1700,6 @@ anv_pipeline_add_executable(struct anv_pipeline *pipeline,
                      stage->bind_map.push_ranges[i].start * 32);
             break;
 
-         case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS:
-            unreachable("gl_NumWorkgroups is never pushed");
-
          case ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS:
             unreachable("Color attachments can't be pushed");
 
@@ -2684,13 +2681,6 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
 
    anv_stage_allocate_bind_map_tables(&pipeline->base, &stage, mem_ctx);
 
-   /* Set up a binding for the gl_NumWorkGroups */
-   stage.bind_map.surface_count = 1;
-   stage.bind_map.surface_to_descriptor[0] = (struct anv_pipeline_binding) {
-      .set = ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS,
-      .binding = UINT32_MAX,
-   };
-
    VkResult result = anv_pipeline_stage_get_nir(&pipeline->base, cache,
                                                 mem_ctx, &stage);
    if (result != VK_SUCCESS) {
@@ -2736,12 +2726,6 @@ anv_pipeline_compile_cs(struct anv_compute_pipeline *pipeline,
    anv_nir_validate_push_layout(device->physical, &stage.prog_data.base,
                                 &stage.bind_map);
 
-   if (!stage.prog_data.cs.uses_num_work_groups) {
-      assert(stage.bind_map.surface_to_descriptor[0].set ==
-             ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS);
-      stage.bind_map.surface_to_descriptor[0].set = ANV_DESCRIPTOR_SET_NULL;
-   }
-
    struct anv_shader_upload_params upload_params = {
       .stage               = MESA_SHADER_COMPUTE,
       .key_data            = &stage.cache_key,
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 46253af46b0..15a3c4065b9 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -222,6 +222,8 @@ struct intel_perf_query_result;
 
 #define ANV_GRAPHICS_SHADER_STAGE_COUNT (MESA_SHADER_MESH + 1)
 
+#define ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET (8)
+
 /* RENDER_SURFACE_STATE is a bit smaller (48b) but since it is aligned to 64
  * and we can't put anything else there we use 64b.
  */
@@ -3076,11 +3078,10 @@ anv_descriptor_set_write_template(struct anv_device *device,
                                   const struct vk_descriptor_update_template *template,
                                   const void *data);
 
-#define ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER (UINT8_MAX - 5)
-#define ANV_DESCRIPTOR_SET_NULL               (UINT8_MAX - 4)
-#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS     (UINT8_MAX - 3)
-#define ANV_DESCRIPTOR_SET_DESCRIPTORS        (UINT8_MAX - 2)
-#define ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS    (UINT8_MAX - 1)
+#define ANV_DESCRIPTOR_SET_DESCRIPTORS_BUFFER (UINT8_MAX - 4)
+#define ANV_DESCRIPTOR_SET_NULL               (UINT8_MAX - 3)
+#define ANV_DESCRIPTOR_SET_PUSH_CONSTANTS     (UINT8_MAX - 2)
+#define ANV_DESCRIPTOR_SET_DESCRIPTORS        (UINT8_MAX - 1)
 #define ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS  UINT8_MAX
 
 struct anv_pipeline_binding {
@@ -3598,6 +3599,9 @@ struct anv_push_constants {
     */
    uint32_t base_work_group_id[3];
 
+   /** gl_NumWorkgroups */
+   uint32_t num_work_groups[3];
+
    /** Subgroup ID
    *
    * This is never set by software but is implicitly filled out when
@@ -3908,8 +3912,6 @@ struct anv_cmd_compute_state {
 
    bool pipeline_dirty;
 
-   struct anv_address num_workgroups;
-
    uint32_t scratch_size;
 };
 
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 689286a124d..a78b44431b1 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -2102,29 +2102,6 @@ emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
          bt_map[s] = surface_state.offset + state_offset;
          break;
 
-      case ANV_DESCRIPTOR_SET_NUM_WORK_GROUPS: {
-         /* This is always the first binding for compute shaders */
-         assert(shader->stage == MESA_SHADER_COMPUTE && s == 0);
-
-         struct anv_state surface_state =
-            anv_cmd_buffer_alloc_surface_states(cmd_buffer, 1);
-         if (surface_state.map == NULL)
-            return VK_ERROR_OUT_OF_DEVICE_MEMORY;
-
-         const enum isl_format format =
-            anv_isl_format_for_descriptor_type(cmd_buffer->device,
-                                               VK_DESCRIPTOR_TYPE_STORAGE_BUFFER);
-         anv_fill_buffer_surface_state(cmd_buffer->device, surface_state.map,
-                                       format, ISL_SWIZZLE_IDENTITY,
-                                       ISL_SURF_USAGE_CONSTANT_BUFFER_BIT,
-                                       cmd_buffer->state.compute.num_workgroups,
-                                       12, 1);
-
-         assert(surface_state.map);
-         bt_map[s] = surface_state.offset + state_offset;
-         break;
-      }
-
       case ANV_DESCRIPTOR_SET_DESCRIPTORS: {
          struct anv_descriptor_set *set =
            pipe_state->descriptors[binding->index];
@@ -4762,6 +4739,20 @@ genX(flush_pipeline_select)(struct anv_cmd_buffer *cmd_buffer,
       }
    }
 #endif
+
+#if GFX_VER == 9
+   /* Undocumented workaround: we need to reemit MEDIA_CURBE_LOAD on Gfx9 when
+    * switching from 3D->GPGPU, otherwise the shader gets corrupted push
+    * constants. Note that this doesn't trigger a push constant reallocation,
+    * we just reprogram the same pointer.
+    *
+    * The issue reproduces pretty much 100% on
+    * dEQP-VK.memory_model.transitive.* tests. Reducing the number of
+    * iterations in the test from 50 to < 10 makes the tests flaky.
+    */
+   if (pipeline == GPGPU)
+      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
+#endif
 #endif /* else of if GFX_VER >= 20 */
    cmd_buffer->state.current_pipeline = pipeline;
 }
diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c
index 17c9e671e53..efc01d1d9cc 100644
--- a/src/intel/vulkan/genX_cmd_compute.c
+++ b/src/intel/vulkan/genX_cmd_compute.c
@@ -145,11 +145,15 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer)
       genX(cmd_buffer_ensure_cfe_state)(cmd_buffer,
                                         prog_data->base.total_scratch);
 #endif
-      /* The workgroup size of the pipeline affects our push constant layout
-       * so flag push constants as dirty if we change the pipeline.
+#if GFX_VERx10 == 120
+      /* Normally we should not require any dirtying here, but for some reason
+       * on Gfx12.0, when running tests in parallel we see failures in the
+       * dEQP-VK.memory_model.* tests. This is likely a HW issue with push
+       * constants & context save/restore.
        */
       cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
       comp_state->base.push_constants_data_dirty = true;
+#endif
    }
 
    cmd_buffer->state.descriptors_dirty |=
@@ -217,23 +221,58 @@
 }
 
 static void
-anv_cmd_buffer_push_base_group_id(struct anv_cmd_buffer *cmd_buffer,
-                                  uint32_t baseGroupX,
-                                  uint32_t baseGroupY,
-                                  uint32_t baseGroupZ)
+anv_cmd_buffer_push_workgroups(struct anv_cmd_buffer *cmd_buffer,
+                               const struct brw_cs_prog_data *prog_data,
+                               uint32_t baseGroupX,
+                               uint32_t baseGroupY,
+                               uint32_t baseGroupZ,
+                               uint32_t groupCountX,
+                               uint32_t groupCountY,
+                               uint32_t groupCountZ,
+                               struct anv_address indirect_group)
 {
    if (anv_batch_has_error(&cmd_buffer->batch))
       return;
 
    struct anv_push_constants *push =
       &cmd_buffer->state.compute.base.push_constants;
+   bool updated = false;
    if (push->cs.base_work_group_id[0] != baseGroupX ||
       push->cs.base_work_group_id[1] != baseGroupY ||
       push->cs.base_work_group_id[2] != baseGroupZ) {
      push->cs.base_work_group_id[0] = baseGroupX;
      push->cs.base_work_group_id[1] = baseGroupY;
      push->cs.base_work_group_id[2] = baseGroupZ;
+      updated = true;
+   }
+   /* On Gfx12.5+ this value goes into the inline parameter register */
+   if (GFX_VERx10 < 125 && prog_data->uses_num_work_groups) {
+      if (anv_address_is_null(indirect_group)) {
+         if (push->cs.num_work_groups[0] != groupCountX ||
+             push->cs.num_work_groups[1] != groupCountY ||
+             push->cs.num_work_groups[2] != groupCountZ) {
+            push->cs.num_work_groups[0] = groupCountX;
+            push->cs.num_work_groups[1] = groupCountY;
+            push->cs.num_work_groups[2] = groupCountZ;
+            updated = true;
+         }
+      } else {
+         uint64_t addr64 = anv_address_physical(indirect_group);
+         uint32_t lower_addr32 = addr64 & 0xffffffff;
+         uint32_t upper_addr32 = addr64 >> 32;
+         if (push->cs.num_work_groups[0] != UINT32_MAX ||
+             push->cs.num_work_groups[1] != lower_addr32 ||
+             push->cs.num_work_groups[2] != upper_addr32) {
+            push->cs.num_work_groups[0] = UINT32_MAX;
+            push->cs.num_work_groups[1] = lower_addr32;
+            push->cs.num_work_groups[2] = upper_addr32;
+            updated = true;
+         }
+      }
+   }
+
+   if (updated) {
      cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
      cmd_buffer->state.compute.base.push_constants_data_dirty = true;
    }
 }
@@ -321,6 +360,8 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
    const int dispatch_size = dispatch.simd_size / 16;
 
+   uint64_t indirect_addr64 = anv_address_physical(indirect_addr);
+
    struct GENX(COMPUTE_WALKER_BODY) body = {
       .SIMDSize = dispatch_size,
      .MessageSIMD = dispatch_size,
@@ -339,6 +380,11 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
       .InterfaceDescriptor =
          get_interface_descriptor_data(cmd_buffer, shader,
                                        prog_data, &dispatch),
+      .InlineData = {
+         [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = UINT32_MAX,
+         [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = indirect_addr64 & 0xffffffff,
+         [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = indirect_addr64 >> 32,
+      },
    };
 
    cmd_buffer->state.last_indirect_dispatch =
@@ -357,7 +403,8 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer,
 
 static inline void
 emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
-                    const struct anv_compute_pipeline *pipeline, bool indirect,
+                    const struct anv_compute_pipeline *pipeline,
+                    struct anv_address indirect_addr,
                     const struct brw_cs_prog_data *prog_data,
                     uint32_t groupCountX, uint32_t groupCountY,
                     uint32_t groupCountZ)
@@ -369,12 +416,24 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
    const struct intel_cs_dispatch_info dispatch =
       brw_cs_get_dispatch_info(devinfo, prog_data, NULL);
 
+   uint32_t num_workgroup_data[3];
+   if (!anv_address_is_null(indirect_addr)) {
+      uint64_t indirect_addr64 = anv_address_physical(indirect_addr);
+      num_workgroup_data[0] = 0xffffffff;
+      num_workgroup_data[1] = indirect_addr64 & 0xffffffff;
+      num_workgroup_data[2] = indirect_addr64 >> 32;
+   } else {
+      num_workgroup_data[0] = groupCountX;
+      num_workgroup_data[1] = groupCountY;
+      num_workgroup_data[2] = groupCountZ;
+   }
+
    cmd_buffer->state.last_compute_walker =
       anv_batch_emitn(
          &cmd_buffer->batch,
          GENX(COMPUTE_WALKER_length),
          GENX(COMPUTE_WALKER),
-         .IndirectParameterEnable = indirect,
+         .IndirectParameterEnable = !anv_address_is_null(indirect_addr),
          .PredicateEnable = predicate,
          .SIMDSize = dispatch.simd_size / 16,
          .MessageSIMD = dispatch.simd_size / 16,
@@ -401,7 +460,13 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer,
          .InterfaceDescriptor =
            get_interface_descriptor_data(cmd_buffer, pipeline->cs,
                                          prog_data, &dispatch),
-   );
+         .EmitInlineParameter = prog_data->uses_inline_data,
+         .InlineData = {
+            [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = num_workgroup_data[0],
+            [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = num_workgroup_data[1],
+            [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = num_workgroup_data[2],
+         });
+
 }
 
 #else /* #if GFX_VERx10 >= 125 */
@@ -459,7 +524,7 @@ emit_cs_walker(struct anv_cmd_buffer *cmd_buffer,
       compute_load_indirect_params(cmd_buffer, indirect_addr);
 
 #if GFX_VERx10 >= 125
-   emit_compute_walker(cmd_buffer, pipeline, is_indirect, prog_data,
+   emit_compute_walker(cmd_buffer, pipeline, indirect_addr, prog_data,
                        groupCountX, groupCountY, groupCountZ);
 #else
    emit_gpgpu_walker(cmd_buffer, pipeline, is_indirect, prog_data,
@@ -481,12 +546,14 @@ void genX(CmdDispatchBase)(
       anv_pipeline_to_compute(cmd_buffer->state.compute.base.pipeline);
    const struct brw_cs_prog_data *prog_data = get_cs_prog_data(pipeline);
 
-   anv_cmd_buffer_push_base_group_id(cmd_buffer, baseGroupX,
-                                     baseGroupY, baseGroupZ);
-
    if (anv_batch_has_error(&cmd_buffer->batch))
       return;
 
+   anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
+                                  baseGroupX, baseGroupY, baseGroupZ,
+                                  groupCountX, groupCountY, groupCountZ,
+                                  ANV_NULL_ADDRESS);
+
    anv_measure_snapshot(cmd_buffer,
                         INTEL_SNAPSHOT_COMPUTE,
                         "compute",
@@ -496,20 +563,6 @@ void genX(CmdDispatchBase)(
 
    trace_intel_begin_compute(&cmd_buffer->trace);
 
-   if (prog_data->uses_num_work_groups) {
-      struct anv_state state =
-         anv_cmd_buffer_alloc_temporary_state(cmd_buffer, 12, 4);
-      uint32_t *sizes = state.map;
-      sizes[0] = groupCountX;
-      sizes[1] = groupCountY;
-      sizes[2] = groupCountZ;
-      cmd_buffer->state.compute.num_workgroups =
-         anv_cmd_buffer_temporary_state_address(cmd_buffer, state);
-
-      /* The num_workgroups buffer goes in the binding table */
-      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
-   }
-
    genX(cmd_buffer_flush_compute_state)(cmd_buffer);
 
    if (cmd_buffer->state.conditional_render_enabled)
@@ -536,7 +589,11 @@ void genX(CmdDispatchIndirect)(
    struct anv_address addr = anv_address_add(buffer->address, offset);
    UNUSED struct anv_batch *batch = &cmd_buffer->batch;
 
-   anv_cmd_buffer_push_base_group_id(cmd_buffer, 0, 0, 0);
+   if (anv_batch_has_error(&cmd_buffer->batch))
+      return;
+
+   anv_cmd_buffer_push_workgroups(cmd_buffer, prog_data,
+                                  0, 0, 0, 0, 0, 0, addr);
 
    anv_measure_snapshot(cmd_buffer,
                         INTEL_SNAPSHOT_COMPUTE,
@@ -544,13 +601,6 @@ void genX(CmdDispatchIndirect)(
                         0);
 
    trace_intel_begin_compute_indirect(&cmd_buffer->trace);
 
-   if (prog_data->uses_num_work_groups) {
-      cmd_buffer->state.compute.num_workgroups = addr;
-
-      /* The num_workgroups buffer goes in the binding table */
-      cmd_buffer->state.descriptors_dirty |= VK_SHADER_STAGE_COMPUTE_BIT;
-   }
-
   genX(cmd_buffer_flush_compute_state)(cmd_buffer);
 
   if (cmd_buffer->state.conditional_render_enabled)