diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 72fc4cdea4f..17aaba3c612 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -660,9 +660,9 @@ iris_rewrite_compute_walker_pc(struct iris_batch *batch, uint32_t dwords[GENX(COMPUTE_WALKER_length)]; _iris_pack_command(batch, GENX(COMPUTE_WALKER), dwords, cw) { - cw.PostSync.Operation = WriteTimestamp; - cw.PostSync.DestinationAddress = addr; - cw.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0); + cw.body.PostSync.Operation = WriteTimestamp; + cw.body.PostSync.DestinationAddress = addr; + cw.body.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0); } for (uint32_t i = 0; i < GENX(COMPUTE_WALKER_length); i++) @@ -9012,29 +9012,33 @@ iris_upload_compute_walker(struct iris_context *ice, ice->utrace.last_compute_walker = iris_emit_dwords(batch, GENX(COMPUTE_WALKER_length)); + + struct GENX(COMPUTE_WALKER_BODY) body = { + .SIMDSize = dispatch.simd_size / 16, + .MessageSIMD = dispatch.simd_size / 16, + .LocalXMaximum = grid->block[0] - 1, + .LocalYMaximum = grid->block[1] - 1, + .LocalZMaximum = grid->block[2] - 1, + .ThreadGroupIDXDimension = grid->grid[0], + .ThreadGroupIDYDimension = grid->grid[1], + .ThreadGroupIDZDimension = grid->grid[2], + .ExecutionMask = dispatch.right_mask, + .PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0), + .InterfaceDescriptor = idd, + +#if GFX_VERx10 >= 125 + .GenerateLocalID = cs_data->generate_local_id != 0, + .EmitLocal = cs_data->generate_local_id, + .WalkOrder = cs_data->walk_order, + .TileLayout = cs_data->walk_order == INTEL_WALK_ORDER_YXZ ? + TileY32bpe : Linear, +#endif + }; + _iris_pack_command(batch, GENX(COMPUTE_WALKER), ice->utrace.last_compute_walker, cw) { cw.IndirectParameterEnable = grid->indirect; - cw.SIMDSize = dispatch.simd_size / 16; - cw.MessageSIMD = dispatch.simd_size / 16; - cw.LocalXMaximum = grid->block[0] - 1; - cw.LocalYMaximum = grid->block[1] - 1; - cw.LocalZMaximum = grid->block[2] - 1; - cw.ThreadGroupIDXDimension = grid->grid[0]; - cw.ThreadGroupIDYDimension = grid->grid[1]; - cw.ThreadGroupIDZDimension = grid->grid[2]; - cw.ExecutionMask = dispatch.right_mask; - cw.PostSync.MOCS = iris_mocs(NULL, &screen->isl_dev, 0); - cw.InterfaceDescriptor = idd; - -#if GFX_VERx10 >= 125 - cw.GenerateLocalID = cs_data->generate_local_id != 0; - cw.EmitLocal = cs_data->generate_local_id; - cw.WalkOrder = cs_data->walk_order; - cw.TileLayout = cs_data->walk_order == INTEL_WALK_ORDER_YXZ ? - TileY32bpe : Linear; -#endif - + cw.body = body; assert(iris_cs_push_const_total_size(shader, dispatch.threads) == 0); } } diff --git a/src/intel/blorp/blorp_genX_exec_brw.h b/src/intel/blorp/blorp_genX_exec_brw.h index 97a549a57e3..dc5b633bd30 100644 --- a/src/intel/blorp/blorp_genX_exec_brw.h +++ b/src/intel/blorp/blorp_genX_exec_brw.h @@ -1653,43 +1653,42 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params) assert(cs_prog_data->local_size[2] == 1); #if GFX_VERx10 >= 125 - assert(cs_prog_data->push.per_thread.regs == 0); - blorp_emit(batch, GENX(COMPUTE_WALKER), cw) { - cw.SIMDSize = dispatch.simd_size / 16; - cw.MessageSIMD = dispatch.simd_size / 16, - cw.LocalXMaximum = cs_prog_data->local_size[0] - 1; - cw.LocalYMaximum = cs_prog_data->local_size[1] - 1; - cw.LocalZMaximum = cs_prog_data->local_size[2] - 1; - cw.ThreadGroupIDStartingX = group_x0; - cw.ThreadGroupIDStartingY = group_y0; - cw.ThreadGroupIDStartingZ = group_z0; - cw.ThreadGroupIDXDimension = group_x1; - cw.ThreadGroupIDYDimension = group_y1; - cw.ThreadGroupIDZDimension = group_z1; - cw.ExecutionMask = 0xffffffff; - cw.PostSync.MOCS = isl_mocs(batch->blorp->isl_dev, 0, false); + uint32_t surfaces_offset = blorp_setup_binding_table(batch, params); - uint32_t surfaces_offset = blorp_setup_binding_table(batch, params); + uint32_t samplers_offset = + params->src.enabled ? blorp_emit_sampler_state(batch) : 0; - uint32_t samplers_offset = - params->src.enabled ? blorp_emit_sampler_state(batch) : 0; + uint32_t push_const_offset; + unsigned push_const_size; + blorp_get_compute_push_const(batch, params, dispatch.threads, + &push_const_offset, &push_const_size); + struct GENX(COMPUTE_WALKER_BODY) body = { + .SIMDSize = dispatch.simd_size / 16, + .MessageSIMD = dispatch.simd_size / 16, + .LocalXMaximum = cs_prog_data->local_size[0] - 1, + .LocalYMaximum = cs_prog_data->local_size[1] - 1, + .LocalZMaximum = cs_prog_data->local_size[2] - 1, + .ThreadGroupIDStartingX = group_x0, + .ThreadGroupIDStartingY = group_y0, + .ThreadGroupIDStartingZ = group_z0, + .ThreadGroupIDXDimension = group_x1, + .ThreadGroupIDYDimension = group_y1, + .ThreadGroupIDZDimension = group_z1, + .ExecutionMask = 0xffffffff, + .PostSync.MOCS = isl_mocs(batch->blorp->isl_dev, 0, false), - uint32_t push_const_offset; - unsigned push_const_size; - blorp_get_compute_push_const(batch, params, dispatch.threads, - &push_const_offset, &push_const_size); - cw.IndirectDataStartAddress = push_const_offset; - cw.IndirectDataLength = push_const_size; + .IndirectDataStartAddress = push_const_offset, + .IndirectDataLength = push_const_size, #if GFX_VERx10 >= 125 - cw.GenerateLocalID = cs_prog_data->generate_local_id != 0; - cw.EmitLocal = cs_prog_data->generate_local_id; - cw.WalkOrder = cs_prog_data->walk_order; - cw.TileLayout = cs_prog_data->walk_order == INTEL_WALK_ORDER_YXZ ? - TileY32bpe : Linear; + .GenerateLocalID = cs_prog_data->generate_local_id != 0, + .EmitLocal = cs_prog_data->generate_local_id, + .WalkOrder = cs_prog_data->walk_order, + .TileLayout = cs_prog_data->walk_order == INTEL_WALK_ORDER_YXZ ? + TileY32bpe : Linear, #endif - cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { + .InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { .KernelStartPointer = params->cs_prog_kernel, .SamplerStatePointer = samplers_offset, .SamplerCount = params->src.enabled ? 1 : 0, @@ -1704,7 +1703,12 @@ blorp_exec_compute(struct blorp_batch *batch, const struct blorp_params *params) dispatch.group_size, dispatch.simd_size), .NumberOfBarriers = cs_prog_data->uses_barrier, - }; + }, + }; + + assert(cs_prog_data->push.per_thread.regs == 0); + blorp_emit(batch, GENX(COMPUTE_WALKER), cw) { + cw.body = body; } #else diff --git a/src/intel/executor/executor_genx.c b/src/intel/executor/executor_genx.c index a5548dc1d25..c523ca91757 100644 --- a/src/intel/executor/executor_genx.c +++ b/src/intel/executor/executor_genx.c @@ -143,17 +143,23 @@ genX(emit_execute)(executor_context *ec, const executor_params *params) emit_pipe_control(ec); #if GFX_VERx10 >= 125 - executor_batch_emit(GENX(COMPUTE_WALKER), cw) { + struct GENX(COMPUTE_WALKER_BODY) body = { #if GFX_VERx10 >= 200 - cw.SIMDSize = 1; - cw.MessageSIMD = 1; + .SIMDSize = 1, + .MessageSIMD = 1, #endif - cw.ThreadGroupIDXDimension = 1; - cw.ThreadGroupIDYDimension = 1; - cw.ThreadGroupIDZDimension = 1; - cw.ExecutionMask = 0xFFFFFFFF; - cw.PostSync.MOCS = mocs; - cw.InterfaceDescriptor = desc; + .ThreadGroupIDXDimension = 1, + .ThreadGroupIDYDimension = 1, + .ThreadGroupIDZDimension = 1, + .ExecutionMask = 0xFFFFFFFF, + .PostSync.MOCS = mocs, + .InterfaceDescriptor = desc, + }; +#endif + +#if GFX_VERx10 >= 125 + executor_batch_emit(GENX(COMPUTE_WALKER), cw) { + cw.body = body; }; #else uint32_t *idd = executor_alloc_bytes_aligned(&ec->bo.extra, 8 * 4, 256); diff --git a/src/intel/genxml/gen125.xml b/src/intel/genxml/gen125.xml index 13e861ac492..c78cdef9ff7 100644 --- a/src/intel/genxml/gen125.xml +++ b/src/intel/genxml/gen125.xml @@ -1590,66 +1590,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + diff --git a/src/intel/genxml/gen20.xml b/src/intel/genxml/gen20.xml index 18b6aa47bcd..c43a0bc292d 100644 --- a/src/intel/genxml/gen20.xml +++ b/src/intel/genxml/gen20.xml @@ -936,64 +936,7 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c index 7274837768a..c97350245a3 100644 --- a/src/intel/vulkan/genX_cmd_buffer.c +++ b/src/intel/vulkan/genX_cmd_buffer.c @@ -6197,11 +6197,13 @@ void genX(cmd_emit_timestamp)(struct anv_batch *batch, uint32_t dwords[GENX(COMPUTE_WALKER_length)]; GENX(COMPUTE_WALKER_pack)(batch, dwords, &(struct GENX(COMPUTE_WALKER)) { - .PostSync = (struct GENX(POSTSYNC_DATA)) { - .Operation = WriteTimestamp, - .DestinationAddress = addr, - .MOCS = anv_mocs(device, NULL, 0), - }, + .body = { + .PostSync = (struct GENX(POSTSYNC_DATA)) { + .Operation = WriteTimestamp, + .DestinationAddress = addr, + .MOCS = anv_mocs(device, NULL, 0), + }, + } }); for (uint32_t i = 0; i < ARRAY_SIZE(dwords); i++) { diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c index 132e2d56088..6db84bc23f6 100644 --- a/src/intel/vulkan/genX_cmd_compute.c +++ b/src/intel/vulkan/genX_cmd_compute.c @@ -437,6 +437,37 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer, num_workgroup_data[2] = groupCountZ; } + struct GENX(COMPUTE_WALKER_BODY) body = { + .SIMDSize = dispatch.simd_size / 16, + .MessageSIMD = dispatch.simd_size / 16, + .IndirectDataStartAddress = comp_state->base.push_constants_state.offset, + .IndirectDataLength = comp_state->base.push_constants_state.alloc_size, + .GenerateLocalID = prog_data->generate_local_id != 0, + .EmitLocal = prog_data->generate_local_id, + .WalkOrder = prog_data->walk_order, + .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ? + TileY32bpe : Linear, + .LocalXMaximum = prog_data->local_size[0] - 1, + .LocalYMaximum = prog_data->local_size[1] - 1, + .LocalZMaximum = prog_data->local_size[2] - 1, + .ThreadGroupIDXDimension = groupCountX, + .ThreadGroupIDYDimension = groupCountY, + .ThreadGroupIDZDimension = groupCountZ, + .ExecutionMask = dispatch.right_mask, + .PostSync = { + .MOCS = anv_mocs(pipeline->base.device, NULL, 0), + }, + .InterfaceDescriptor = + get_interface_descriptor_data(cmd_buffer, pipeline->cs, + prog_data, &dispatch), + .EmitInlineParameter = prog_data->uses_inline_data, + .InlineData = { + [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = num_workgroup_data[0], + [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = num_workgroup_data[1], + [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = num_workgroup_data[2], + } + }; + cmd_buffer->state.last_compute_walker = anv_batch_emitn( &cmd_buffer->batch, @@ -444,38 +475,11 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer, GENX(COMPUTE_WALKER), .IndirectParameterEnable = !anv_address_is_null(indirect_addr), .PredicateEnable = predicate, - .SIMDSize = dispatch.simd_size / 16, - .MessageSIMD = dispatch.simd_size / 16, - .IndirectDataStartAddress = comp_state->base.push_constants_state.offset, - .IndirectDataLength = comp_state->base.push_constants_state.alloc_size, + .body = body, #if GFX_VERx10 == 125 .SystolicModeEnable = prog_data->uses_systolic, #endif - .GenerateLocalID = prog_data->generate_local_id != 0, - .EmitLocal = prog_data->generate_local_id, - .WalkOrder = prog_data->walk_order, - .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ? - TileY32bpe : Linear, - .LocalXMaximum = prog_data->local_size[0] - 1, - .LocalYMaximum = prog_data->local_size[1] - 1, - .LocalZMaximum = prog_data->local_size[2] - 1, - .ThreadGroupIDXDimension = groupCountX, - .ThreadGroupIDYDimension = groupCountY, - .ThreadGroupIDZDimension = groupCountZ, - .ExecutionMask = dispatch.right_mask, - .PostSync = { - .MOCS = anv_mocs(pipeline->base.device, NULL, 0), - }, - .InterfaceDescriptor = - get_interface_descriptor_data(cmd_buffer, pipeline->cs, - prog_data, &dispatch), - .EmitInlineParameter = prog_data->uses_inline_data, - .InlineData = { - [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = num_workgroup_data[0], - [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = num_workgroup_data[1], - [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = num_workgroup_data[2], - }); - + ); } #else /* #if GFX_VERx10 >= 125 */ @@ -724,33 +728,39 @@ genX(cmd_buffer_dispatch_kernel)(struct anv_cmd_buffer *cmd_buffer, struct intel_cs_dispatch_info dispatch = brw_cs_get_dispatch_info(devinfo, cs_prog_data, NULL); - anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) { - cw.PredicateEnable = false; - cw.SIMDSize = dispatch.simd_size / 16; - cw.MessageSIMD = dispatch.simd_size / 16; - cw.IndirectDataStartAddress = indirect_data.offset; - cw.IndirectDataLength = indirect_data.alloc_size; - cw.LocalXMaximum = cs_prog_data->local_size[0] - 1; - cw.LocalYMaximum = cs_prog_data->local_size[1] - 1; - cw.LocalZMaximum = cs_prog_data->local_size[2] - 1; - cw.ExecutionMask = dispatch.right_mask; - cw.PostSync.MOCS = cmd_buffer->device->isl_dev.mocs.internal; - - if (global_size != NULL) { - cw.ThreadGroupIDXDimension = global_size[0]; - cw.ThreadGroupIDYDimension = global_size[1]; - cw.ThreadGroupIDZDimension = global_size[2]; - } else { - cw.IndirectParameterEnable = true; - } - - cw.InterfaceDescriptor = + struct GENX(COMPUTE_WALKER_BODY) body = { + .SIMDSize = dispatch.simd_size / 16, + .MessageSIMD = dispatch.simd_size / 16, + .IndirectDataStartAddress = indirect_data.offset, + .IndirectDataLength = indirect_data.alloc_size, + .LocalXMaximum = cs_prog_data->local_size[0] - 1, + .LocalYMaximum = cs_prog_data->local_size[1] - 1, + .LocalZMaximum = cs_prog_data->local_size[2] - 1, + .ExecutionMask = dispatch.right_mask, + .PostSync.MOCS = cmd_buffer->device->isl_dev.mocs.internal, + .InterfaceDescriptor = get_interface_descriptor_data(cmd_buffer, kernel->bin, cs_prog_data, - &dispatch); + &dispatch), + }; + + if (global_size != NULL) { + body.ThreadGroupIDXDimension = global_size[0]; + body.ThreadGroupIDYDimension = global_size[1]; + body.ThreadGroupIDZDimension = global_size[2]; } + cmd_buffer->state.last_compute_walker = + anv_batch_emitn( + &cmd_buffer->batch, + GENX(COMPUTE_WALKER_length), + GENX(COMPUTE_WALKER), + .IndirectParameterEnable = global_size == NULL, + .PredicateEnable = false, + .body = body, + ); + /* We just blew away the compute pipeline state */ cmd_buffer->state.compute.pipeline_dirty = true; } @@ -1132,26 +1142,39 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer, struct intel_cs_dispatch_info dispatch = brw_cs_get_dispatch_info(device->info, cs_prog_data, NULL); - anv_batch_emit(&cmd_buffer->batch, GENX(COMPUTE_WALKER), cw) { - cw.IndirectParameterEnable = params->is_launch_size_indirect; - cw.PredicateEnable = cmd_buffer->state.conditional_render_enabled; - cw.SIMDSize = dispatch.simd_size / 16; - cw.MessageSIMD = dispatch.simd_size / 16; - cw.LocalXMaximum = (1 << local_size_log2[0]) - 1; - cw.LocalYMaximum = (1 << local_size_log2[1]) - 1; - cw.LocalZMaximum = (1 << local_size_log2[2]) - 1; - cw.ThreadGroupIDXDimension = global_size[0]; - cw.ThreadGroupIDYDimension = global_size[1]; - cw.ThreadGroupIDZDimension = global_size[2]; - cw.ExecutionMask = 0xff; - cw.EmitInlineParameter = true; - cw.PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0); + const gl_shader_stage s = MESA_SHADER_RAYGEN; + struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s]; + struct anv_state *samplers = &cmd_buffer->state.samplers[s]; + struct brw_rt_raygen_trampoline_params trampoline_params = { + .rt_disp_globals_addr = anv_address_physical(rtdg_addr), + .raygen_bsr_addr = + params->is_sbt_indirect ? + (params->indirect_sbts_addr + + offsetof(VkTraceRaysIndirectCommand2KHR, + raygenShaderRecordAddress)) : + params->raygen_sbt->deviceAddress, + .is_indirect = params->is_sbt_indirect, + .local_group_size_log2 = { + local_size_log2[0], + local_size_log2[1], + local_size_log2[2], + }, + }; - const gl_shader_stage s = MESA_SHADER_RAYGEN; - struct anv_device *device = cmd_buffer->device; - struct anv_state *surfaces = &cmd_buffer->state.binding_tables[s]; - struct anv_state *samplers = &cmd_buffer->state.samplers[s]; - cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { + struct GENX(COMPUTE_WALKER_BODY) body = { + .SIMDSize = dispatch.simd_size / 16, + .MessageSIMD = dispatch.simd_size / 16, + .LocalXMaximum = (1 << local_size_log2[0]) - 1, + .LocalYMaximum = (1 << local_size_log2[1]) - 1, + .LocalZMaximum = (1 << local_size_log2[2]) - 1, + .ThreadGroupIDXDimension = global_size[0], + .ThreadGroupIDYDimension = global_size[1], + .ThreadGroupIDZDimension = global_size[2], + .ExecutionMask = 0xff, + .EmitInlineParameter = true, + .PostSync.MOCS = anv_mocs(pipeline->base.device, NULL, 0), + + .InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { .KernelStartPointer = device->rt_trampoline->kernel.offset, .SamplerStatePointer = samplers->offset, /* i965: DIV_ROUND_UP(CLAMP(stage_state->sampler_count, 0, 16), 4), */ @@ -1162,26 +1185,21 @@ cmd_buffer_trace_rays(struct anv_cmd_buffer *cmd_buffer, #if INTEL_NEEDS_WA_14017794102 .ThreadPreemption = false, #endif - }; + }, + }; - struct brw_rt_raygen_trampoline_params trampoline_params = { - .rt_disp_globals_addr = anv_address_physical(rtdg_addr), - .raygen_bsr_addr = - params->is_sbt_indirect ? - (params->indirect_sbts_addr + - offsetof(VkTraceRaysIndirectCommand2KHR, - raygenShaderRecordAddress)) : - params->raygen_sbt->deviceAddress, - .is_indirect = params->is_sbt_indirect, - .local_group_size_log2 = { - local_size_log2[0], - local_size_log2[1], - local_size_log2[2], - }, - }; - STATIC_ASSERT(sizeof(trampoline_params) == 32); - memcpy(cw.InlineData, &trampoline_params, sizeof(trampoline_params)); - } + STATIC_ASSERT(sizeof(trampoline_params) == 32); + memcpy(body.InlineData, &trampoline_params, sizeof(trampoline_params)); + + cmd_buffer->state.last_compute_walker = + anv_batch_emitn( + &cmd_buffer->batch, + GENX(COMPUTE_WALKER_length), + GENX(COMPUTE_WALKER), + .IndirectParameterEnable = params->is_launch_size_indirect, + .PredicateEnable = cmd_buffer->state.conditional_render_enabled, + .body = body, + ); trace_intel_end_rays(&cmd_buffer->trace, params->launch_size[0], diff --git a/src/intel/vulkan/genX_simple_shader.c b/src/intel/vulkan/genX_simple_shader.c index 3e44b23a806..062db96c4f0 100644 --- a/src/intel/vulkan/genX_simple_shader.c +++ b/src/intel/vulkan/genX_simple_shader.c @@ -565,30 +565,30 @@ genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state, brw_cs_get_dispatch_info(devinfo, prog_data, NULL); #if GFX_VERx10 >= 125 - anv_batch_emit(batch, GENX(COMPUTE_WALKER), cw) { - cw.SIMDSize = dispatch.simd_size / 16; - cw.MessageSIMD = dispatch.simd_size / 16, - cw.IndirectDataStartAddress = push_state.offset; - cw.IndirectDataLength = push_state.alloc_size; - cw.LocalXMaximum = prog_data->local_size[0] - 1; - cw.LocalYMaximum = prog_data->local_size[1] - 1; - cw.LocalZMaximum = prog_data->local_size[2] - 1; - cw.ThreadGroupIDXDimension = DIV_ROUND_UP(num_threads, - dispatch.simd_size); - cw.ThreadGroupIDYDimension = 1; - cw.ThreadGroupIDZDimension = 1; - cw.ExecutionMask = dispatch.right_mask; - cw.PostSync.MOCS = anv_mocs(device, NULL, 0); + struct GENX(COMPUTE_WALKER_BODY) body = { + .SIMDSize = dispatch.simd_size / 16, + .MessageSIMD = dispatch.simd_size / 16, + .IndirectDataStartAddress = push_state.offset, + .IndirectDataLength = push_state.alloc_size, + .LocalXMaximum = prog_data->local_size[0] - 1, + .LocalYMaximum = prog_data->local_size[1] - 1, + .LocalZMaximum = prog_data->local_size[2] - 1, + .ThreadGroupIDXDimension = DIV_ROUND_UP(num_threads, + dispatch.simd_size), + .ThreadGroupIDYDimension = 1, + .ThreadGroupIDZDimension = 1, + .ExecutionMask = dispatch.right_mask, + .PostSync.MOCS = anv_mocs(device, NULL, 0), #if GFX_VERx10 >= 125 - cw.GenerateLocalID = prog_data->generate_local_id != 0; - cw.EmitLocal = prog_data->generate_local_id; - cw.WalkOrder = prog_data->walk_order; - cw.TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ? - TileY32bpe : Linear; + .GenerateLocalID = prog_data->generate_local_id != 0, + .EmitLocal = prog_data->generate_local_id, + .WalkOrder = prog_data->walk_order, + .TileLayout = prog_data->walk_order == INTEL_WALK_ORDER_YXZ ? + TileY32bpe : Linear, #endif - cw.InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { + .InterfaceDescriptor = (struct GENX(INTERFACE_DESCRIPTOR_DATA)) { .KernelStartPointer = state->kernel->kernel.offset + brw_cs_prog_data_prog_offset(prog_data, dispatch.simd_size), @@ -599,7 +599,11 @@ genX(emit_simple_shader_dispatch)(struct anv_simple_shader *state, .SharedLocalMemorySize = intel_compute_slm_encode_size(GFX_VER, prog_data->base.total_shared), .NumberOfBarriers = prog_data->uses_barrier, - }; + }, + }; + + anv_batch_emit(batch, GENX(COMPUTE_WALKER), cw) { + cw.body = body; } #else const uint32_t vfe_curbe_allocation =