diff --git a/src/intel/compiler/brw/brw_compiler.h b/src/intel/compiler/brw/brw_compiler.h index 2598b33d7ab..2e21ef4d479 100644 --- a/src/intel/compiler/brw/brw_compiler.h +++ b/src/intel/compiler/brw/brw_compiler.h @@ -633,7 +633,8 @@ struct brw_fs_prog_data { /** * Push constant location of the remapping offset in the instruction heap - * for Wa_18019110168 in bytes. + * for Wa_18019110168 in bytes (the value read by the compiler is a + * uint16_t). */ unsigned per_primitive_remap_param; diff --git a/src/intel/compiler/brw/brw_shader.h b/src/intel/compiler/brw/brw_shader.h index 9a4cc352000..220fa3b7408 100644 --- a/src/intel/compiler/brw/brw_shader.h +++ b/src/intel/compiler/brw/brw_shader.h @@ -283,7 +283,7 @@ brw_dynamic_per_primitive_remap(const struct brw_fs_prog_data *fs_prog_data) { return byte_offset( brw_uniform_reg( - fs_prog_data->per_primitive_remap_param / REG_SIZE, BRW_TYPE_UD), + fs_prog_data->per_primitive_remap_param / REG_SIZE, BRW_TYPE_UW), fs_prog_data->per_primitive_remap_param % REG_SIZE); } diff --git a/src/intel/vulkan/anv_nir.h b/src/intel/vulkan/anv_nir.h index 1f1f85c613e..62df1e4c173 100644 --- a/src/intel/vulkan/anv_nir.h +++ b/src/intel/vulkan/anv_nir.h @@ -35,8 +35,13 @@ struct vk_pipeline_robustness_state; #define anv_drv_const_offset(field) \ (offsetof(struct anv_push_constants, field)) +#define anv_drv_const_dword(field) \ + (offsetof(struct anv_push_constants, field) / 4) #define anv_drv_const_size(field) \ (sizeof(((struct anv_push_constants *)0)->field)) +#define anv_drv_const_includes_offset(field, offset) \ + ((offset) >= anv_drv_const_offset(field) && \ + (offset) < (anv_drv_const_offset(field) + anv_drv_const_size(field))) #define anv_load_driver_uniform(b, components, field) \ nir_load_push_data_intel(b, components, \ diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c index f4dd3510e8f..fa44071a6e8 100644 --- a/src/intel/vulkan/anv_nir_compute_push_layout.c +++ 
b/src/intel/vulkan/anv_nir_compute_push_layout.c @@ -27,12 +27,13 @@ #include "util/mesa-sha1.h" #include "util/set.h" +#define PUSH_CONSTANTS_DWORDS (sizeof(struct anv_push_constants) / 4) + struct push_data { bool push_ubo_ranges; bool needs_wa_18019110168; bool needs_dyn_tess_config; - unsigned app_start, app_end; - unsigned driver_start, driver_end; + BITSET_DECLARE(push_dwords, PUSH_CONSTANTS_DWORDS); }; static void @@ -52,33 +53,33 @@ adjust_driver_push_values(nir_shader *nir, */ const uint32_t push_reg_mask_start = anv_drv_const_offset(gfx.push_reg_mask[nir->info.stage]); - const uint32_t push_reg_mask_end = - push_reg_mask_start + - anv_drv_const_size(gfx.push_reg_mask[nir->info.stage]); - data->driver_start = MIN2(data->driver_start, push_reg_mask_start); - data->driver_end = MAX2(data->driver_end, push_reg_mask_end); + assert(anv_drv_const_size(gfx.push_reg_mask[nir->info.stage]) <= 4); + BITSET_SET(data->push_dwords, push_reg_mask_start / 4); } if (nir->info.stage == MESA_SHADER_FRAGMENT) { if (push_info->fragment_dynamic) { const uint32_t fs_config_start = anv_drv_const_offset(gfx.fs_config); - const uint32_t fs_config_end = fs_config_start + - anv_drv_const_size(gfx.fs_config); - data->driver_start = MIN2(data->driver_start, fs_config_start); - data->driver_end = MAX2(data->driver_end, fs_config_end); + assert(anv_drv_const_size(gfx.fs_config) <= 4); + BITSET_SET(data->push_dwords, fs_config_start / 4); } if (data->needs_wa_18019110168) { const uint32_t fs_per_prim_remap_start = anv_drv_const_offset(gfx.fs_per_prim_remap_offset); - const uint32_t fs_per_prim_remap_end = - fs_per_prim_remap_start + - anv_drv_const_size(gfx.fs_per_prim_remap_offset); - data->driver_start = MIN2(data->driver_start, fs_per_prim_remap_start); - data->driver_end = MAX2(data->driver_end, fs_per_prim_remap_end); + assert(anv_drv_const_size(gfx.fs_per_prim_remap_offset) <= 4); + BITSET_SET(data->push_dwords, fs_per_prim_remap_start / 4); } } + if (nir->info.stage == 
MESA_SHADER_MESH && + brw_nir_mesh_shader_needs_wa_18019110168(devinfo, nir)) { + const uint32_t mesh_provoking_vertex_start = + anv_drv_const_offset(gfx.mesh_provoking_vertex); + assert(anv_drv_const_size(gfx.mesh_provoking_vertex) <= 4); + BITSET_SET(data->push_dwords, mesh_provoking_vertex_start / 4); + } + data->needs_dyn_tess_config = (nir->info.stage == MESA_SHADER_TESS_CTRL && (container_of(prog_key, struct brw_tcs_prog_key, base)->input_vertices == 0 || @@ -87,10 +88,8 @@ adjust_driver_push_values(nir_shader *nir, push_info->separate_tessellation); if (data->needs_dyn_tess_config) { const uint32_t tess_config_start = anv_drv_const_offset(gfx.tess_config); - const uint32_t tess_config_end = tess_config_start + - anv_drv_const_size(gfx.tess_config); - data->driver_start = MIN2(data->driver_start, tess_config_start); - data->driver_end = MAX2(data->driver_end, tess_config_end); + assert(anv_drv_const_size(gfx.tess_config) <= 4); + BITSET_SET(data->push_dwords, tess_config_start / 4); } } @@ -104,10 +103,8 @@ gather_push_data(nir_shader *nir, struct set *lowered_ubo_instrs) { bool has_const_ubo = false; - struct push_data data = { - .app_start = UINT_MAX, .app_end = 0, - .driver_start = UINT_MAX, .driver_end = 0, - }; + struct push_data data = { 0, }; + BITSET_ZERO(data.push_dwords); nir_foreach_function_impl(impl, nir) { nir_foreach_block(block, impl) { @@ -127,8 +124,8 @@ gather_push_data(nir_shader *nir, case nir_intrinsic_load_push_constant: { unsigned base = nir_intrinsic_base(intrin); unsigned range = nir_intrinsic_range(intrin); - data.app_start = MIN2(data.app_start, base); - data.app_end = MAX2(data.app_end, base + range); + BITSET_SET_RANGE(data.push_dwords, + base / 4, DIV_ROUND_UP(base + range, 4) - 1); break; } @@ -141,8 +138,8 @@ gather_push_data(nir_shader *nir, unsigned base = nir_intrinsic_base(intrin); unsigned range = nir_intrinsic_range(intrin); - data.driver_start = MIN2(data.driver_start, base); - data.driver_end = MAX2(data.driver_end, 
base + range); + BITSET_SET_RANGE(data.push_dwords, + base / 4, DIV_ROUND_UP(base + range, 4) - 1); break; } @@ -260,6 +257,36 @@ lower_ubo_to_push_data_intel(nir_builder *b, return true; } +static bool +lower_to_inline_data_intel(nir_builder *b, + nir_intrinsic_instr *intrin, + const struct lower_to_push_data_intel_state *state) +{ + unsigned base = nir_intrinsic_base(intrin); + + /* Check for push data promoted to inline parameters. Because the push data + * is just packed into the inline data, the order is the same (it's just + * packed), so even if the value is a vec3/4, once you find the first + * matching dword, the rest will follow in the right order. + */ + for (unsigned i = 0; i < state->bind_map->inline_dwords_count; i++) { + if (state->bind_map->inline_dwords[i] == base / 4) { + b->cursor = nir_before_instr(&intrin->instr); + nir_def *data = nir_load_inline_data_intel( + b, + intrin->def.num_components, + intrin->def.bit_size, + intrin->src[0].ssa, + .base = i * 4 + base % 4, + .range = nir_intrinsic_range(intrin)); + nir_def_replace(&intrin->def, data); + return true; + } + } + + return false; +} + static bool lower_to_push_data_intel(nir_builder *b, nir_intrinsic_instr *intrin, @@ -278,24 +305,36 @@ switch (intrin->intrinsic) { case nir_intrinsic_load_push_data_intel: { const unsigned base = nir_intrinsic_base(intrin); - /* For lowered UBOs to push constants, shrink the base by the amount we - * shrunk the driver push constants. - */ - if (_mesa_set_search(state->lowered_ubo_instrs, intrin)) + if (_mesa_set_search(state->lowered_ubo_instrs, intrin)) { + /* For lowered UBOs to push constants, shrink the base by the amount + * we shrunk the driver push constants. 
+ */ nir_intrinsic_set_base(intrin, base - state->reduced_push_ranges); - else - nir_intrinsic_set_base(intrin, base - base_offset); + return true; + } + + if (lower_to_inline_data_intel(b, intrin, state)) + return true; + /* We need to retain this information to update the push constant on * vkCmdDispatch*(). */ - if (b->shader->info.stage == MESA_SHADER_COMPUTE && - base >= anv_drv_const_offset(cs.num_work_groups[0]) && - base < (anv_drv_const_offset(cs.num_work_groups[2]) + 4)) - state->bind_map->binding_mask |= ANV_PIPELINE_BIND_MASK_USES_NUM_WORKGROUP; + if (b->shader->info.stage == MESA_SHADER_COMPUTE) { + if (anv_drv_const_includes_offset(cs.num_workgroups, base)) + state->bind_map->binding_mask |= ANV_PIPELINE_BIND_MASK_NUM_WORKGROUP; + else if (anv_drv_const_includes_offset(cs.base_workgroup, base)) + state->bind_map->binding_mask |= ANV_PIPELINE_BIND_MASK_BASE_WORKGROUP; + else if (anv_drv_const_includes_offset(cs.unaligned_invocations_x, base)) + state->bind_map->binding_mask |= ANV_PIPELINE_BIND_MASK_UNALIGNED_INV_X; + } + nir_intrinsic_set_base(intrin, base - base_offset); return true; } case nir_intrinsic_load_push_constant: { + if (lower_to_inline_data_intel(b, intrin, state)) + return true; + b->cursor = nir_before_instr(&intrin->instr); nir_def *data = nir_load_push_data_intel( b, @@ -314,9 +353,17 @@ lower_to_push_data_intel(nir_builder *b, } static struct anv_push_range -compute_final_push_range(const struct intel_device_info *devinfo, - const struct push_data *data) +compute_final_push_range(const nir_shader *nir, + const struct intel_device_info *devinfo, + const struct push_data *data, + struct anv_pipeline_bind_map *map) { + if (BITSET_IS_EMPTY(data->push_dwords)) { + return (struct anv_push_range) { + .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS, + }; + } + /* Align push_start down to a 32B (for 3DSTATE_CONSTANT) and make it no * larger than push_end (no push constants is indicated by push_start = * UINT_MAX). 
@@ -343,23 +390,53 @@ compute_final_push_range(const struct intel_device_info *devinfo, * (unlike all Gfx stages) and so we can bound+align the allocation there * (see anv_cmd_buffer_cs_push_constants). */ - unsigned push_start = UINT32_MAX; + const bool has_inline_param = + devinfo->verx10 >= 125 && + (nir->info.stage == MESA_SHADER_TASK || + nir->info.stage == MESA_SHADER_MESH || + nir->info.stage == MESA_SHADER_COMPUTE); - if (data->app_end != 0) - push_start = MIN2(push_start, data->app_start); - if (data->driver_end != 0) - push_start = MIN2(push_start, data->driver_start); + map->inline_dwords_count = 0; + + /* Can we fit all the push data in the inline parameters? */ + if (has_inline_param && BITSET_COUNT(data->push_dwords) < 8) { + unsigned i; + map->inline_dwords_count = 0; + BITSET_FOREACH_SET(i, data->push_dwords, PUSH_CONSTANTS_DWORDS) + map->inline_dwords[map->inline_dwords_count++] = i; - if (push_start == UINT32_MAX) { return (struct anv_push_range) { .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS, }; } + unsigned push_start = (BITSET_FFS(data->push_dwords) - 1) * 4; + unsigned push_end = BITSET_LAST_BIT(data->push_dwords) * 4; + + if (has_inline_param) { + /* Reserve the first 2 dwords for the push constant address so the + * backend can load the data. + */ + map->inline_dwords[map->inline_dwords_count++] = ANV_INLINE_DWORD_PUSH_ADDRESS_LDW; + map->inline_dwords[map->inline_dwords_count++] = ANV_INLINE_DWORD_PUSH_ADDRESS_UDW; + + /* Can we fit all the driver data in the inline parameters? 
*/ + if ((BITSET_COUNT(data->push_dwords) - + BITSET_PREFIX_SUM(data->push_dwords, MAX_PUSH_CONSTANTS_SIZE / 4)) <= 6) { + unsigned i; + BITSET_FOREACH_SET(i, data->push_dwords, PUSH_CONSTANTS_DWORDS) { + /* Iterate application push constants (not driver values) */ + if (i >= (MAX_PUSH_CONSTANTS_SIZE / 4)) + map->inline_dwords[map->inline_dwords_count++] = i; + } + + push_end = BITSET_LAST_BIT_BEFORE(data->push_dwords, MAX_PUSH_CONSTANTS_SIZE / 4) * 4; + } + } + push_start = ROUND_DOWN_TO(push_start, 32); - const unsigned push_size = align( - MAX2(data->app_end, data->driver_end) - push_start, devinfo->grf_size); + const unsigned push_size = align(push_end - push_start, devinfo->grf_size); return (struct anv_push_range) { .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS, @@ -386,7 +463,7 @@ anv_nir_compute_push_layout(nir_shader *nir, gather_push_data(nir, robust_flags, devinfo, push_info, prog_key, map, NULL); struct anv_push_range push_constant_range = - compute_final_push_range(devinfo, &data); + compute_final_push_range(nir, devinfo, &data, map); /* When platforms support Mesh and the fragment shader is not fully linked * to the previous shader, payload format can change if the preceding @@ -499,7 +576,7 @@ anv_nir_compute_push_layout(nir_shader *nir, /* Update the ranges */ struct anv_push_range shrinked_push_constant_range = - compute_final_push_range(devinfo, &data); + compute_final_push_range(nir, devinfo, &data, map); assert(shrinked_push_constant_range.length <= push_constant_range.length); if (shrinked_push_constant_range.length > 0) { @@ -588,8 +665,7 @@ anv_nir_compute_push_layout(nir_shader *nir, case MESA_SHADER_COMPUTE: { const int subgroup_id_index = - data.driver_end == (anv_drv_const_offset(cs.subgroup_id) + - anv_drv_const_size(cs.subgroup_id)) ? + BITSET_TEST(data.push_dwords, anv_drv_const_offset(cs.subgroup_id) / 4) ? 
(anv_drv_const_offset(cs.subgroup_id) - push_start) / 4 : -1; struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); brw_cs_fill_push_const_info(devinfo, cs_prog_data, subgroup_id_index); diff --git a/src/intel/vulkan/anv_nir_lower_driver_values.c b/src/intel/vulkan/anv_nir_lower_driver_values.c index e0cbec53259..0e897569000 100644 --- a/src/intel/vulkan/anv_nir_lower_driver_values.c +++ b/src/intel/vulkan/anv_nir_lower_driver_values.c @@ -47,7 +47,7 @@ lower_base_workgroup_id(nir_builder *b, nir_intrinsic_instr *intrin) b->cursor = nir_before_instr(&intrin->instr); nir_def *base_workgroup_id = - anv_load_driver_uniform(b, 3, cs.base_work_group_id[0]); + anv_load_driver_uniform(b, 3, cs.base_workgroup[0]); nir_def_replace(&intrin->def, base_workgroup_id); return true; @@ -105,22 +105,9 @@ lower_num_workgroups(nir_builder *b, nir_intrinsic_instr *intrin, void *data) if (mesa_shader_stage_is_mesh(b->shader->info.stage)) return false; - const struct anv_physical_device *pdevice = data; - b->cursor = nir_before_instr(&intrin->instr); - nir_def *num_workgroups; - /* On Gfx12.5+ we use the inline register to push the values, on prior - * generation we use push constants. 
- */ - if (pdevice->info.verx10 >= 125) { - num_workgroups = - nir_load_inline_data_intel( - b, 3, 32, nir_imm_int(b, 0), - .base = ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET); - } else { - num_workgroups = - anv_load_driver_uniform(b, 3, cs.num_work_groups[0]); - } + nir_def *num_workgroups = + anv_load_driver_uniform(b, 3, cs.num_workgroups[0]); nir_def *num_workgroups_indirect; nir_push_if(b, nir_ieq_imm(b, nir_channel(b, num_workgroups, 0), UINT32_MAX)); diff --git a/src/intel/vulkan/anv_nir_lower_unaligned_dispatch.c b/src/intel/vulkan/anv_nir_lower_unaligned_dispatch.c index cf0c8efe22c..a838dfd3e59 100644 --- a/src/intel/vulkan/anv_nir_lower_unaligned_dispatch.c +++ b/src/intel/vulkan/anv_nir_lower_unaligned_dispatch.c @@ -15,8 +15,7 @@ anv_nir_lower_unaligned_dispatch(nir_shader *shader) nir_def *global_idx = nir_channel(&b, nir_load_global_invocation_id(&b, 32), 0); nir_def *max_unaligned_invocations_x = - nir_load_inline_data_intel(&b, 1, 32, nir_imm_int(&b, 0), - .base = ANV_INLINE_PARAM_UNALIGNED_INVOCATIONS_X_OFFSET); + anv_load_driver_uniform(&b, 1, cs.unaligned_invocations_x); nir_push_if(&b, nir_uge(&b, global_idx, max_unaligned_invocations_x)); { diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h index d1f1a6adea8..b0e40fe2fdc 100644 --- a/src/intel/vulkan/anv_private.h +++ b/src/intel/vulkan/anv_private.h @@ -253,9 +253,7 @@ get_max_vbs(const struct intel_device_info *devinfo) { /* Defines where various values are defined in the inline parameter register. */ #define ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET (0) -#define ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET (8) #define ANV_INLINE_PARAM_MESH_PROVOKING_VERTEX (8) -#define ANV_INLINE_PARAM_UNALIGNED_INVOCATIONS_X_OFFSET (20) /* RENDER_SURFACE_STATE is a bit smaller (48b) but since it is aligned to 64 * and we can't put anything else there we use 64b. 
@@ -1140,19 +1138,25 @@ struct anv_push_range { }; enum anv_pipeline_bind_mask { - ANV_PIPELINE_BIND_MASK_SET0 = BITFIELD_BIT(0), - ANV_PIPELINE_BIND_MASK_SET1 = BITFIELD_BIT(1), - ANV_PIPELINE_BIND_MASK_SET2 = BITFIELD_BIT(2), - ANV_PIPELINE_BIND_MASK_SET3 = BITFIELD_BIT(3), - ANV_PIPELINE_BIND_MASK_SET4 = BITFIELD_BIT(4), - ANV_PIPELINE_BIND_MASK_SET5 = BITFIELD_BIT(5), - ANV_PIPELINE_BIND_MASK_SET6 = BITFIELD_BIT(6), - ANV_PIPELINE_BIND_MASK_SET7 = BITFIELD_BIT(7), - ANV_PIPELINE_BIND_MASK_USES_NUM_WORKGROUP = BITFIELD_BIT(8), + ANV_PIPELINE_BIND_MASK_SET0 = BITFIELD_BIT(0), + ANV_PIPELINE_BIND_MASK_SET1 = BITFIELD_BIT(1), + ANV_PIPELINE_BIND_MASK_SET2 = BITFIELD_BIT(2), + ANV_PIPELINE_BIND_MASK_SET3 = BITFIELD_BIT(3), + ANV_PIPELINE_BIND_MASK_SET4 = BITFIELD_BIT(4), + ANV_PIPELINE_BIND_MASK_SET5 = BITFIELD_BIT(5), + ANV_PIPELINE_BIND_MASK_SET6 = BITFIELD_BIT(6), + ANV_PIPELINE_BIND_MASK_SET7 = BITFIELD_BIT(7), + ANV_PIPELINE_BIND_MASK_NUM_WORKGROUP = BITFIELD_BIT(8), + ANV_PIPELINE_BIND_MASK_BASE_WORKGROUP = BITFIELD_BIT(9), + ANV_PIPELINE_BIND_MASK_UNALIGNED_INV_X = BITFIELD_BIT(10), }; #define ANV_PIPELINE_BIND_MASK_SET(i) (ANV_PIPELINE_BIND_MASK_SET0 << i) +#define ANV_INLINE_DWORD_PUSH_ADDRESS_LDW (UINT8_MAX - 0) +#define ANV_INLINE_DWORD_PUSH_ADDRESS_UDW (UINT8_MAX - 1) +#define ANV_INLINE_DWORD_MESH_PROVOKING_VERTEX (UINT8_MAX - 2) + struct anv_pipeline_bind_map { unsigned char surface_sha1[SHA1_DIGEST_LENGTH]; unsigned char sampler_sha1[SHA1_DIGEST_LENGTH]; @@ -1167,6 +1171,11 @@ struct anv_pipeline_bind_map { uint8_t sampler_count; uint16_t embedded_sampler_count; + /* Dwords promoted from push constants (each element is a dword index in + * anv_push_constants (we can index up to 1024 bytes). 
+ */ + uint8_t promoted_push_dwords[4]; + struct anv_pipeline_binding * surface_to_descriptor; struct anv_pipeline_binding * sampler_to_descriptor; struct anv_pipeline_embedded_sampler_binding* embedded_sampler_to_binding; @@ -1174,6 +1183,15 @@ struct anv_pipeline_bind_map { struct anv_push_range push_ranges[4]; + /* Number of valid elements in inline_dwords[] */ + uint8_t inline_dwords_count; + + /* Dwords promoted from push constants (each element is a dword index in + * anv_push_constants, we can index up to 1024 bytes minus a few values + * reserved, see ANV_INLINE_PARAM_* above) to inline data parameters. + */ + uint8_t inline_dwords[8]; + /* Bitfield of sets for which the surfaces are accessed */ uint8_t used_surface_sets; @@ -4315,7 +4333,9 @@ struct anv_push_constants { /** Robust access pushed registers. */ uint8_t push_reg_mask[MESA_SHADER_STAGES][4]; - uint32_t fs_per_prim_remap_offset; + /** Wa_18019110168 */ + uint16_t mesh_provoking_vertex; + uint16_t fs_per_prim_remap_offset; } gfx; struct { @@ -4323,10 +4343,12 @@ struct anv_push_constants { * * Used for vkCmdDispatchBase. 
*/ - uint32_t base_work_group_id[3]; + uint32_t base_workgroup[3]; /** gl_NumWorkgroups */ - uint32_t num_work_groups[3]; + uint32_t num_workgroups[3]; + + uint32_t unaligned_invocations_x; /** Subgroup ID * diff --git a/src/intel/vulkan/anv_shader.c b/src/intel/vulkan/anv_shader.c index cec1eeb30f0..28a004acd6b 100644 --- a/src/intel/vulkan/anv_shader.c +++ b/src/intel/vulkan/anv_shader.c @@ -85,7 +85,11 @@ anv_shader_deserialize(struct vk_device *vk_device, sizeof(*data.bind_map.embedded_sampler_to_binding)); blob_copy_bytes(blob, data.bind_map.input_attachments, sizeof(data.bind_map.input_attachments)); - blob_copy_bytes(blob, data.bind_map.push_ranges, sizeof(data.bind_map.push_ranges)); + blob_copy_bytes(blob, data.bind_map.push_ranges, + sizeof(data.bind_map.push_ranges)); + data.bind_map.inline_dwords_count = blob_read_uint8(blob); + blob_copy_bytes(blob, data.bind_map.inline_dwords, + data.bind_map.inline_dwords_count); data.bind_map.used_surface_sets = blob_read_uint8(blob); data.bind_map.used_sampler_sets = blob_read_uint8(blob); data.bind_map.pushed_sets = blob_read_uint8(blob); @@ -167,6 +171,9 @@ anv_shader_serialize(struct vk_device *device, sizeof(shader->bind_map.input_attachments)); blob_write_bytes(blob, shader->bind_map.push_ranges, sizeof(shader->bind_map.push_ranges)); + blob_write_uint8(blob, shader->bind_map.inline_dwords_count); + blob_write_bytes(blob, shader->bind_map.inline_dwords, + shader->bind_map.inline_dwords_count); blob_write_uint8(blob, shader->bind_map.used_surface_sets); blob_write_uint8(blob, shader->bind_map.used_sampler_sets); blob_write_uint8(blob, shader->bind_map.pushed_sets); @@ -263,7 +270,8 @@ write_ir_text(VkPipelineExecutableInternalRepresentationKHR* ir, } static char * -get_shader_bind_map_text(const struct anv_shader *shader) +get_shader_bind_map_text(const struct anv_device *device, + const struct anv_shader *shader) { char *stream_data = NULL; size_t stream_size = 0; @@ -314,7 +322,13 @@ 
get_shader_bind_map_text(const struct anv_shader *shader) fprintf(stream, "\n"); } fprintf(stream, "\n"); + } + if (shader->bind_map.inline_dwords_count > 0) { + fprintf(stream, "Inline promoted dwords: "); + for (unsigned i = 0; i < bind_map->inline_dwords_count; i++) + fprintf(stream, "%hhu, ", bind_map->inline_dwords[i]); + fprintf(stream, "\n"); } fclose(stream); @@ -394,7 +408,7 @@ anv_shader_get_executable_internal_representations( } } - char *bind_map_text = get_shader_bind_map_text(shader); + char *bind_map_text = get_shader_bind_map_text(device, shader); if (bind_map_text != NULL) { vk_outarray_append_typed(VkPipelineExecutableInternalRepresentationKHR, &out, ir) { VK_COPY_STR(ir->name, "Shader push map"); diff --git a/src/intel/vulkan/anv_shader_compile.c b/src/intel/vulkan/anv_shader_compile.c index f176663ba9e..15e0a72789d 100644 --- a/src/intel/vulkan/anv_shader_compile.c +++ b/src/intel/vulkan/anv_shader_compile.c @@ -977,9 +977,20 @@ anv_shader_compile_task(struct anv_device *device, static nir_def * mesh_load_provoking_vertex(nir_builder *b, void *data) { - return nir_load_inline_data_intel( - b, 1, 32, nir_imm_int(b, 0), - .base = ANV_INLINE_PARAM_MESH_PROVOKING_VERTEX); + const struct anv_pipeline_bind_map *bind_map = data; + + for (uint32_t i = 0; i < bind_map->inline_dwords_count; i++) { + if (bind_map->inline_dwords[i] == anv_drv_const_dword(gfx.mesh_provoking_vertex)) { + return nir_load_inline_data_intel( + b, 1, 16, nir_imm_int(b, 0), + .base = i * 4 + anv_drv_const_offset(gfx.mesh_provoking_vertex) % 4); + } + } + + return nir_load_push_data_intel(b, 1, 16, nir_imm_int(b, 0), + .base = anv_drv_const_offset(gfx.mesh_provoking_vertex) - + bind_map->push_ranges[0].start, + .range = anv_drv_const_size(gfx.mesh_provoking_vertex)); } static void @@ -1009,6 +1020,7 @@ anv_shader_compile_mesh(struct anv_device *device, &task_shader_data->prog_data.task.map : NULL, .load_provoking_vertex = mesh_load_provoking_vertex, + .load_provoking_vertex_data = 
(void *)&mesh_shader_data->bind_map, }; mesh_shader_data->code = (void *)brw_compile_mesh(compiler, ¶ms); diff --git a/src/intel/vulkan/genX_cmd_compute.c b/src/intel/vulkan/genX_cmd_compute.c index 1c9625fab23..f987dce0db7 100644 --- a/src/intel/vulkan/genX_cmd_compute.c +++ b/src/intel/vulkan/genX_cmd_compute.c @@ -26,6 +26,7 @@ #include "anv_private.h" #include "anv_measure.h" +#include "anv_nir.h" #include "common/intel_common.h" #include "common/intel_compute_slm.h" @@ -262,62 +263,60 @@ genX(cmd_buffer_flush_compute_state)(struct anv_cmd_buffer *cmd_buffer) } static void -anv_cmd_buffer_push_workgroups(struct anv_cmd_buffer *cmd_buffer, - const struct anv_pipeline_bind_map *bind_map, - uint32_t baseGroupX, - uint32_t baseGroupY, - uint32_t baseGroupZ, - uint32_t groupCountX, - uint32_t groupCountY, - uint32_t groupCountZ, - struct anv_address indirect_group) +anv_cmd_buffer_push_driver_values(struct anv_cmd_buffer *cmd_buffer, + const struct anv_pipeline_bind_map *bind_map, + uint32_t baseGroupX, + uint32_t baseGroupY, + uint32_t baseGroupZ, + uint32_t groupCountX, + uint32_t groupCountY, + uint32_t groupCountZ, + struct anv_address indirect_group, + uint32_t unaligned_x_offset) { if (anv_batch_has_error(&cmd_buffer->batch)) return; +#define UPDATE_PUSH(field, value) do { \ + if (field != value) { \ + field = value; \ + updated = true; \ + } \ + } while(0) + struct anv_push_constants *push = &cmd_buffer->state.compute.base.push_constants; bool updated = false; - if (push->cs.base_work_group_id[0] != baseGroupX || - push->cs.base_work_group_id[1] != baseGroupY || - push->cs.base_work_group_id[2] != baseGroupZ) { - push->cs.base_work_group_id[0] = baseGroupX; - push->cs.base_work_group_id[1] = baseGroupY; - push->cs.base_work_group_id[2] = baseGroupZ; - updated = true; + if (bind_map->binding_mask & ANV_PIPELINE_BIND_MASK_BASE_WORKGROUP) { + UPDATE_PUSH(push->cs.base_workgroup[0], baseGroupX); + UPDATE_PUSH(push->cs.base_workgroup[1], baseGroupY); + 
UPDATE_PUSH(push->cs.base_workgroup[2], baseGroupZ); } - /* On Gfx12.5+ this value goes into the inline parameter register */ - if (GFX_VERx10 < 125 && - (bind_map->binding_mask & ANV_PIPELINE_BIND_MASK_USES_NUM_WORKGROUP)) { + if (bind_map->binding_mask & ANV_PIPELINE_BIND_MASK_NUM_WORKGROUP) { if (anv_address_is_null(indirect_group)) { - if (push->cs.num_work_groups[0] != groupCountX || - push->cs.num_work_groups[1] != groupCountY || - push->cs.num_work_groups[2] != groupCountZ) { - push->cs.num_work_groups[0] = groupCountX; - push->cs.num_work_groups[1] = groupCountY; - push->cs.num_work_groups[2] = groupCountZ; - updated = true; - } + UPDATE_PUSH(push->cs.num_workgroups[0], groupCountX); + UPDATE_PUSH(push->cs.num_workgroups[1], groupCountY); + UPDATE_PUSH(push->cs.num_workgroups[2], groupCountZ); } else { uint64_t addr64 = anv_address_physical(indirect_group); uint32_t lower_addr32 = addr64 & 0xffffffff; uint32_t upper_addr32 = addr64 >> 32; - if (push->cs.num_work_groups[0] != UINT32_MAX || - push->cs.num_work_groups[1] != lower_addr32 || - push->cs.num_work_groups[2] != upper_addr32) { - push->cs.num_work_groups[0] = UINT32_MAX; - push->cs.num_work_groups[1] = lower_addr32; - push->cs.num_work_groups[2] = upper_addr32; - updated = true; - } + UPDATE_PUSH(push->cs.num_workgroups[0], UINT32_MAX); + UPDATE_PUSH(push->cs.num_workgroups[1], lower_addr32); + UPDATE_PUSH(push->cs.num_workgroups[2], upper_addr32); } } + if (bind_map->binding_mask & ANV_PIPELINE_BIND_MASK_UNALIGNED_INV_X) + UPDATE_PUSH(push->cs.unaligned_invocations_x, unaligned_x_offset); + if (updated) { cmd_buffer->state.push_constants_dirty |= VK_SHADER_STAGE_COMPUTE_BIT; cmd_buffer->state.compute.base.push_constants_data_dirty = true; } + +#undef UPDATE_PUSH } #define GPGPU_DISPATCHDIMX 0x2500 @@ -435,6 +434,48 @@ compute_update_async_threads_limit(struct anv_cmd_buffer *cmd_buffer, } } +static inline uint32_t +fill_inline_param(uint8_t param_value, + const uint32_t *push_data, + uint64_t 
push_addr64, + uint32_t base_wg[3], + uint32_t num_wg[3], + uint32_t unaligned_x_offset) +{ + switch (param_value) { + case ANV_INLINE_DWORD_PUSH_ADDRESS_LDW: return push_addr64 & 0xffffffff; + case ANV_INLINE_DWORD_PUSH_ADDRESS_UDW: return push_addr64 >> 32; + case anv_drv_const_dword(cs.num_workgroups[0]): return num_wg[0]; + case anv_drv_const_dword(cs.num_workgroups[1]): return num_wg[1]; + case anv_drv_const_dword(cs.num_workgroups[2]): return num_wg[2]; + case anv_drv_const_dword(cs.base_workgroup[0]): return base_wg[0]; + case anv_drv_const_dword(cs.base_workgroup[1]): return base_wg[1]; + case anv_drv_const_dword(cs.base_workgroup[2]): return base_wg[2]; + case anv_drv_const_dword(cs.unaligned_invocations_x): return unaligned_x_offset; + default: return push_data[param_value]; + } +} + +static inline void +fill_inline_params(struct GENX(COMPUTE_WALKER_BODY) *body, + const struct anv_cmd_compute_state *comp_state, + uint64_t push_addr64, + uint32_t base_wg[3], + uint32_t num_wg[3], + uint32_t unaligned_x_offset) +{ + const uint32_t *push_data = + (const uint32_t *)&comp_state->base.push_constants; + const struct anv_pipeline_bind_map *bind_map = + &comp_state->shader->bind_map; + + for (uint32_t i = 0; i < bind_map->inline_dwords_count; i++) { + body->InlineData[i] = fill_inline_param( + bind_map->inline_dwords[i], push_data, push_addr64, + base_wg, num_wg, unaligned_x_offset); + } +} + static inline void emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer, const struct brw_cs_prog_data *prog_data, @@ -457,6 +498,23 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer, compute_update_async_threads_limit(cmd_buffer, prog_data, &dispatch); + struct GENX(COMPUTE_WALKER_BODY) body = { + .InterfaceDescriptor = get_interface_descriptor_data_tables(cmd_buffer), + .ExecutionMask = dispatch.right_mask, + .PostSync = { + .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), + }, + }; + + uint32_t num_workgroup_data[3] = { + UINT32_MAX, + 
indirect_addr64 & 0xffffffff, + indirect_addr64 >> 32, + }; + fill_inline_params(&body, comp_state, push_addr64, + (uint32_t[]) {0, 0, 0}, + num_workgroup_data, 0); + cmd_buffer->state.last_indirect_dispatch = anv_batch_emitn_merge_at( &cmd_buffer->batch, @@ -466,20 +524,7 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer, GENX(EXECUTE_INDIRECT_DISPATCH), .PredicateEnable = predicate, .MaxCount = 1, - .body = { - .InterfaceDescriptor = get_interface_descriptor_data_tables(cmd_buffer), - .ExecutionMask = dispatch.right_mask, - .InlineData = { - [ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 0] = push_addr64 & 0xffffffff, - [ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 1] = push_addr64 >> 32, - [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = UINT32_MAX, - [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = indirect_addr64 & 0xffffffff, - [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = indirect_addr64 >> 32, - }, - .PostSync = { - .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), - }, - }, + .body = body, .ArgumentBufferStartAddress = indirect_addr, .MOCS = anv_mocs(cmd_buffer->device, indirect_addr.bo, 0), @@ -488,13 +533,14 @@ emit_indirect_compute_walker(struct anv_cmd_buffer *cmd_buffer, genX(cmd_buffer_post_dispatch_wa)(cmd_buffer); } + + static inline void emit_compute_walker(struct anv_cmd_buffer *cmd_buffer, struct anv_address indirect_addr, const struct brw_cs_prog_data *prog_data, struct intel_cs_dispatch_info dispatch, - uint32_t groupCountX, uint32_t groupCountY, - uint32_t groupCountZ, + uint32_t base_wg[3], uint32_t num_wg[3], uint32_t unaligned_invocations_x) { const struct anv_cmd_compute_state *comp_state = &cmd_buffer->state.compute; @@ -509,9 +555,9 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer, num_workgroup_data[1] = indirect_addr64 & 0xffffffff; num_workgroup_data[2] = indirect_addr64 >> 32; } else { - num_workgroup_data[0] = groupCountX; - num_workgroup_data[1] = groupCountY; - num_workgroup_data[2] = groupCountZ; + 
num_workgroup_data[0] = num_wg[0]; + num_workgroup_data[1] = num_wg[1]; + num_workgroup_data[2] = num_wg[2]; } uint64_t push_addr64 = anv_address_physical( @@ -520,24 +566,19 @@ emit_compute_walker(struct anv_cmd_buffer *cmd_buffer, struct GENX(COMPUTE_WALKER_BODY) body = { .InterfaceDescriptor = get_interface_descriptor_data_tables(cmd_buffer), - .ThreadGroupIDXDimension = groupCountX, - .ThreadGroupIDYDimension = groupCountY, - .ThreadGroupIDZDimension = groupCountZ, + .ThreadGroupIDXDimension = num_wg[0], + .ThreadGroupIDYDimension = num_wg[1], + .ThreadGroupIDZDimension = num_wg[2], .ExecutionMask = dispatch.right_mask, - .InlineData = { - [ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 0] = push_addr64 & 0xffffffff, - [ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 1] = push_addr64 >> 32, - [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 0] = num_workgroup_data[0], - [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 1] = num_workgroup_data[1], - [ANV_INLINE_PARAM_NUM_WORKGROUPS_OFFSET / 4 + 2] = num_workgroup_data[2], - [ANV_INLINE_PARAM_UNALIGNED_INVOCATIONS_X_OFFSET / 4 + 0] = - unaligned_invocations_x, - }, .PostSync = { .MOCS = anv_mocs(cmd_buffer->device, NULL, 0), }, }; + fill_inline_params(&body, comp_state, push_addr64, + base_wg, num_workgroup_data, + unaligned_invocations_x); + cmd_buffer->state.last_compute_walker = anv_batch_emitn_merge_at( &cmd_buffer->batch, @@ -562,8 +603,7 @@ static inline void emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer, bool indirect, const struct brw_cs_prog_data *prog_data, - uint32_t groupCountX, uint32_t groupCountY, - uint32_t groupCountZ) + uint32_t num_wg[3]) { const bool predicate = cmd_buffer->state.conditional_render_enabled; @@ -578,9 +618,9 @@ emit_gpgpu_walker(struct anv_cmd_buffer *cmd_buffer, ggw.ThreadDepthCounterMaximum = 0; ggw.ThreadHeightCounterMaximum = 0; ggw.ThreadWidthCounterMaximum = dispatch.threads - 1; - ggw.ThreadGroupIDXDimension = groupCountX; - ggw.ThreadGroupIDYDimension = groupCountY; - 
ggw.ThreadGroupIDZDimension = groupCountZ; + ggw.ThreadGroupIDXDimension = num_wg[0]; + ggw.ThreadGroupIDYDimension = num_wg[1]; + ggw.ThreadGroupIDZDimension = num_wg[2]; ggw.RightExecutionMask = dispatch.right_mask; ggw.BottomExecutionMask = 0xffffffff; } @@ -595,7 +635,7 @@ emit_cs_walker(struct anv_cmd_buffer *cmd_buffer, const struct brw_cs_prog_data *prog_data, struct intel_cs_dispatch_info dispatch, struct anv_address indirect_addr, - uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ, + uint32_t base_wg[3], uint32_t num_wg[3], bool is_unaligned_size_x, uint32_t unaligned_invocations_x) { struct anv_device *device = cmd_buffer->device; @@ -622,17 +662,17 @@ emit_cs_walker(struct anv_cmd_buffer *cmd_buffer, } #endif - if (is_indirect) + if (is_indirect) { compute_load_indirect_params(cmd_buffer, indirect_addr, - is_unaligned_size_x); + is_unaligned_size_x); + } #if GFX_VERx10 >= 125 emit_compute_walker(cmd_buffer, indirect_addr, prog_data, - dispatch, groupCountX, groupCountY, groupCountZ, + dispatch, base_wg, num_wg, unaligned_invocations_x); #else - emit_gpgpu_walker(cmd_buffer, is_indirect, prog_data, - groupCountX, groupCountY, groupCountZ); + emit_gpgpu_walker(cmd_buffer, is_indirect, prog_data, num_wg); #endif } @@ -655,10 +695,10 @@ void genX(CmdDispatchBase)( if (anv_batch_has_error(&cmd_buffer->batch)) return; - anv_cmd_buffer_push_workgroups(cmd_buffer, bind_map, - baseGroupX, baseGroupY, baseGroupZ, - groupCountX, groupCountY, groupCountZ, - ANV_NULL_ADDRESS); + anv_cmd_buffer_push_driver_values(cmd_buffer, bind_map, + baseGroupX, baseGroupY, baseGroupZ, + groupCountX, groupCountY, groupCountZ, + ANV_NULL_ADDRESS, 0); anv_measure_snapshot(cmd_buffer, INTEL_SNAPSHOT_COMPUTE, @@ -678,7 +718,8 @@ void genX(CmdDispatchBase)( emit_cs_walker(cmd_buffer, prog_data, dispatch, ANV_NULL_ADDRESS /* no indirect data */, - groupCountX, groupCountY, groupCountZ, + (uint32_t[]){ baseGroupX, baseGroupY, baseGroupZ }, + (uint32_t[]){ groupCountX, 
groupCountY, groupCountZ }, false, 0); genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false); @@ -721,8 +762,10 @@ genX(cmd_dispatch_unaligned)( struct intel_cs_dispatch_info dispatch = brw_cs_get_dispatch_info(cmd_buffer->device->info, prog_data, NULL); - anv_cmd_buffer_push_workgroups(cmd_buffer, bind_map, 0, 0, 0, groupCountX, - groupCountY, groupCountZ, ANV_NULL_ADDRESS); + anv_cmd_buffer_push_driver_values(cmd_buffer, bind_map, 0, 0, 0, + groupCountX, groupCountY, groupCountZ, + ANV_NULL_ADDRESS, + invocations_x); /* RT shaders have Y and Z local size set to 1 always. */ assert(prog_data->local_size[1] == 1 && prog_data->local_size[2] == 1); @@ -739,7 +782,7 @@ genX(cmd_dispatch_unaligned)( trace_intel_begin_compute(&cmd_buffer->trace); assert((bind_map->binding_mask & - ANV_PIPELINE_BIND_MASK_USES_NUM_WORKGROUP) == 0); + ANV_PIPELINE_BIND_MASK_NUM_WORKGROUP) == 0); genX(cmd_buffer_flush_compute_state)(cmd_buffer); if (cmd_buffer->state.conditional_render_enabled) genX(cmd_emit_conditional_render_predicate)(cmd_buffer); @@ -748,7 +791,8 @@ genX(cmd_dispatch_unaligned)( emit_cs_walker(cmd_buffer, prog_data, dispatch, ANV_NULL_ADDRESS /* no indirect data */, - groupCountX, groupCountY, groupCountZ, + (uint32_t[]) { 0, 0, 0 }, + (uint32_t[]) { groupCountX, groupCountY, groupCountZ }, false, invocations_x); genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false); @@ -777,8 +821,9 @@ genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer, if (anv_batch_has_error(&cmd_buffer->batch)) return; - anv_cmd_buffer_push_workgroups(cmd_buffer, bind_map, - 0, 0, 0, 0, 0, 0, indirect_addr); + anv_cmd_buffer_push_driver_values(cmd_buffer, bind_map, + 0, 0, 0, 0, 0, 0, + indirect_addr, 0); anv_measure_snapshot(cmd_buffer, INTEL_SNAPSHOT_COMPUTE, @@ -795,7 +840,9 @@ genX(cmd_buffer_dispatch_indirect)(struct anv_cmd_buffer *cmd_buffer, genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, true); emit_cs_walker(cmd_buffer, 
prog_data, dispatch, indirect_addr, - 0, 0, 0, is_unaligned_size_x, 0); + (uint32_t[]){0, 0, 0}, + (uint32_t[]){0, 0, 0}, + is_unaligned_size_x, 0); genX(emit_breakpoint)(&cmd_buffer->batch, cmd_buffer->device, false); diff --git a/src/intel/vulkan/genX_cmd_draw.c b/src/intel/vulkan/genX_cmd_draw.c index a2a3bb1635a..18e33768f19 100644 --- a/src/intel/vulkan/genX_cmd_draw.c +++ b/src/intel/vulkan/genX_cmd_draw.c @@ -26,6 +26,7 @@ #include "anv_private.h" #include "anv_measure.h" +#include "anv_nir.h" #include "genxml/gen_macros.h" #include "genxml/genX_pack.h" @@ -604,6 +605,36 @@ get_mesh_task_push_addr64(struct anv_cmd_buffer *cmd_buffer, bind_map->push_ranges[0].start * 32)); } +static inline void +fill_inline_params(uint32_t *inline_data, + const struct anv_pipeline_bind_map *bind_map, + struct anv_cmd_graphics_state *gfx, + uint64_t push_addr64) +{ + const uint32_t *push_data = (const uint32_t *) &gfx->base.push_constants; + + for (uint32_t i = 0; i < bind_map->inline_dwords_count; i++) { + switch (bind_map->inline_dwords[i]) { + case ANV_INLINE_DWORD_PUSH_ADDRESS_LDW: + inline_data[i] = push_addr64 & 0xffffffff; + break; + case ANV_INLINE_DWORD_PUSH_ADDRESS_UDW: + inline_data[i] = push_addr64 >> 32; + break; + case anv_drv_const_dword(gfx.mesh_provoking_vertex): { + const struct brw_mesh_prog_data *mesh_prog_data = get_gfx_mesh_prog_data(gfx); + inline_data[i] = gfx->dyn_state.mesh_provoking_vertex | + ((gfx->shaders[MESA_SHADER_MESH]->kernel.offset + + mesh_prog_data->wa_18019110168_mapping_offset) >> 16); + break; + } + default: + inline_data[i] = push_data[bind_map->inline_dwords[i]]; + break; + } + } +} + static void cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer, VkShaderStageFlags dirty_stages) @@ -612,25 +643,24 @@ cmd_buffer_flush_mesh_inline_data(struct anv_cmd_buffer *cmd_buffer, if (dirty_stages & VK_SHADER_STAGE_TASK_BIT_EXT && anv_gfx_has_stage(gfx, MESA_SHADER_TASK)) { + const struct anv_pipeline_bind_map *bind_map = + 
&gfx->shaders[MESA_SHADER_TASK]->bind_map; uint64_t push_addr64 = get_mesh_task_push_addr64(cmd_buffer, gfx, MESA_SHADER_TASK); - anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) { - data.InlineData[ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 0] = push_addr64 & 0xffffffff; - data.InlineData[ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 1] = push_addr64 >> 32; - } + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_TASK_SHADER_DATA), data) + fill_inline_params(data.InlineData, bind_map, gfx, push_addr64); } if (dirty_stages & VK_SHADER_STAGE_MESH_BIT_EXT && anv_gfx_has_stage(gfx, MESA_SHADER_MESH)) { + const struct anv_pipeline_bind_map *bind_map = + &gfx->shaders[MESA_SHADER_MESH]->bind_map; uint64_t push_addr64 = get_mesh_task_push_addr64(cmd_buffer, gfx, MESA_SHADER_MESH); - anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) { - data.InlineData[ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 0] = push_addr64 & 0xffffffff; - data.InlineData[ANV_INLINE_PARAM_PUSH_ADDRESS_OFFSET / 4 + 1] = push_addr64 >> 32; - data.InlineData[ANV_INLINE_PARAM_MESH_PROVOKING_VERTEX / 4] = gfx->dyn_state.mesh_provoking_vertex; - } + anv_batch_emit(&cmd_buffer->batch, GENX(3DSTATE_MESH_SHADER_DATA), data) + fill_inline_params(data.InlineData, bind_map, gfx, push_addr64); } } #endif