From f4a0e059703a85a3d595db51da21c7cbd3d8169d Mon Sep 17 00:00:00 2001 From: Lionel Landwerlin Date: Wed, 17 Dec 2025 11:37:22 +0200 Subject: [PATCH] anv/brw/iris: get rid of param array on prog_data Drivers can do all the lowering to push constants to find the only value useful in that array (subgroup_id). Then drivers call into brw_cs_fill_push_const_info() to get the cross/per thread constant layout computed in the prog_data. Signed-off-by: Lionel Landwerlin Reviewed-by: Alyssa Rosenzweig Part-of: --- src/gallium/drivers/iris/iris_disk_cache.c | 9 ---- src/gallium/drivers/iris/iris_program.c | 22 ++++---- src/gallium/drivers/iris/iris_state.c | 43 +++++++++------ src/intel/blorp/blorp_brw.c | 8 ++- src/intel/compiler/brw/brw_compile_cs.cpp | 52 ++----------------- src/intel/compiler/brw/brw_compiler.h | 14 ++--- src/intel/compiler/brw/brw_nir.h | 4 ++ .../brw/brw_nir_lower_cs_intrinsics.c | 33 ++++++++++++ src/intel/compiler/brw/brw_shader.cpp | 18 ------- src/intel/compiler/brw/brw_shader.h | 3 -- src/intel/compiler/brw/brw_thread_payload.cpp | 14 +---- src/intel/vulkan/anv_internal_kernels.c | 3 +- .../vulkan/anv_nir_compute_push_layout.c | 25 +++++---- .../vulkan/anv_nir_lower_driver_values.c | 16 ++++++ src/intel/vulkan/anv_pipeline_cache.c | 3 -- src/intel/vulkan/anv_shader.c | 4 -- src/intel/vulkan/anv_shader_compile.c | 48 ++++++++--------- 17 files changed, 145 insertions(+), 174 deletions(-) diff --git a/src/gallium/drivers/iris/iris_disk_cache.c b/src/gallium/drivers/iris/iris_disk_cache.c index fda45163d81..dda26e54331 100644 --- a/src/gallium/drivers/iris/iris_disk_cache.c +++ b/src/gallium/drivers/iris/iris_disk_cache.c @@ -128,7 +128,6 @@ iris_disk_cache_store(struct disk_cache *cache, union brw_any_prog_data serializable; assert(prog_data_s <= sizeof(serializable)); memcpy(&serializable, shader->brw_prog_data, prog_data_s); - serializable.base.param = NULL; serializable.base.relocs = NULL; blob_write_bytes(&blob, &serializable, prog_data_s); } 
else { @@ -152,8 +151,6 @@ iris_disk_cache_store(struct disk_cache *cache, if (brw) { blob_write_bytes(&blob, brw->relocs, brw->num_relocs * sizeof(struct intel_shader_reloc)); - blob_write_bytes(&blob, brw->param, - brw->nr_params * sizeof(uint32_t)); } else { #ifdef INTEL_USE_ELK blob_write_bytes(&blob, elk->relocs, @@ -265,12 +262,6 @@ iris_disk_cache_retrieve(struct iris_screen *screen, brw->num_relocs * sizeof(struct intel_shader_reloc)); brw->relocs = relocs; } - - brw->param = NULL; - if (brw->nr_params) { - brw->param = ralloc_array(NULL, uint32_t, brw->nr_params); - blob_copy_bytes(&blob, brw->param, brw->nr_params * sizeof(uint32_t)); - } } else { #ifdef INTEL_USE_ELK elk->relocs = NULL; diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c index 5879f4d5af7..31bbefa3163 100644 --- a/src/gallium/drivers/iris/iris_program.c +++ b/src/gallium/drivers/iris/iris_program.c @@ -165,9 +165,8 @@ iris_apply_brw_cs_prog_data(struct iris_compiled_shader *shader, iris->uses_sampler = brw->uses_sampler; iris->prog_mask = brw->prog_mask; - iris->first_param_is_builtin_subgroup_id = - brw->base.nr_params > 0 && - brw->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID; + /* The pushed constants only contain the subgroup_id */ + iris->first_param_is_builtin_subgroup_id = brw->base.nr_params > 0; } static void @@ -294,7 +293,6 @@ iris_apply_brw_prog_data(struct iris_compiled_shader *shader, ralloc_steal(shader, shader->brw_prog_data); ralloc_steal(shader->brw_prog_data, (void *)brw->relocs); - ralloc_steal(shader->brw_prog_data, brw->param); } #ifdef INTEL_USE_ELK @@ -1213,13 +1211,6 @@ iris_setup_uniforms(ASSERTED const struct intel_device_info *devinfo, assert(num_cbufs < PIPE_MAX_CONSTANT_BUFFERS); nir_validate_shader(nir, "after remap"); - /* We don't use params[] but gallium leaves num_uniforms set. We use this - * to detect when cbuf0 exists but we don't need it anymore when we get - * here. 
Instead, zero it out so that the back-end doesn't get confused - * when nr_params * 4 != num_uniforms != nr_params * 4. - */ - nir->num_uniforms = 0; - *out_system_values = system_values; *out_num_system_values = num_system_values; *out_num_cbufs = num_cbufs; @@ -3111,6 +3102,15 @@ iris_compile_cs(struct iris_screen *screen, struct brw_cs_prog_data *brw_prog_data = rzalloc(mem_ctx, struct brw_cs_prog_data); + bool subgroup_id_lowered = false; + NIR_PASS(subgroup_id_lowered, nir, brw_nir_lower_cs_subgroup_id, devinfo, 0); + if (subgroup_id_lowered) { + brw_prog_data->base.nr_params = 1; + brw_cs_fill_push_const_info(devinfo, brw_prog_data, 0); + } else { + brw_cs_fill_push_const_info(devinfo, brw_prog_data, -1); + } + struct brw_compile_cs_params params = { .base = { .mem_ctx = mem_ctx, diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 1a4ba0c6c65..6fc9a359551 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -9410,21 +9410,34 @@ iris_upload_gpgpu_walker(struct iris_context *ice, if ((stage_dirty & IRIS_STAGE_DIRTY_CS) || (GFX_VER == 12 && !batch->contains_draw) || cs_data->local_size[0] == 0 /* Variable local group size */) { - uint32_t curbe_data_offset = 0; - assert(cs_data->push.cross_thread.dwords == 0 && - cs_data->push.per_thread.dwords == 1 && - cs_data->first_param_is_builtin_subgroup_id); - const unsigned push_const_size = - iris_cs_push_const_total_size(shader, dispatch.threads); - uint32_t *curbe_data_map = - stream_state(batch, ice->state.dynamic_uploader, - &ice->state.last_res.cs_thread_ids, - align(push_const_size, 64), 64, - &curbe_data_offset); - assert(curbe_data_map); - memset(curbe_data_map, 0x5a, align(push_const_size, 64)); - iris_fill_cs_push_const_buffer(screen, shader, dispatch.threads, - curbe_data_map); + uint32_t curbe_data_offset, push_const_size; + uint32_t *curbe_data_map; + if (cs_data->push.cross_thread.dwords == 0 && + 
cs_data->push.per_thread.dwords == 0) { + push_const_size = 64; + curbe_data_map = + stream_state(batch, ice->state.dynamic_uploader, + &ice->state.last_res.cs_thread_ids, + align(push_const_size, 64), 64, + &curbe_data_offset); + assert(curbe_data_map); + memset(curbe_data_map, 0x5a, align(push_const_size, 64)); + } else { + assert(cs_data->push.cross_thread.dwords == 0 && + cs_data->push.per_thread.dwords == 1 && + cs_data->first_param_is_builtin_subgroup_id); + push_const_size = + iris_cs_push_const_total_size(shader, dispatch.threads); + curbe_data_map = + stream_state(batch, ice->state.dynamic_uploader, + &ice->state.last_res.cs_thread_ids, + align(push_const_size, 64), 64, + &curbe_data_offset); + assert(curbe_data_map); + memset(curbe_data_map, 0x5a, align(push_const_size, 64)); + iris_fill_cs_push_const_buffer(screen, shader, dispatch.threads, + curbe_data_map); + } iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) { curbe.CURBETotalDataLength = align(push_const_size, 64); diff --git a/src/intel/blorp/blorp_brw.c b/src/intel/blorp/blorp_brw.c index 427ccf3365b..502e23f0ed8 100644 --- a/src/intel/blorp/blorp_brw.c +++ b/src/intel/blorp/blorp_brw.c @@ -28,7 +28,6 @@ blorp_compile_fs_brw(struct blorp_context *blorp, void *mem_ctx, struct brw_wm_prog_data *wm_prog_data = rzalloc(mem_ctx, struct brw_wm_prog_data); wm_prog_data->base.nr_params = 0; - wm_prog_data->base.param = NULL; struct brw_nir_compiler_opts opts = { .softfp64 = blorp->get_fp64_nir ? 
blorp->get_fp64_nir(blorp) : NULL, @@ -147,10 +146,12 @@ blorp_compile_cs_brw(struct blorp_context *blorp, void *mem_ctx, struct brw_cs_prog_data *cs_prog_data = rzalloc(mem_ctx, struct brw_cs_prog_data); cs_prog_data->base.nr_params = nr_params; - cs_prog_data->base.param = rzalloc_array(NULL, uint32_t, nr_params); + brw_cs_fill_push_const_info(compiler->devinfo, cs_prog_data, nr_params); NIR_PASS(_, nir, brw_nir_lower_cs_intrinsics, compiler->devinfo, cs_prog_data); + NIR_PASS(_, nir, brw_nir_lower_cs_subgroup_id, compiler->devinfo, + offsetof(struct blorp_wm_inputs, subgroup_id)); NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_base_workgroup_id, nir_metadata_control_flow, NULL); @@ -170,9 +171,6 @@ blorp_compile_cs_brw(struct blorp_context *blorp, void *mem_ctx, const unsigned *kernel = brw_compile_cs(compiler, &params); - ralloc_free(cs_prog_data->base.param); - cs_prog_data->base.param = NULL; - return (struct blorp_program) { .kernel = kernel, .kernel_size = cs_prog_data->base.program_size, diff --git a/src/intel/compiler/brw/brw_compile_cs.cpp b/src/intel/compiler/brw/brw_compile_cs.cpp index 502fe101067..dbc00dc0ae0 100644 --- a/src/intel/compiler/brw/brw_compile_cs.cpp +++ b/src/intel/compiler/brw/brw_compile_cs.cpp @@ -25,19 +25,15 @@ fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords) block->size = block->regs * 32; } -static void -cs_fill_push_const_info(const struct intel_device_info *devinfo, - struct brw_cs_prog_data *cs_prog_data) +extern "C" void +brw_cs_fill_push_const_info(const struct intel_device_info *devinfo, + struct brw_cs_prog_data *cs_prog_data, + int subgroup_id_index) { const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - int subgroup_id_index = brw_get_subgroup_id_param_index(devinfo, prog_data); - - /* The thread ID should be stored in the last param dword */ - assert(subgroup_id_index == -1 || - subgroup_id_index == (int)prog_data->nr_params - 1); unsigned cross_thread_dwords, 
per_thread_dwords; - if (subgroup_id_index >= 0) { + if (devinfo->verx10 < 125 && subgroup_id_index >= 0) { /* Fill all but the last register with cross-thread payload */ cross_thread_dwords = 8 * (subgroup_id_index / 8); per_thread_dwords = prog_data->nr_params - cross_thread_dwords; @@ -120,41 +116,6 @@ brw_nir_uses_sampler(nir_shader *shader) NULL); } -static inline uint32_t * -brw_stage_prog_data_add_params(struct brw_stage_prog_data *prog_data, - unsigned nr_new_params) -{ - unsigned old_nr_params = prog_data->nr_params; - prog_data->nr_params += nr_new_params; - prog_data->param = reralloc(ralloc_parent(prog_data->param), - prog_data->param, uint32_t, - prog_data->nr_params); - return prog_data->param + old_nr_params; -} - -static void -brw_adjust_uniforms(brw_shader &s) -{ - if (s.devinfo->verx10 >= 125) - return; - - assert(mesa_shader_stage_is_compute(s.stage)); - - if (brw_get_subgroup_id_param_index(s.devinfo, s.prog_data) == -1) { - /* Add uniforms for builtins after regular NIR uniforms. */ - assert(s.uniforms == s.prog_data->nr_params); - - /* Subgroup ID must be the last uniform on the list. This will make - * easier later to split between cross thread and per thread - * uniforms. 
- */ - uint32_t *param = brw_stage_prog_data_add_params(s.prog_data, 1); - *param = BRW_PARAM_BUILTIN_SUBGROUP_ID; - } - - s.uniforms = s.prog_data->nr_params; -} - const unsigned * brw_compile_cs(const struct brw_compiler *compiler, struct brw_compile_cs_params *params) @@ -233,7 +194,6 @@ brw_compile_cs(const struct brw_compiler *compiler, .archiver = params->base.archiver, }; v[simd] = std::make_unique<brw_shader>(&shader_params); - brw_adjust_uniforms(*v[simd]); const bool allow_spilling = simd == 0 || (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1)) || @@ -245,8 +205,6 @@ brw_compile_cs(const struct brw_compiler *compiler, } if (run_cs(*v[simd], allow_spilling)) { - cs_fill_push_const_info(compiler->devinfo, prog_data); - brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers && diff --git a/src/intel/compiler/brw/brw_compiler.h b/src/intel/compiler/brw/brw_compiler.h index eccccde1df8..15307ba252f 100644 --- a/src/intel/compiler/brw/brw_compiler.h +++ b/src/intel/compiler/brw/brw_compiler.h @@ -543,7 +543,6 @@ enum brw_param_builtin { BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_X, BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Y, BRW_PARAM_BUILTIN_BASE_WORK_GROUP_ID_Z, - BRW_PARAM_BUILTIN_SUBGROUP_ID, BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_X, BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Y, BRW_PARAM_BUILTIN_WORK_GROUP_SIZE_Z, @@ -616,14 +615,6 @@ struct brw_stage_prog_data { uint32_t source_hash; - /* 32-bit identifiers for all push/pull parameters. These can be anything - * the driver wishes them to be; the core of the back-end compiler simply - * re-arranges them. The one restriction is that the bottom 2^16 values - * are reserved for builtins defined in the brw_param_builtin enum defined - * above. - */ - uint32_t *param; - /* Whether shader uses atomic operations. 
*/ bool uses_atomic_load_store; }; @@ -1672,6 +1663,11 @@ unsigned brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data, unsigned threads); +void +brw_cs_fill_push_const_info(const struct intel_device_info *devinfo, + struct brw_cs_prog_data *cs_prog_data, + int subgroup_id_index); + void brw_write_shader_relocs(const struct brw_isa_info *isa, void *program, diff --git a/src/intel/compiler/brw/brw_nir.h b/src/intel/compiler/brw/brw_nir.h index cf04102f7fb..d4c2a04e2e9 100644 --- a/src/intel/compiler/brw/brw_nir.h +++ b/src/intel/compiler/brw/brw_nir.h @@ -179,6 +179,10 @@ brw_nir_link_shaders(const struct brw_compiler *compiler, bool brw_nir_lower_cs_intrinsics(nir_shader *nir, const struct intel_device_info *devinfo, struct brw_cs_prog_data *prog_data); +bool brw_nir_lower_cs_subgroup_id(nir_shader *nir, + const struct intel_device_info *devinfo, + unsigned subgroup_id_offset); + bool brw_nir_lower_alpha_to_coverage(nir_shader *shader); bool brw_needs_vertex_attributes_bypass(const nir_shader *shader); void brw_nir_lower_fs_barycentrics(nir_shader *shader); diff --git a/src/intel/compiler/brw/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw/brw_nir_lower_cs_intrinsics.c index b05747652c1..12c7fc7e167 100644 --- a/src/intel/compiler/brw/brw_nir_lower_cs_intrinsics.c +++ b/src/intel/compiler/brw/brw_nir_lower_cs_intrinsics.c @@ -387,3 +387,36 @@ brw_nir_lower_cs_intrinsics(nir_shader *nir, return state.progress; } + +static bool +lower_cs_subgroup_id_instr(nir_builder *b, + nir_intrinsic_instr *intrin, + void *data) +{ + if (intrin->intrinsic != nir_intrinsic_load_subgroup_id) + return false; + + const unsigned *subgroup_id_offset_ptr = data; + + b->cursor = nir_before_instr(&intrin->instr); + nir_def_replace(&intrin->def, + nir_load_uniform( + b, 1, 32, nir_imm_int(b, 0), + .base = *subgroup_id_offset_ptr, + .range = 4)); + + return true; +} + +bool +brw_nir_lower_cs_subgroup_id(nir_shader *nir, + const struct intel_device_info *devinfo, + 
unsigned subgroup_id_offset) +{ + if (devinfo->verx10 >= 125) + return false; + + return nir_shader_intrinsics_pass(nir, lower_cs_subgroup_id_instr, + nir_metadata_control_flow, + &subgroup_id_offset); +} diff --git a/src/intel/compiler/brw/brw_shader.cpp b/src/intel/compiler/brw/brw_shader.cpp index 5ad5aaa42d5..28c99517372 100644 --- a/src/intel/compiler/brw/brw_shader.cpp +++ b/src/intel/compiler/brw/brw_shader.cpp @@ -969,24 +969,6 @@ brw_shader::convert_attr_sources_to_hw_regs(brw_inst *inst) } } -int -brw_get_subgroup_id_param_index(const intel_device_info *devinfo, - const brw_stage_prog_data *prog_data) -{ - if (prog_data->nr_params == 0) - return -1; - - if (devinfo->verx10 >= 125) - return -1; - - /* The local thread id is always the last parameter in the list */ - uint32_t last_param = prog_data->param[prog_data->nr_params - 1]; - if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID) - return prog_data->nr_params - 1; - - return -1; -} - uint32_t brw_fb_write_msg_control(const brw_inst *inst, const struct brw_wm_prog_data *prog_data) diff --git a/src/intel/compiler/brw/brw_shader.h b/src/intel/compiler/brw/brw_shader.h index 3c5e7a4ce3a..a35c9d885f0 100644 --- a/src/intel/compiler/brw/brw_shader.h +++ b/src/intel/compiler/brw/brw_shader.h @@ -302,9 +302,6 @@ uint32_t brw_fb_write_msg_control(const brw_inst *inst, void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data); -int brw_get_subgroup_id_param_index(const intel_device_info *devinfo, - const brw_stage_prog_data *prog_data); - void brw_from_nir(brw_shader *s); void brw_shader_phase_update(brw_shader &s, enum brw_shader_phase phase); diff --git a/src/intel/compiler/brw/brw_thread_payload.cpp b/src/intel/compiler/brw/brw_thread_payload.cpp index 78b45cd64f7..adc6d79cecb 100644 --- a/src/intel/compiler/brw/brw_thread_payload.cpp +++ b/src/intel/compiler/brw/brw_thread_payload.cpp @@ -380,19 +380,9 @@ void brw_cs_thread_payload::load_subgroup_id(const brw_builder &bld, brw_reg &dest) const { - 
auto devinfo = bld.shader->devinfo; + assert(bld.shader->devinfo->verx10 >= 125); dest = retype(dest, BRW_TYPE_UD); - - if (subgroup_id_.file != BAD_FILE) { - assert(devinfo->verx10 >= 125); - bld.AND(dest, subgroup_id_, brw_imm_ud(INTEL_MASK(7, 0))); - } else { - assert(devinfo->verx10 < 125); - assert(mesa_shader_stage_is_compute(bld.shader->stage)); - int index = brw_get_subgroup_id_param_index(devinfo, - bld.shader->prog_data); - bld.MOV(dest, brw_uniform_reg(index, BRW_TYPE_UD)); - } + bld.AND(dest, subgroup_id_, brw_imm_ud(INTEL_MASK(7, 0))); } brw_task_mesh_thread_payload::brw_task_mesh_thread_payload(brw_shader &v) diff --git a/src/intel/vulkan/anv_internal_kernels.c b/src/intel/vulkan/anv_internal_kernels.c index f5389c37ea4..6ebf969b249 100644 --- a/src/intel/vulkan/anv_internal_kernels.c +++ b/src/intel/vulkan/anv_internal_kernels.c @@ -157,7 +157,6 @@ compile_shader(struct anv_device *device, void *temp_ctx = ralloc_context(NULL); prog_data.base.nr_params = nir->num_uniforms / 4; - prog_data.base.param = rzalloc_array(temp_ctx, uint32_t, prog_data.base.nr_params); brw_nir_analyze_ubo_ranges(compiler, nir, prog_data.base.ubo_ranges); @@ -191,6 +190,8 @@ compile_shader(struct anv_device *device, } } } else { + brw_cs_fill_push_const_info(device->info, &prog_data.cs, -1); + struct genisa_stats stats; struct brw_compile_cs_params params = { .base = { diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c index 4c517953704..3b768fbe738 100644 --- a/src/intel/vulkan/anv_nir_compute_push_layout.c +++ b/src/intel/vulkan/anv_nir_compute_push_layout.c @@ -143,18 +143,6 @@ anv_nir_compute_push_layout(nir_shader *nir, push_end = MAX2(push_end, tess_config_end); } - if (nir->info.stage == MESA_SHADER_COMPUTE && devinfo->verx10 < 125) { - /* For compute shaders, we always have to have the subgroup ID. The - * back-end compiler will "helpfully" add it for us in the last push - * constant slot. 
Yes, there is an off-by-one error here but that's - * because the back-end will add it so we want to claim the number of - * push constants one dword less than the full amount including - * gl_SubgroupId. - */ - assert(push_end <= anv_drv_const_offset(cs.subgroup_id)); - push_end = anv_drv_const_offset(cs.subgroup_id); - } - /* Align push_start down to a 32B (for 3DSTATE_CONSTANT) and make it no * larger than push_end (no push constants is indicated by push_start = * UINT_MAX). @@ -188,7 +176,18 @@ anv_nir_compute_push_layout(nir_shader *nir, const unsigned alignment = 4; nir->num_uniforms = align(push_end - push_start, alignment); prog_data->nr_params = nir->num_uniforms / 4; - prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params); + + /* Fill the compute push constant layout (cross/per thread constants) for + * platforms pre Gfx12.5. + */ + if (nir->info.stage == MESA_SHADER_COMPUTE) { + const int subgroup_id_index = + push_end == (anv_drv_const_offset(cs.subgroup_id) + + anv_drv_const_size(cs.subgroup_id)) ? 
+ (anv_drv_const_offset(cs.subgroup_id) - push_start) / 4 : -1; + struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); + brw_cs_fill_push_const_info(devinfo, cs_prog_data, subgroup_id_index); + } const struct anv_push_range push_constant_range = { .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS, diff --git a/src/intel/vulkan/anv_nir_lower_driver_values.c b/src/intel/vulkan/anv_nir_lower_driver_values.c index 7284561eb33..8165f01e1e6 100644 --- a/src/intel/vulkan/anv_nir_lower_driver_values.c +++ b/src/intel/vulkan/anv_nir_lower_driver_values.c @@ -53,6 +53,20 @@ lower_base_workgroup_id(nir_builder *b, nir_intrinsic_instr *intrin) return true; } +static bool +lower_subgroup_id(nir_builder *b, nir_intrinsic_instr *intrin, + const struct anv_physical_device *pdevice) +{ + if (pdevice->info.verx10 >= 125) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + nir_def_replace(&intrin->def, + anv_load_driver_uniform(b, 1, cs.subgroup_id)); + + return true; +} + static bool lower_ray_query_globals(nir_builder *b, nir_intrinsic_instr *intrin) { @@ -72,6 +86,8 @@ lower_driver_values(nir_builder *b, nir_intrinsic_instr *intrin, void *data) return lower_load_constant(b, intrin); case nir_intrinsic_load_base_workgroup_id: return lower_base_workgroup_id(b, intrin); + case nir_intrinsic_load_subgroup_id: + return lower_subgroup_id(b, intrin, data); case nir_intrinsic_load_ray_query_global_intel: return lower_ray_query_globals(b, intrin); default: diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c index 69ea9263d32..33354d88ac9 100644 --- a/src/intel/vulkan/anv_pipeline_cache.c +++ b/src/intel/vulkan/anv_pipeline_cache.c @@ -94,7 +94,6 @@ anv_shader_internal_create(struct anv_device *device, prog_data_size); VK_MULTIALLOC_DECL(&ma, struct intel_shader_reloc, prog_data_relocs, prog_data_in->num_relocs); - VK_MULTIALLOC_DECL(&ma, uint32_t, prog_data_param, prog_data_in->nr_params); VK_MULTIALLOC_DECL(&ma, void, code, 
kernel_size); VK_MULTIALLOC_DECL_SIZE(&ma, nir_xfb_info, xfb_info, @@ -151,7 +150,6 @@ anv_shader_internal_create(struct anv_device *device, typed_memcpy(prog_data_relocs, prog_data_in->relocs, prog_data_in->num_relocs); prog_data->relocs = prog_data_relocs; - prog_data->param = prog_data_param; shader->prog_data = prog_data; shader->prog_data_size = prog_data_size; @@ -210,7 +208,6 @@ anv_shader_internal_serialize(struct vk_pipeline_cache_object *object, assert(shader->prog_data_size <= sizeof(prog_data)); memcpy(&prog_data, shader->prog_data, shader->prog_data_size); prog_data.base.relocs = NULL; - prog_data.base.param = NULL; blob_write_bytes(blob, &prog_data, shader->prog_data_size); blob_write_bytes(blob, shader->prog_data->relocs, diff --git a/src/intel/vulkan/anv_shader.c b/src/intel/vulkan/anv_shader.c index 219cb5a7b4c..36d18a03456 100644 --- a/src/intel/vulkan/anv_shader.c +++ b/src/intel/vulkan/anv_shader.c @@ -111,7 +111,6 @@ anv_shader_serialize(struct vk_device *device, union brw_any_prog_data prog_data; memcpy(&prog_data, shader->prog_data, brw_prog_data_size(vk_shader->stage)); prog_data.base.relocs = NULL; - prog_data.base.param = NULL; blob_write_bytes(blob, &prog_data, brw_prog_data_size(vk_shader->stage)); @@ -584,9 +583,6 @@ anv_shader_create(struct anv_device *device, const uint32_t cmd_data_dwords = anv_genX(device->info, shader_cmd_size)( device, stage); - /* We never need this at runtime */ - shader_data->prog_data.base.param = NULL; - VK_MULTIALLOC(ma); VK_MULTIALLOC_DECL(&ma, struct anv_shader, shader, 1); VK_MULTIALLOC_DECL(&ma, uint32_t, cmd_data, cmd_data_dwords); diff --git a/src/intel/vulkan/anv_shader_compile.c b/src/intel/vulkan/anv_shader_compile.c index 56376b88ae9..63efb6b9c5f 100644 --- a/src/intel/vulkan/anv_shader_compile.c +++ b/src/intel/vulkan/anv_shader_compile.c @@ -1473,8 +1473,6 @@ anv_shader_lower_nir(struct anv_device *device, dynamic_descriptors_offsets, &shader_data->bind_map, &shader_data->push_map, mem_ctx); - 
NIR_PASS(_, nir, anv_nir_lower_driver_values, pdevice); - NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo, anv_nir_ubo_addr_format(pdevice, shader_data->key.base.robust_flags)); NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo, @@ -1545,28 +1543,6 @@ anv_shader_lower_nir(struct anv_device *device, NIR_PASS(_, nir, nir_opt_dce); } - NIR_PASS(_, nir, anv_nir_update_resource_intel_block); - - NIR_PASS(_, nir, anv_nir_compute_push_layout, - pdevice, shader_data->key.base.robust_flags, - &(struct anv_nir_push_layout_info) { - .separate_tessellation = (nir->info.stage == MESA_SHADER_TESS_CTRL && - shader_data->key.tcs.separate_tess_vue_layout) || - (nir->info.stage == MESA_SHADER_TESS_EVAL && - shader_data->key.tes.separate_tess_vue_layout), - .fragment_dynamic = nir->info.stage == MESA_SHADER_FRAGMENT && - brw_wm_prog_key_is_dynamic(&shader_data->key.wm), - .mesh_dynamic = nir->info.stage == MESA_SHADER_FRAGMENT && - shader_data->key.wm.mesh_input == INTEL_SOMETIMES, - }, - &shader_data->key.base, - &shader_data->prog_data.base, - &shader_data->bind_map, &shader_data->push_map, - mem_ctx); - - NIR_PASS(_, nir, anv_nir_lower_resource_intel, pdevice, - shader_data->bind_map.layout_type); - if (mesa_shader_stage_uses_workgroup(nir->info.stage)) { NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared, shared_type_info); @@ -1597,6 +1573,30 @@ anv_shader_lower_nir(struct anv_device *device, &shader_data->prog_data.cs); } + NIR_PASS(_, nir, anv_nir_lower_driver_values, pdevice); + + NIR_PASS(_, nir, anv_nir_update_resource_intel_block); + + NIR_PASS(_, nir, anv_nir_compute_push_layout, + pdevice, shader_data->key.base.robust_flags, + &(struct anv_nir_push_layout_info) { + .separate_tessellation = (nir->info.stage == MESA_SHADER_TESS_CTRL && + shader_data->key.tcs.separate_tess_vue_layout) || + (nir->info.stage == MESA_SHADER_TESS_EVAL && + shader_data->key.tes.separate_tess_vue_layout), + .fragment_dynamic = nir->info.stage == 
MESA_SHADER_FRAGMENT && + brw_wm_prog_key_is_dynamic(&shader_data->key.wm), + .mesh_dynamic = nir->info.stage == MESA_SHADER_FRAGMENT && + shader_data->key.wm.mesh_input == INTEL_SOMETIMES, + }, + &shader_data->key.base, + &shader_data->prog_data.base, + &shader_data->bind_map, &shader_data->push_map, + mem_ctx); + + NIR_PASS(_, nir, anv_nir_lower_resource_intel, pdevice, + shader_data->bind_map.layout_type); + shader_data->push_desc_info.push_set_buffer = anv_nir_loads_push_desc_buffer( nir, set_layouts, set_layout_count, &shader_data->bind_map);