diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c
index f778edddce3..140c896c571 100644
--- a/src/compiler/nir/nir_divergence_analysis.c
+++ b/src/compiler/nir/nir_divergence_analysis.c
@@ -363,6 +363,12 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
       is_divergent = false;
       break;
 
+   case nir_intrinsic_load_push_data_intel:
+      is_divergent =
+         (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM) &&
+         src_divergent(instr->src[0], state);
+      break;
+
    case nir_intrinsic_load_ubo_uniform_block_intel:
    case nir_intrinsic_load_ssbo_uniform_block_intel:
    case nir_intrinsic_load_shared_uniform_block_intel:
diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 962f5a24c0b..43730ca9b14 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -2559,9 +2559,15 @@ system_value("urb_output_handle_intel", 1)
 load("urb_input_handle_indexed_intel", [1], [], [CAN_ELIMINATE, CAN_REORDER])
 
 # Inline register delivery (available on Gfx12.5+ for CS/Mesh/Task stages)
-intrinsic("load_inline_data_intel", [], dest_comp=0,
-          indices=[BASE],
-          flags=[CAN_ELIMINATE, CAN_REORDER])
+load("inline_data_intel", [], [BASE], [CAN_ELIMINATE, CAN_REORDER])
+
+# Load push data
+# src[] = { offset }
+#
+# We use the ACCESS index mostly for ACCESS_NON_UNIFORM, which lets us
+# preserve the semantics of load_push_constant: it is always uniform
+# regardless of the offset source.
+load("push_data_intel", [1], [BASE, RANGE, ACCESS], [CAN_ELIMINATE, CAN_REORDER])
 
 # Dynamic tesselation parameters (see intel_tess_config).
 system_value("tess_config_intel", 1)
diff --git a/src/gallium/drivers/iris/iris_context.h b/src/gallium/drivers/iris/iris_context.h
index 279748c1c75..5c2eaf0078c 100644
--- a/src/gallium/drivers/iris/iris_context.h
+++ b/src/gallium/drivers/iris/iris_context.h
@@ -47,6 +47,7 @@ struct iris_bo;
 struct iris_context;
 struct blorp_batch;
 struct blorp_params;
+struct brw_ubo_range;
 
 #define IRIS_MAX_DRAW_BUFFERS 8
 #define IRIS_MAX_SOL_BINDINGS 64
@@ -696,10 +697,15 @@ struct iris_compiled_shader {
    mesa_shader_stage stage;
 
    /**
-    * Data derived from prog_data.
+    * Data derived from ELK prog_data.
     */
    struct iris_ubo_range ubo_ranges[4];
 
+   /**
+    * Data derived from BRW prog_data.
+ */ + uint16_t push_sizes[4]; + unsigned nr_params; unsigned total_scratch; unsigned total_shared; @@ -1350,7 +1356,8 @@ uint32_t iris_bti_to_group_index(const struct iris_binding_table *bt, enum iris_surface_group group, uint32_t bti); void iris_apply_brw_prog_data(struct iris_compiled_shader *shader, - struct brw_stage_prog_data *prog_data); + struct brw_stage_prog_data *prog_data, + struct brw_ubo_range *ubo_ranges); void iris_apply_elk_prog_data(struct iris_compiled_shader *shader, struct elk_stage_prog_data *prog_data); struct intel_cs_dispatch_info diff --git a/src/gallium/drivers/iris/iris_disk_cache.c b/src/gallium/drivers/iris/iris_disk_cache.c index f0ee2dfd1ff..cdc2876dc9d 100644 --- a/src/gallium/drivers/iris/iris_disk_cache.c +++ b/src/gallium/drivers/iris/iris_disk_cache.c @@ -128,7 +128,6 @@ iris_disk_cache_store(struct disk_cache *cache, union brw_any_prog_data serializable; assert(prog_data_s <= sizeof(serializable)); memcpy(&serializable, shader->brw_prog_data, prog_data_s); - serializable.base.param = NULL; serializable.base.relocs = NULL; blob_write_bytes(&blob, &serializable, prog_data_s); } else { @@ -152,8 +151,7 @@ iris_disk_cache_store(struct disk_cache *cache, if (brw) { blob_write_bytes(&blob, brw->relocs, brw->num_relocs * sizeof(struct intel_shader_reloc)); - blob_write_bytes(&blob, brw->param, - brw->nr_params * sizeof(uint32_t)); + blob_write_bytes(&blob, shader->ubo_ranges, sizeof(shader->ubo_ranges)); } else { #ifdef INTEL_USE_ELK blob_write_bytes(&blob, elk->relocs, @@ -265,12 +263,7 @@ iris_disk_cache_retrieve(struct iris_screen *screen, brw->num_relocs * sizeof(struct intel_shader_reloc)); brw->relocs = relocs; } - - brw->param = NULL; - if (brw->nr_params) { - brw->param = ralloc_array(NULL, uint32_t, brw->nr_params); - blob_copy_bytes(&blob, brw->param, brw->nr_params * sizeof(uint32_t)); - } + blob_copy_bytes(&blob, shader->ubo_ranges, sizeof(shader->ubo_ranges)); } else { #ifdef INTEL_USE_ELK elk->relocs = NULL; @@ -320,7 +313,7 @@ iris_disk_cache_retrieve(struct iris_screen *screen, num_cbufs++; if (brw) - iris_apply_brw_prog_data(shader, brw); + iris_apply_brw_prog_data(shader, brw, NULL); else #ifdef INTEL_USE_ELK iris_apply_elk_prog_data(shader, elk); diff --git a/src/gallium/drivers/iris/iris_indirect_gen.c b/src/gallium/drivers/iris/iris_indirect_gen.c index 7b2a19b3103..64eed30d98b 100644 --- a/src/gallium/drivers/iris/iris_indirect_gen.c +++ b/src/gallium/drivers/iris/iris_indirect_gen.c @@ -291,8 +291,7 @@ emit_indirect_generate_draw(struct iris_batch *batch, ps.BindingTableEntryCount = GFX_VER == 9 ? 
1 : 0; #if GFX_VER < 20 - ps.PushConstantEnable = shader->nr_params > 0 || - shader->ubo_ranges[0].length; + ps.PushConstantEnable = shader->push_sizes[0] > 0; #endif #if GFX_VER >= 9 diff --git a/src/gallium/drivers/iris/iris_program.c b/src/gallium/drivers/iris/iris_program.c index 5879f4d5af7..bd540a67b11 100644 --- a/src/gallium/drivers/iris/iris_program.c +++ b/src/gallium/drivers/iris/iris_program.c @@ -165,9 +165,8 @@ iris_apply_brw_cs_prog_data(struct iris_compiled_shader *shader, iris->uses_sampler = brw->uses_sampler; iris->prog_mask = brw->prog_mask; - iris->first_param_is_builtin_subgroup_id = - brw->base.nr_params > 0 && - brw->base.param[0] == BRW_PARAM_BUILTIN_SUBGROUP_ID; + /* The pushed constants only contain the subgroup_id */ + iris->first_param_is_builtin_subgroup_id = brw->base.push_sizes[0] > 0; } static void @@ -249,16 +248,20 @@ iris_apply_brw_gs_prog_data(struct iris_compiled_shader *shader, void iris_apply_brw_prog_data(struct iris_compiled_shader *shader, - struct brw_stage_prog_data *brw) + struct brw_stage_prog_data *brw, + struct brw_ubo_range *ubo_ranges) { - STATIC_ASSERT(ARRAY_SIZE(brw->ubo_ranges) == ARRAY_SIZE(shader->ubo_ranges)); - for (int i = 0; i < ARRAY_SIZE(shader->ubo_ranges); i++) { - shader->ubo_ranges[i].block = brw->ubo_ranges[i].block; - shader->ubo_ranges[i].start = brw->ubo_ranges[i].start; - shader->ubo_ranges[i].length = brw->ubo_ranges[i].length; + if (ubo_ranges != NULL) { + for (int i = 0; i < ARRAY_SIZE(shader->ubo_ranges); i++) { + shader->ubo_ranges[i].block = ubo_ranges[i].block; + shader->ubo_ranges[i].start = ubo_ranges[i].start; + shader->ubo_ranges[i].length = ubo_ranges[i].length; + } } - shader->nr_params = brw->nr_params; + for (int i = 0; i < ARRAY_SIZE(shader->push_sizes); i++) + shader->push_sizes[i] = brw->push_sizes[i]; + shader->total_scratch = brw->total_scratch; shader->total_shared = brw->total_shared; shader->program_size = brw->program_size; @@ -294,7 +297,6 @@ iris_apply_brw_prog_data(struct iris_compiled_shader *shader, ralloc_steal(shader, shader->brw_prog_data); ralloc_steal(shader->brw_prog_data, (void *)brw->relocs); - ralloc_steal(shader->brw_prog_data, brw->param); } #ifdef INTEL_USE_ELK @@ -1213,13 +1215,6 @@ iris_setup_uniforms(ASSERTED const struct intel_device_info *devinfo, assert(num_cbufs < PIPE_MAX_CONSTANT_BUFFERS); nir_validate_shader(nir, "after remap"); - /* We don't use params[] but gallium leaves num_uniforms set. We use this - * to detect when cbuf0 exists but we don't need it anymore when we get - * here. Instead, zero it out so that the back-end doesn't get confused - * when nr_params * 4 != num_uniforms != nr_params * 4. 
- */ - nir->num_uniforms = 0; - *out_system_values = system_values; *out_num_system_values = num_system_values; *out_num_cbufs = num_cbufs; @@ -1932,7 +1927,9 @@ iris_compile_vs(struct iris_screen *screen, brw_prog_data->base.base.use_alt_mode = nir->info.use_legacy_math_rules; - brw_nir_analyze_ubo_ranges(screen->brw, nir, brw_prog_data->base.base.ubo_ranges); + struct brw_ubo_range ubo_ranges[4] = {}; + brw_nir_analyze_ubo_ranges(screen->brw, nir, ubo_ranges); + NIR_PASS(_, nir, brw_nir_lower_ubo_ranges, ubo_ranges); struct brw_vs_prog_key brw_key = iris_to_brw_vs_key(screen, key); @@ -1951,7 +1948,7 @@ iris_compile_vs(struct iris_screen *screen, program = brw_compile_vs(screen->brw, ¶ms); error = params.base.error_str; if (program) { - iris_apply_brw_prog_data(shader, &brw_prog_data->base.base); + iris_apply_brw_prog_data(shader, &brw_prog_data->base.base, ubo_ranges); iris_debug_recompile_brw(screen, dbg, ish, &brw_key.base); } } else { @@ -2174,7 +2171,10 @@ iris_compile_tcs(struct iris_screen *screen, if (screen->brw) { struct brw_tcs_prog_data *brw_prog_data = rzalloc(mem_ctx, struct brw_tcs_prog_data); - brw_nir_analyze_ubo_ranges(screen->brw, nir, brw_prog_data->base.base.ubo_ranges); + + struct brw_ubo_range ubo_ranges[4] = {}; + brw_nir_analyze_ubo_ranges(screen->brw, nir, ubo_ranges); + NIR_PASS(_, nir, brw_nir_lower_ubo_ranges, ubo_ranges); struct brw_compile_tcs_params params = { .base = { @@ -2192,7 +2192,7 @@ iris_compile_tcs(struct iris_screen *screen, error = params.base.error_str; if (program) { - iris_apply_brw_prog_data(shader, &brw_prog_data->base.base); + iris_apply_brw_prog_data(shader, &brw_prog_data->base.base, ubo_ranges); iris_debug_recompile_brw(screen, dbg, ish, &brw_key.base); } } else { @@ -2377,7 +2377,9 @@ iris_compile_tes(struct iris_screen *screen, struct brw_tes_prog_data *brw_prog_data = rzalloc(mem_ctx, struct brw_tes_prog_data); - brw_nir_analyze_ubo_ranges(screen->brw, nir, brw_prog_data->base.base.ubo_ranges); + struct brw_ubo_range ubo_ranges[4] = {}; + brw_nir_analyze_ubo_ranges(screen->brw, nir, ubo_ranges); + NIR_PASS(_, nir, brw_nir_lower_ubo_ranges, ubo_ranges); struct intel_vue_map input_vue_map; brw_compute_tess_vue_map(&input_vue_map, key->inputs_read, @@ -2403,7 +2405,7 @@ iris_compile_tes(struct iris_screen *screen, if (program) { iris_debug_recompile_brw(screen, dbg, ish, &brw_key.base); - iris_apply_brw_prog_data(shader, &brw_prog_data->base.base); + iris_apply_brw_prog_data(shader, &brw_prog_data->base.base, ubo_ranges); } } else { #ifdef INTEL_USE_ELK @@ -2571,7 +2573,9 @@ iris_compile_gs(struct iris_screen *screen, struct brw_gs_prog_data *brw_prog_data = rzalloc(mem_ctx, struct brw_gs_prog_data); - brw_nir_analyze_ubo_ranges(screen->brw, nir, brw_prog_data->base.base.ubo_ranges); + struct brw_ubo_range ubo_ranges[4] = {}; + brw_nir_analyze_ubo_ranges(screen->brw, nir, ubo_ranges); + NIR_PASS(_, nir, brw_nir_lower_ubo_ranges, ubo_ranges); brw_compute_vue_map(devinfo, &brw_prog_data->base.vue_map, nir->info.outputs_written, @@ -2595,7 +2599,7 @@ iris_compile_gs(struct iris_screen *screen, error = params.base.error_str; if (program) { iris_debug_recompile_brw(screen, dbg, ish, &brw_key.base); - iris_apply_brw_prog_data(shader, &brw_prog_data->base.base); + iris_apply_brw_prog_data(shader, &brw_prog_data->base.base, ubo_ranges); } } else { #ifdef INTEL_USE_ELK @@ -2764,7 +2768,9 @@ iris_compile_fs(struct iris_screen *screen, brw_prog_data->base.use_alt_mode = nir->info.use_legacy_math_rules; - brw_nir_analyze_ubo_ranges(screen->brw, 
nir, brw_prog_data->base.ubo_ranges); + struct brw_ubo_range ubo_ranges[4] = {}; + brw_nir_analyze_ubo_ranges(screen->brw, nir, ubo_ranges); + NIR_PASS(_, nir, brw_nir_lower_ubo_ranges, ubo_ranges); struct brw_wm_prog_key brw_key = iris_to_brw_fs_key(screen, key); @@ -2788,7 +2794,7 @@ iris_compile_fs(struct iris_screen *screen, error = params.base.error_str; if (program) { iris_debug_recompile_brw(screen, dbg, ish, &brw_key.base); - iris_apply_brw_prog_data(shader, &brw_prog_data->base); + iris_apply_brw_prog_data(shader, &brw_prog_data->base, ubo_ranges); } } else { #ifdef INTEL_USE_ELK @@ -3111,6 +3117,15 @@ iris_compile_cs(struct iris_screen *screen, struct brw_cs_prog_data *brw_prog_data = rzalloc(mem_ctx, struct brw_cs_prog_data); + bool subgroup_id_lowered = false; + NIR_PASS(subgroup_id_lowered, nir, brw_nir_lower_cs_subgroup_id, devinfo, 0); + if (subgroup_id_lowered) { + brw_prog_data->base.push_sizes[0] = 4; + brw_cs_fill_push_const_info(devinfo, brw_prog_data, 0); + } else { + brw_cs_fill_push_const_info(devinfo, brw_prog_data, -1); + } + struct brw_compile_cs_params params = { .base = { .mem_ctx = mem_ctx, @@ -3127,7 +3142,7 @@ iris_compile_cs(struct iris_screen *screen, error = params.base.error_str; if (program) { iris_debug_recompile_brw(screen, dbg, ish, &brw_key.base); - iris_apply_brw_prog_data(shader, &brw_prog_data->base); + iris_apply_brw_prog_data(shader, &brw_prog_data->base, NULL); } } else { #ifdef INTEL_USE_ELK diff --git a/src/gallium/drivers/iris/iris_program_cache.c b/src/gallium/drivers/iris/iris_program_cache.c index febf6b8c21b..4e7cae5bb9a 100644 --- a/src/gallium/drivers/iris/iris_program_cache.c +++ b/src/gallium/drivers/iris/iris_program_cache.c @@ -278,7 +278,7 @@ iris_blorp_upload_shader(struct blorp_batch *blorp_batch, uint32_t stage, memcpy(prog_data, prog_data_templ, prog_data_size); if (screen->brw) { - iris_apply_brw_prog_data(shader, prog_data); + iris_apply_brw_prog_data(shader, prog_data, NULL); } else { #ifdef INTEL_USE_ELK assert(screen->elk); @@ -445,9 +445,7 @@ iris_ensure_indirect_generation_shader(struct iris_batch *batch) struct brw_wm_prog_data *prog_data = ralloc_size(NULL, sizeof(*prog_data)); memset(prog_data, 0, sizeof(*prog_data)); - prog_data->base.nr_params = nir->num_uniforms / 4; - - brw_nir_analyze_ubo_ranges(screen->brw, nir, prog_data->base.ubo_ranges); + prog_data->base.push_sizes[0] = uniform_size; struct genisa_stats stats[3]; struct brw_compile_fs_params params = { @@ -463,7 +461,7 @@ iris_ensure_indirect_generation_shader(struct iris_batch *batch) }; program = brw_compile_fs(screen->brw, ¶ms); assert(program); - iris_apply_brw_prog_data(shader, &prog_data->base); + iris_apply_brw_prog_data(shader, &prog_data->base, NULL); } else { #ifdef INTEL_USE_ELK union elk_any_prog_key prog_key; diff --git a/src/gallium/drivers/iris/iris_state.c b/src/gallium/drivers/iris/iris_state.c index 1a4ba0c6c65..6fc9a359551 100644 --- a/src/gallium/drivers/iris/iris_state.c +++ b/src/gallium/drivers/iris/iris_state.c @@ -9410,21 +9410,34 @@ iris_upload_gpgpu_walker(struct iris_context *ice, if ((stage_dirty & IRIS_STAGE_DIRTY_CS) || (GFX_VER == 12 && !batch->contains_draw) || cs_data->local_size[0] == 0 /* Variable local group size */) { - uint32_t curbe_data_offset = 0; - assert(cs_data->push.cross_thread.dwords == 0 && - cs_data->push.per_thread.dwords == 1 && - cs_data->first_param_is_builtin_subgroup_id); - const unsigned push_const_size = - iris_cs_push_const_total_size(shader, dispatch.threads); - uint32_t *curbe_data_map = - 
stream_state(batch, ice->state.dynamic_uploader, - &ice->state.last_res.cs_thread_ids, - align(push_const_size, 64), 64, - &curbe_data_offset); - assert(curbe_data_map); - memset(curbe_data_map, 0x5a, align(push_const_size, 64)); - iris_fill_cs_push_const_buffer(screen, shader, dispatch.threads, - curbe_data_map); + uint32_t curbe_data_offset, push_const_size; + uint32_t *curbe_data_map; + if (cs_data->push.cross_thread.dwords == 0 && + cs_data->push.per_thread.dwords == 0) { + push_const_size = 64; + curbe_data_map = + stream_state(batch, ice->state.dynamic_uploader, + &ice->state.last_res.cs_thread_ids, + align(push_const_size, 64), 64, + &curbe_data_offset); + assert(curbe_data_map); + memset(curbe_data_map, 0x5a, align(push_const_size, 64)); + } else { + assert(cs_data->push.cross_thread.dwords == 0 && + cs_data->push.per_thread.dwords == 1 && + cs_data->first_param_is_builtin_subgroup_id); + push_const_size = + iris_cs_push_const_total_size(shader, dispatch.threads); + curbe_data_map = + stream_state(batch, ice->state.dynamic_uploader, + &ice->state.last_res.cs_thread_ids, + align(push_const_size, 64), 64, + &curbe_data_offset); + assert(curbe_data_map); + memset(curbe_data_map, 0x5a, align(push_const_size, 64)); + iris_fill_cs_push_const_buffer(screen, shader, dispatch.threads, + curbe_data_map); + } iris_emit_cmd(batch, GENX(MEDIA_CURBE_LOAD), curbe) { curbe.CURBETotalDataLength = align(push_const_size, 64); diff --git a/src/intel/blorp/blorp_brw.c b/src/intel/blorp/blorp_brw.c index 427ccf3365b..88e1241256e 100644 --- a/src/intel/blorp/blorp_brw.c +++ b/src/intel/blorp/blorp_brw.c @@ -27,8 +27,6 @@ blorp_compile_fs_brw(struct blorp_context *blorp, void *mem_ctx, const struct brw_compiler *compiler = blorp->compiler->brw; struct brw_wm_prog_data *wm_prog_data = rzalloc(mem_ctx, struct brw_wm_prog_data); - wm_prog_data->base.nr_params = 0; - wm_prog_data->base.param = NULL; struct brw_nir_compiler_opts opts = { .softfp64 = blorp->get_fp64_nir ? 
blorp->get_fp64_nir(blorp) : NULL, @@ -125,6 +123,24 @@ lower_base_workgroup_id(nir_builder *b, nir_intrinsic_instr *intrin, return true; } +static bool +lower_load_uniform(nir_builder *b, nir_intrinsic_instr *intrin, + UNUSED void *data) +{ + if (intrin->intrinsic != nir_intrinsic_load_uniform) + return false; + + b->cursor = nir_instr_remove(&intrin->instr); + nir_def_rewrite_uses(&intrin->def, + nir_load_push_data_intel(b, + intrin->def.num_components, + intrin->def.bit_size, + intrin->src[0].ssa, + .base = nir_intrinsic_base(intrin), + .range = nir_intrinsic_range(intrin))); + return true; +} + static struct blorp_program blorp_compile_cs_brw(struct blorp_context *blorp, void *mem_ctx, struct nir_shader *nir) @@ -140,19 +156,24 @@ blorp_compile_cs_brw(struct blorp_context *blorp, void *mem_ctx, NIR_PASS(_, nir, nir_lower_io, nir_var_uniform, type_size_scalar_bytes, (nir_lower_io_options)0); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_load_uniform, + nir_metadata_control_flow, NULL); + STATIC_ASSERT(offsetof(struct blorp_wm_inputs, subgroup_id) + 4 == sizeof(struct blorp_wm_inputs)); - nir->num_uniforms = offsetof(struct blorp_wm_inputs, subgroup_id); - unsigned nr_params = nir->num_uniforms / 4; struct brw_cs_prog_data *cs_prog_data = rzalloc(mem_ctx, struct brw_cs_prog_data); - cs_prog_data->base.nr_params = nr_params; - cs_prog_data->base.param = rzalloc_array(NULL, uint32_t, nr_params); + cs_prog_data->base.push_sizes[0] = sizeof(struct blorp_wm_inputs); + + brw_cs_fill_push_const_info(compiler->devinfo, cs_prog_data, + offsetof(struct blorp_wm_inputs, subgroup_id) / 4); NIR_PASS(_, nir, brw_nir_lower_cs_intrinsics, compiler->devinfo, cs_prog_data); + NIR_PASS(_, nir, brw_nir_lower_cs_subgroup_id, compiler->devinfo, + offsetof(struct blorp_wm_inputs, subgroup_id)); NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_base_workgroup_id, - nir_metadata_control_flow, NULL); + nir_metadata_control_flow, NULL); struct brw_cs_prog_key cs_key; memset(&cs_key, 0, sizeof(cs_key)); @@ -170,9 +191,6 @@ blorp_compile_cs_brw(struct blorp_context *blorp, void *mem_ctx, const unsigned *kernel = brw_compile_cs(compiler, ¶ms); - ralloc_free(cs_prog_data->base.param); - cs_prog_data->base.param = NULL; - return (struct blorp_program) { .kernel = kernel, .kernel_size = cs_prog_data->base.program_size, diff --git a/src/intel/compiler/brw/brw_compile_cs.cpp b/src/intel/compiler/brw/brw_compile_cs.cpp index 502fe101067..43a21607063 100644 --- a/src/intel/compiler/brw/brw_compile_cs.cpp +++ b/src/intel/compiler/brw/brw_compile_cs.cpp @@ -25,26 +25,22 @@ fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords) block->size = block->regs * 32; } -static void -cs_fill_push_const_info(const struct intel_device_info *devinfo, - struct brw_cs_prog_data *cs_prog_data) +extern "C" void +brw_cs_fill_push_const_info(const struct intel_device_info *devinfo, + struct brw_cs_prog_data *cs_prog_data, + int subgroup_id_index) { const struct brw_stage_prog_data *prog_data = &cs_prog_data->base; - int subgroup_id_index = brw_get_subgroup_id_param_index(devinfo, prog_data); - - /* The thread ID should be stored in the last param dword */ - assert(subgroup_id_index == -1 || - subgroup_id_index == (int)prog_data->nr_params - 1); unsigned cross_thread_dwords, per_thread_dwords; - if (subgroup_id_index >= 0) { + if (devinfo->verx10 < 125 && subgroup_id_index >= 0) { /* Fill all but the last register with cross-thread payload */ cross_thread_dwords = 8 * (subgroup_id_index / 8); - per_thread_dwords 
= prog_data->nr_params - cross_thread_dwords; + per_thread_dwords = prog_data->push_sizes[0] / 4 - cross_thread_dwords; assert(per_thread_dwords > 0 && per_thread_dwords <= 8); } else { /* Fill all data using cross-thread payload */ - cross_thread_dwords = prog_data->nr_params; + cross_thread_dwords = prog_data->push_sizes[0] / 4; per_thread_dwords = 0u; } @@ -55,7 +51,7 @@ cs_fill_push_const_info(const struct intel_device_info *devinfo, cs_prog_data->push.per_thread.size == 0); assert(cs_prog_data->push.cross_thread.dwords + cs_prog_data->push.per_thread.dwords == - prog_data->nr_params); + prog_data->push_sizes[0] / 4); } static bool @@ -120,41 +116,6 @@ brw_nir_uses_sampler(nir_shader *shader) NULL); } -static inline uint32_t * -brw_stage_prog_data_add_params(struct brw_stage_prog_data *prog_data, - unsigned nr_new_params) -{ - unsigned old_nr_params = prog_data->nr_params; - prog_data->nr_params += nr_new_params; - prog_data->param = reralloc(ralloc_parent(prog_data->param), - prog_data->param, uint32_t, - prog_data->nr_params); - return prog_data->param + old_nr_params; -} - -static void -brw_adjust_uniforms(brw_shader &s) -{ - if (s.devinfo->verx10 >= 125) - return; - - assert(mesa_shader_stage_is_compute(s.stage)); - - if (brw_get_subgroup_id_param_index(s.devinfo, s.prog_data) == -1) { - /* Add uniforms for builtins after regular NIR uniforms. */ - assert(s.uniforms == s.prog_data->nr_params); - - /* Subgroup ID must be the last uniform on the list. This will make - * easier later to split between cross thread and per thread - * uniforms. - */ - uint32_t *param = brw_stage_prog_data_add_params(s.prog_data, 1); - *param = BRW_PARAM_BUILTIN_SUBGROUP_ID; - } - - s.uniforms = s.prog_data->nr_params; -} - const unsigned * brw_compile_cs(const struct brw_compiler *compiler, struct brw_compile_cs_params *params) @@ -233,7 +194,6 @@ brw_compile_cs(const struct brw_compiler *compiler, .archiver = params->base.archiver, }; v[simd] = std::make_unique(&shader_params); - brw_adjust_uniforms(*v[simd]); const bool allow_spilling = simd == 0 || (!simd_state.compiled[simd - 1] && !brw_simd_should_compile(simd_state, simd - 1)) || @@ -245,8 +205,6 @@ brw_compile_cs(const struct brw_compiler *compiler, } if (run_cs(*v[simd], allow_spilling)) { - cs_fill_push_const_info(compiler->devinfo, prog_data); - brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers); if (devinfo->ver >= 30 && !v[simd]->spilled_any_registers && diff --git a/src/intel/compiler/brw/brw_compile_fs.cpp b/src/intel/compiler/brw/brw_compile_fs.cpp index d4d4f9d60d9..20f218bc5f8 100644 --- a/src/intel/compiler/brw/brw_compile_fs.cpp +++ b/src/intel/compiler/brw/brw_compile_fs.cpp @@ -600,7 +600,6 @@ brw_emit_repclear_shader(brw_shader &s) brw_send_inst *write = NULL; assert(s.devinfo->ver < 20); - assert(s.uniforms == 0); assume(key->nr_color_regions > 0); brw_reg color_output = retype(brw_vec4_grf(127, 0), BRW_TYPE_UD); @@ -1123,7 +1122,7 @@ gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data) if (wm_prog_data->num_varying_inputs) return; - if (wm_prog_data->base.curb_read_length) + if (wm_prog_data->base.push_sizes[0] > 0) return; wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0; @@ -1296,7 +1295,13 @@ brw_assign_urb_setup(brw_shader &s) struct brw_wm_prog_data *prog_data = brw_wm_prog_data(s.prog_data); - int urb_start = s.payload().num_regs + prog_data->base.curb_read_length; + uint32_t push_size = 0; + for (uint32_t i = 0; i < 4; i++) + push_size += prog_data->base.push_sizes[i]; + + const int 
urb_start = + s.payload().num_regs + + DIV_ROUND_UP(align(push_size, REG_SIZE * reg_unit(s.devinfo)), REG_SIZE); bool read_attribute_payload = false; /* Offset all the urb_setup[] index by the actual position of the diff --git a/src/intel/compiler/brw/brw_compiler.h b/src/intel/compiler/brw/brw_compiler.h index 638a794ba6c..919654fd74f 100644 --- a/src/intel/compiler/brw/brw_compiler.h +++ b/src/intel/compiler/brw/brw_compiler.h @@ -564,12 +564,16 @@ enum brw_param_builtin { (((param) - BRW_PARAM_BUILTIN_CLIP_PLANE_0_X) & 0x3) struct brw_stage_prog_data { - struct brw_ubo_range ubo_ranges[4]; - - unsigned nr_params; /**< number of float params/constants */ - mesa_shader_stage stage; + /** + * Amount of push data delivered to the shader (in bytes) + * + * The HW can push up to 4 ranges from 4 different virtual addresses. + * Values should be aligned to 32B. + */ + uint16_t push_sizes[4]; + /* If robust_ubo_ranges not 0, push_reg_mask_param specifies the param * index (in 32-bit units) where the 4 UBO range limits will be pushed * as 8-bit integers. The shader will zero byte i of UBO range j if: @@ -582,7 +586,6 @@ struct brw_stage_prog_data { uint8_t robust_ubo_ranges; unsigned push_reg_mask_param; - unsigned curb_read_length; unsigned total_scratch; unsigned total_shared; @@ -613,14 +616,6 @@ struct brw_stage_prog_data { uint32_t source_hash; - /* 32-bit identifiers for all push/pull parameters. These can be anything - * the driver wishes them to be; the core of the back-end compiler simply - * re-arranges them. The one restriction is that the bottom 2^16 values - * are reserved for builtins defined in the brw_param_builtin enum defined - * above. - */ - uint32_t *param; - /* Whether shader uses atomic operations. */ bool uses_atomic_load_store; }; @@ -1669,6 +1664,11 @@ unsigned brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data, unsigned threads); +void +brw_cs_fill_push_const_info(const struct intel_device_info *devinfo, + struct brw_cs_prog_data *cs_prog_data, + int subgroup_id_index); + void brw_write_shader_relocs(const struct brw_isa_info *isa, void *program, diff --git a/src/intel/compiler/brw/brw_eu_defines.h b/src/intel/compiler/brw/brw_eu_defines.h index 74d3b62d39a..7b914f9f1bc 100644 --- a/src/intel/compiler/brw/brw_eu_defines.h +++ b/src/intel/compiler/brw/brw_eu_defines.h @@ -785,7 +785,7 @@ enum ENUM_PACKED brw_reg_file { ADDRESS, VGRF, ATTR, - UNIFORM, /* prog_data->params[reg] */ + UNIFORM, /* pushed constant delivered register */ }; /* Align1 support for 3-src instructions. 
Bit 35 of the instruction diff --git a/src/intel/compiler/brw/brw_from_nir.cpp b/src/intel/compiler/brw/brw_from_nir.cpp index d2ee6559937..6bf8ef60dbb 100644 --- a/src/intel/compiler/brw/brw_from_nir.cpp +++ b/src/intel/compiler/brw/brw_from_nir.cpp @@ -1992,8 +1992,7 @@ get_nir_def(nir_to_brw_state &ntb, const nir_def &def, bool all_sources_uniform) is_scalar = get_nir_src(ntb, instr->src[1], 0).is_scalar; break; - case nir_intrinsic_load_uniform: - case nir_intrinsic_load_push_constant: + case nir_intrinsic_load_push_data_intel: is_scalar = get_nir_src(ntb, instr->src[0], 0).is_scalar; break; @@ -5393,8 +5392,7 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb, break; } - case nir_intrinsic_load_uniform: - case nir_intrinsic_load_push_constant: { + case nir_intrinsic_load_push_data_intel: { /* Offsets are in bytes but they should always aligned to * the type size */ @@ -5472,120 +5470,35 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb, nir_def_first_component_read(&instr->def); const unsigned last_component = nir_def_last_component_read(&instr->def); - const unsigned num_components = last_component - first_component + 1; - if (!nir_src_is_const(instr->src[1])) { - s.prog_data->has_ubo_pull = true; + if (instr->intrinsic == nir_intrinsic_load_ubo) { + /* load_ubo with non-constant offset. The offset might still be + * uniform on non-LSC platforms when loading fewer than 4 components. + */ + brw_reg base_offset = retype(get_nir_src(ntb, instr->src[1], 0), + BRW_TYPE_UD); + if (nir_intrinsic_has_base(instr)) { + struct brw_reg imm = brw_imm_int(base_offset.type, + nir_intrinsic_base(instr)); + base_offset = bld.ADD(base_offset, imm); + } - if (instr->intrinsic == nir_intrinsic_load_ubo) { - /* load_ubo with non-constant offset. The offset might still be - * uniform on non-LSC platforms when loading fewer than 4 - * components. - */ - brw_reg base_offset = retype(get_nir_src(ntb, instr->src[1], 0), - BRW_TYPE_UD); - if (nir_intrinsic_has_base(instr)) { - struct brw_reg imm = brw_imm_int(base_offset.type, - nir_intrinsic_base(instr)); - base_offset = bld.ADD(base_offset, imm); - } + const unsigned comps_per_load = brw_type_size_bytes(dest.type) == 8 ? 2 : 4; - const unsigned comps_per_load = brw_type_size_bytes(dest.type) == 8 ? 2 : 4; - - for (unsigned i = first_component; - i <= last_component; - i += comps_per_load) { - const unsigned remaining = last_component + 1 - i; - xbld.VARYING_PULL_CONSTANT_LOAD(offset(dest, xbld, i), - surface, surface_handle, - base_offset, - i * brw_type_size_bytes(dest.type), - instr->def.bit_size / 8, - MIN2(remaining, comps_per_load)); - } - } else { - /* load_ubo_uniform_block_intel with non-constant offset */ - brw_from_nir_emit_memory_access(ntb, bld, xbld, instr); + for (unsigned i = first_component; + i <= last_component; + i += comps_per_load) { + const unsigned remaining = last_component + 1 - i; + xbld.VARYING_PULL_CONSTANT_LOAD(offset(dest, xbld, i), + surface, surface_handle, + base_offset, + i * brw_type_size_bytes(dest.type), + instr->def.bit_size / 8, + MIN2(remaining, comps_per_load)); } } else { - /* Even if we are loading doubles, a pull constant load will load - * a 32-bit vec4, so should only reserve vgrf space for that. If we - * need to load a full dvec4 we will have to emit 2 loads. This is - * similar to demote_pull_constants(), except that in that case we - * see individual accesses to each component of the vector and then - * we let CSE deal with duplicate loads. 
Here we see a vector access - * and we have to split it if necessary. - */ - const unsigned type_size = brw_type_size_bytes(dest.type); - const unsigned load_offset = - nir_src_as_uint(instr->src[1]) + first_component * type_size + - (nir_intrinsic_has_base(instr) ? nir_intrinsic_base(instr) : 0); - const unsigned end_offset = load_offset + num_components * type_size; - const unsigned ubo_block = - brw_nir_ubo_surface_index_get_push_block(instr->src[0]); - const unsigned offset_256b = load_offset / 32; - const unsigned end_256b = DIV_ROUND_UP(end_offset, 32); - - /* See if we've selected this as a push constant candidate */ - brw_reg push_reg; - for (int i = 0; i < 4; i++) { - const struct brw_ubo_range *range = &s.prog_data->ubo_ranges[i]; - if (range->block == ubo_block && - offset_256b >= range->start && - end_256b <= range->start + range->length) { - - push_reg = brw_uniform_reg(UBO_START + i, dest.type); - push_reg.offset = load_offset - 32 * range->start; - break; - } - } - - if (push_reg.file != BAD_FILE) { - for (unsigned i = first_component; i <= last_component; i++) { - xbld.MOV(offset(dest, xbld, i), - byte_offset(push_reg, - (i - first_component) * type_size)); - } - break; - } - - s.prog_data->has_ubo_pull = true; - - if (instr->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel) { - brw_from_nir_emit_memory_access(ntb, bld, xbld, instr); - break; - } - - const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ - const brw_builder ubld = bld.exec_all().group(block_sz / 4, 0); - - for (unsigned c = 0; c < num_components;) { - const unsigned base = load_offset + c * type_size; - /* Number of usable components in the next block-aligned load. */ - const unsigned count = MIN2(num_components - c, - (block_sz - base % block_sz) / type_size); - - const brw_reg packed_consts = ubld.vgrf(BRW_TYPE_UD); - brw_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; - srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = surface; - srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle; - srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); - srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); - - ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts, - srcs, PULL_UNIFORM_CONSTANT_SRCS); - - const brw_reg consts = - retype(byte_offset(packed_consts, base & (block_sz - 1)), - dest.type); - - for (unsigned d = 0; d < count; d++) { - xbld.MOV(offset(dest, xbld, first_component + c + d), - component(consts, d)); - } - - c += count; - } + /* load_ubo_uniform_block_intel with non-constant offset */ + brw_from_nir_emit_memory_access(ntb, bld, xbld, instr); } break; } diff --git a/src/intel/compiler/brw/brw_nir.c b/src/intel/compiler/brw/brw_nir.c index df2f050138e..14bcb787183 100644 --- a/src/intel/compiler/brw/brw_nir.c +++ b/src/intel/compiler/brw/brw_nir.c @@ -3144,8 +3144,7 @@ nir_def * brw_nir_load_global_const(nir_builder *b, nir_intrinsic_instr *load, nir_def *base_addr, unsigned off) { - assert(load->intrinsic == nir_intrinsic_load_push_constant || - load->intrinsic == nir_intrinsic_load_uniform); + assert(load->intrinsic == nir_intrinsic_load_push_data_intel); unsigned bit_size = load->def.bit_size; assert(bit_size >= 8 && bit_size % 8 == 0); diff --git a/src/intel/compiler/brw/brw_nir.h b/src/intel/compiler/brw/brw_nir.h index f0f014542c7..d4c2a04e2e9 100644 --- a/src/intel/compiler/brw/brw_nir.h +++ b/src/intel/compiler/brw/brw_nir.h @@ -179,6 +179,10 @@ brw_nir_link_shaders(const struct brw_compiler *compiler, bool 
brw_nir_lower_cs_intrinsics(nir_shader *nir, const struct intel_device_info *devinfo, struct brw_cs_prog_data *prog_data); +bool brw_nir_lower_cs_subgroup_id(nir_shader *nir, + const struct intel_device_info *devinfo, + unsigned subgroup_id_offset); + bool brw_nir_lower_alpha_to_coverage(nir_shader *shader); bool brw_needs_vertex_attributes_bypass(const nir_shader *shader); void brw_nir_lower_fs_barycentrics(nir_shader *shader); @@ -354,6 +358,9 @@ void brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler, nir_shader *nir, struct brw_ubo_range out_ranges[4]); +bool brw_nir_lower_ubo_ranges(nir_shader *nir, + struct brw_ubo_range out_ranges[4]); + void brw_nir_optimize(nir_shader *nir, const struct intel_device_info *devinfo); diff --git a/src/intel/compiler/brw/brw_nir_analyze_ubo_ranges.c b/src/intel/compiler/brw/brw_nir_analyze_ubo_ranges.c index 7adb7f1f03a..3312cbd80d4 100644 --- a/src/intel/compiler/brw/brw_nir_analyze_ubo_ranges.c +++ b/src/intel/compiler/brw/brw_nir_analyze_ubo_ranges.c @@ -129,36 +129,37 @@ analyze_ubos_block(struct ubo_analysis_state *state, nir_block *block) if (intrin->intrinsic != nir_intrinsic_load_ubo) continue; - if (brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) && - nir_src_is_const(intrin->src[1])) { - const int block = brw_nir_ubo_surface_index_get_push_block(intrin->src[0]); - const unsigned byte_offset = nir_src_as_uint(intrin->src[1]); - const unsigned sizeof_GRF = REG_SIZE * reg_unit(state->devinfo); - const int offset = byte_offset / sizeof_GRF; + if (!brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) || + !nir_src_is_const(intrin->src[1])) + continue; - /* Avoid shifting by larger than the width of our bitfield, as this - * is undefined in C. Even if we require multiple bits to represent - * the entire value, it's OK to record a partial value - the backend - * is capable of falling back to pull loads for later components of - * vectors, as it has to shrink ranges for other reasons anyway. - */ - if (offset >= 64) - continue; + const int block = brw_nir_ubo_surface_index_get_push_block(intrin->src[0]); + const unsigned byte_offset = nir_src_as_uint(intrin->src[1]); + const unsigned sizeof_GRF = REG_SIZE * reg_unit(state->devinfo); + const int offset = byte_offset / sizeof_GRF; - /* The value might span multiple sizeof(GRF) chunks. */ - const unsigned num_components = - nir_def_last_component_read(&intrin->def) + 1; - const int bytes = num_components * (intrin->def.bit_size / 8); - const int start = ROUND_DOWN_TO(byte_offset, sizeof_GRF); - const int end = align(byte_offset + bytes, sizeof_GRF); - const int chunks = (end - start) / sizeof_GRF; + /* Avoid shifting by larger than the width of our bitfield, as this + * is undefined in C. Even if we require multiple bits to represent + * the entire value, it's OK to record a partial value - the backend + * is capable of falling back to pull loads for later components of + * vectors, as it has to shrink ranges for other reasons anyway. + */ + if (offset >= 64) + continue; - /* TODO: should we count uses in loops as higher benefit? */ + /* The value might span multiple sizeof(GRF) chunks. 
*/ + const unsigned num_components = + nir_def_last_component_read(&intrin->def) + 1; + const int bytes = num_components * (intrin->def.bit_size / 8); + const int start = ROUND_DOWN_TO(byte_offset, sizeof_GRF); + const int end = align(byte_offset + bytes, sizeof_GRF); + const int chunks = (end - start) / sizeof_GRF; - struct ubo_block_info *info = get_block_info(state, block); - info->offsets |= ((1ull << chunks) - 1) << offset; - info->uses[offset]++; - } + /* TODO: should we count uses in loops as higher benefit? */ + + struct ubo_block_info *info = get_block_info(state, block); + info->offsets |= ((1ull << chunks) - 1) << offset; + info->uses[offset]++; } } @@ -316,3 +317,53 @@ brw_nir_analyze_ubo_ranges(const struct brw_compiler *compiler, ralloc_free(ranges.mem_ctx); } + +static bool +lower_load_ubo_instr(nir_builder *b, nir_intrinsic_instr *intrin, void *data) +{ + if (intrin->intrinsic != nir_intrinsic_load_ubo) + return false; + + if (!brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) || + !nir_src_is_const(intrin->src[1])) + return false; + + const int block = brw_nir_ubo_surface_index_get_push_block(intrin->src[0]); + const unsigned byte_offset = nir_src_as_uint(intrin->src[1]); + const unsigned num_components = + nir_def_last_component_read(&intrin->def) + 1; + const int bytes = num_components * (intrin->def.bit_size / 8); + + const struct brw_ubo_range *range = data; + for (uint32_t i = 0; i < 4; i++) { + if (range[i].block != block) + continue; + + if (byte_offset < range[i].start * 32 || + (byte_offset + bytes) > (range[i].start + range[i].length) * 32) + continue; + + b->cursor = nir_before_instr(&intrin->instr); + nir_def *data = nir_load_push_data_intel( + b, + nir_def_last_component_read(&intrin->def) + 1, + intrin->def.bit_size, + nir_imm_int(b, 0), + .base = byte_offset - range[i].start * 32, + .range = nir_intrinsic_range(intrin)); + nir_def_replace(&intrin->def, data); + + return true; + } + + return false; +} + +bool +brw_nir_lower_ubo_ranges(nir_shader *nir, + struct brw_ubo_range out_ranges[4]) +{ + return nir_shader_intrinsics_pass(nir, lower_load_ubo_instr, + nir_metadata_control_flow, + out_ranges); +} diff --git a/src/intel/compiler/brw/brw_nir_lower_cs_intrinsics.c b/src/intel/compiler/brw/brw_nir_lower_cs_intrinsics.c index b05747652c1..64e273d9a7e 100644 --- a/src/intel/compiler/brw/brw_nir_lower_cs_intrinsics.c +++ b/src/intel/compiler/brw/brw_nir_lower_cs_intrinsics.c @@ -387,3 +387,36 @@ brw_nir_lower_cs_intrinsics(nir_shader *nir, return state.progress; } + +static bool +lower_cs_subgroup_id_instr(nir_builder *b, + nir_intrinsic_instr *intrin, + void *data) +{ + if (intrin->intrinsic != nir_intrinsic_load_subgroup_id) + return false; + + const unsigned *subgroup_id_offset_ptr = data; + + b->cursor = nir_before_instr(&intrin->instr); + nir_def_replace(&intrin->def, + nir_load_push_data_intel( + b, 1, 32, nir_imm_int(b, 0), + .base = *subgroup_id_offset_ptr, + .range = 4)); + + return true; +} + +bool +brw_nir_lower_cs_subgroup_id(nir_shader *nir, + const struct intel_device_info *devinfo, + unsigned subgroup_id_offset) +{ + if (devinfo->verx10 >= 125) + return false; + + return nir_shader_intrinsics_pass(nir, lower_cs_subgroup_id_instr, + nir_metadata_control_flow, + &subgroup_id_offset); +} diff --git a/src/intel/compiler/brw/brw_nir_lower_rt_intrinsics.c b/src/intel/compiler/brw/brw_nir_lower_rt_intrinsics.c index e6aa645f831..38ecce4676d 100644 --- a/src/intel/compiler/brw/brw_nir_lower_rt_intrinsics.c +++ 
b/src/intel/compiler/brw/brw_nir_lower_rt_intrinsics.c @@ -135,8 +135,7 @@ lower_rt_intrinsics_impl(nir_function_impl *impl, nir_instr_remove(instr); break; - case nir_intrinsic_load_uniform: - case nir_intrinsic_load_push_constant: + case nir_intrinsic_load_push_data_intel: /* We don't want to lower this in the launch trampoline. * * Also if the driver chooses to use an inline push address, we diff --git a/src/intel/compiler/brw/brw_nir_rt.c b/src/intel/compiler/brw/brw_nir_rt.c index f9abb0a22a0..e54d6431364 100644 --- a/src/intel/compiler/brw/brw_nir_rt.c +++ b/src/intel/compiler/brw/brw_nir_rt.c @@ -427,7 +427,6 @@ brw_nir_create_raygen_trampoline(const struct brw_compiler *compiler, * passed in as push constants in the first register. We deal with the * raygen BSR address here; the global data we'll deal with later. */ - b.shader->num_uniforms = 32; nir_def *raygen_param_bsr_addr = load_trampoline_param(&b, raygen_bsr_addr, 1, 64); nir_def *is_indirect = diff --git a/src/intel/compiler/brw/brw_shader.cpp b/src/intel/compiler/brw/brw_shader.cpp index 5ad5aaa42d5..70d973d3793 100644 --- a/src/intel/compiler/brw/brw_shader.cpp +++ b/src/intel/compiler/brw/brw_shader.cpp @@ -425,7 +425,6 @@ brw_shader::brw_shader(const brw_shader_params *params) this->source_depth_to_render_target = false; this->first_non_payload_grf = 0; - this->uniforms = this->nir->num_uniforms / 4; this->last_scratch = 0; memset(&this->shader_stats, 0, sizeof(this->shader_stats)); @@ -621,40 +620,22 @@ brw_shader::mark_last_urb_write_with_eot() return true; } -static unsigned -round_components_to_whole_registers(const intel_device_info *devinfo, - unsigned c) -{ - return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo); -} - void brw_shader::assign_curb_setup() { - unsigned uniform_push_length = - round_components_to_whole_registers(devinfo, prog_data->nr_params); - - unsigned ubo_push_length = 0; - unsigned ubo_push_start[4]; - for (int i = 0; i < 4; i++) { - ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length); - ubo_push_length += prog_data->ubo_ranges[i].length; - - assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0); - assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0); + uint32_t ranges_start[4]; + this->push_data_size = 0; + for (uint32_t i = 0; i < 4; i++) { + ranges_start[i] = this->push_data_size / REG_SIZE; + this->push_data_size += align(prog_data->push_sizes[i], REG_SIZE); } - prog_data->curb_read_length = uniform_push_length + ubo_push_length; - if (stage == MESA_SHADER_FRAGMENT && - ((struct brw_wm_prog_key *)key)->null_push_constant_tbimr_workaround) - prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length); - uint64_t used = 0; const bool pull_constants = devinfo->verx10 >= 125 && (mesa_shader_stage_is_compute(stage) || mesa_shader_stage_is_mesh(stage)) && - uniform_push_length; + this->push_data_size > 0; if (pull_constants) { const bool pull_constants_a64 = @@ -688,9 +669,11 @@ brw_shader::assign_curb_setup() /* On Gfx12-HP we load constants at the start of the program using A32 * stateless messages. */ - for (unsigned i = 0; i < uniform_push_length;) { + const unsigned n_push_data_regs = reg_unit(devinfo) * + DIV_ROUND_UP(this->push_data_size, reg_unit(devinfo) * REG_SIZE); + for (unsigned i = 0; i < this->push_data_size / REG_SIZE;) { /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). 
*/ - unsigned num_regs = MIN2(uniform_push_length - i, 8); + unsigned num_regs = MIN2(this->push_data_size / REG_SIZE - i, 8); assert(num_regs > 0); num_regs = 1 << util_logbase2(num_regs); @@ -746,7 +729,7 @@ brw_shader::assign_curb_setup() send->size_written = lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE; assert((payload().num_regs + i + send->size_written / REG_SIZE) <= - (payload().num_regs + prog_data->curb_read_length)); + (payload().num_regs + n_push_data_regs)); send->is_volatile = true; send->src[SEND_SRC_DESC] = @@ -766,28 +749,13 @@ brw_shader::assign_curb_setup() for (unsigned int i = 0; i < inst->sources; i++) { if (inst->src[i].file == UNIFORM) { int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4; - int constant_nr; - if (inst->src[i].nr >= UBO_START) { - /* constant_nr is in 32-bit units, the rest are in bytes */ - constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] + - inst->src[i].offset / 4; - } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) { - constant_nr = uniform_nr; - } else { - /* Section 5.11 of the OpenGL 4.1 spec says: - * "Out-of-bounds reads return undefined values, which include - * values from other variables of the active program or zero." - * Just return the first push constant. - */ - constant_nr = 0; - } - assert(constant_nr / 8 < 64); - used |= BITFIELD64_BIT(constant_nr / 8); + assert(uniform_nr / 8 < 64); + used |= BITFIELD64_BIT(uniform_nr / 8); struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs + - constant_nr / 8, - constant_nr % 8); + uniform_nr / 8, + uniform_nr % 8); brw_reg.abs = inst->src[i].abs; brw_reg.negate = inst->src[i].negate; @@ -824,15 +792,16 @@ brw_shader::assign_curb_setup() ubld.group(16, 0).ADD(horiz_offset(offset_base, 16), offset_base, brw_imm_uw(16)); u_foreach_bit(i, prog_data->robust_ubo_ranges) { - struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i]; + const unsigned range_length = + DIV_ROUND_UP(prog_data->push_sizes[i], REG_SIZE); - unsigned range_start = ubo_push_start[i] / 8; - uint64_t want_zero = (used >> range_start) & BITFIELD64_MASK(ubo_range->length); + const unsigned range_start = ranges_start[i]; + uint64_t want_zero = (used >> range_start) & BITFIELD64_MASK(range_length); if (!want_zero) continue; const unsigned grf_start = payload().num_regs + range_start; - const unsigned grf_end = grf_start + ubo_range->length; + const unsigned grf_end = grf_start + range_length; const unsigned max_grf_mask = max_grf_writes * 4; unsigned grf = grf_start; @@ -899,7 +868,10 @@ brw_shader::assign_curb_setup() } /* This may be updated in assign_urb_setup or assign_vs_urb_setup. 
 */
-   this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
+   this->first_non_payload_grf = payload().num_regs +
+      DIV_ROUND_UP(
+         align(this->push_data_size, REG_SIZE * reg_unit(devinfo)),
+         REG_SIZE);
 
    this->debug_optimizer(this->nir, "assign_curb_setup", 90, 0);
 }
@@ -935,7 +907,9 @@ brw_shader::convert_attr_sources_to_hw_regs(brw_inst *inst)
       if (inst->src[i].file == ATTR) {
          assert(inst->src[i].nr == 0);
          int grf = payload().num_regs +
-                   prog_data->curb_read_length +
+                   DIV_ROUND_UP(
+                      align(this->push_data_size, REG_SIZE * reg_unit(devinfo)),
+                      REG_SIZE) +
                    inst->src[i].offset / REG_SIZE;
 
          /* As explained at brw_lower_vgrf_to_fixed_grf, From the Haswell PRM:
@@ -969,24 +943,6 @@ brw_shader::convert_attr_sources_to_hw_regs(brw_inst *inst)
    }
 }
 
-int
-brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
-                                const brw_stage_prog_data *prog_data)
-{
-   if (prog_data->nr_params == 0)
-      return -1;
-
-   if (devinfo->verx10 >= 125)
-      return -1;
-
-   /* The local thread id is always the last parameter in the list */
-   uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
-   if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
-      return prog_data->nr_params - 1;
-
-   return -1;
-}
-
 uint32_t
 brw_fb_write_msg_control(const brw_inst *inst,
                          const struct brw_wm_prog_data *prog_data)
diff --git a/src/intel/compiler/brw/brw_shader.h b/src/intel/compiler/brw/brw_shader.h
index 0cac6233ef2..067dcac3cdd 100644
--- a/src/intel/compiler/brw/brw_shader.h
+++ b/src/intel/compiler/brw/brw_shader.h
@@ -144,8 +144,11 @@ public:
    brw_analysis def_analysis;
    brw_analysis ip_ranges_analysis;
 
-   /** Number of uniform variable components visited. */
-   unsigned uniforms;
+   /** Amount of push constant data delivered to the shader
+    *
+    * Aligned to native GRF registers
+    */
+   unsigned push_data_size;
 
    /** Byte-offset for the next available spot in the scratch space buffer.
*/ unsigned last_scratch; @@ -290,9 +293,6 @@ uint32_t brw_fb_write_msg_control(const brw_inst *inst, void brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data); -int brw_get_subgroup_id_param_index(const intel_device_info *devinfo, - const brw_stage_prog_data *prog_data); - void brw_from_nir(brw_shader *s); void brw_shader_phase_update(brw_shader &s, enum brw_shader_phase phase); diff --git a/src/intel/compiler/brw/brw_thread_payload.cpp b/src/intel/compiler/brw/brw_thread_payload.cpp index 78b45cd64f7..adc6d79cecb 100644 --- a/src/intel/compiler/brw/brw_thread_payload.cpp +++ b/src/intel/compiler/brw/brw_thread_payload.cpp @@ -380,19 +380,9 @@ void brw_cs_thread_payload::load_subgroup_id(const brw_builder &bld, brw_reg &dest) const { - auto devinfo = bld.shader->devinfo; + assert(bld.shader->devinfo->verx10 >= 125); dest = retype(dest, BRW_TYPE_UD); - - if (subgroup_id_.file != BAD_FILE) { - assert(devinfo->verx10 >= 125); - bld.AND(dest, subgroup_id_, brw_imm_ud(INTEL_MASK(7, 0))); - } else { - assert(devinfo->verx10 < 125); - assert(mesa_shader_stage_is_compute(bld.shader->stage)); - int index = brw_get_subgroup_id_param_index(devinfo, - bld.shader->prog_data); - bld.MOV(dest, brw_uniform_reg(index, BRW_TYPE_UD)); - } + bld.AND(dest, subgroup_id_, brw_imm_ud(INTEL_MASK(7, 0))); } brw_task_mesh_thread_payload::brw_task_mesh_thread_payload(brw_shader &v) diff --git a/src/intel/vulkan/anv_internal_kernels.c b/src/intel/vulkan/anv_internal_kernels.c index f43eeeef864..af93f3029d2 100644 --- a/src/intel/vulkan/anv_internal_kernels.c +++ b/src/intel/vulkan/anv_internal_kernels.c @@ -147,15 +147,10 @@ compile_shader(struct anv_device *device, }; NIR_PASS(_, nir, nir_opt_load_store_vectorize, &options); - nir->num_uniforms = uniform_size; + prog_data.base.push_sizes[0] = uniform_size; void *temp_ctx = ralloc_context(NULL); - prog_data.base.nr_params = nir->num_uniforms / 4; - prog_data.base.param = rzalloc_array(temp_ctx, uint32_t, prog_data.base.nr_params); - - brw_nir_analyze_ubo_ranges(compiler, nir, prog_data.base.ubo_ranges); - const unsigned *program; if (stage == MESA_SHADER_FRAGMENT) { struct genisa_stats stats[3]; diff --git a/src/intel/vulkan/anv_nir.h b/src/intel/vulkan/anv_nir.h index 3189dcd97be..32101b2cd2c 100644 --- a/src/intel/vulkan/anv_nir.h +++ b/src/intel/vulkan/anv_nir.h @@ -39,22 +39,23 @@ struct vk_pipeline_robustness_state; (sizeof(((struct anv_push_constants *)0)->field)) #define anv_load_driver_uniform(b, components, field) \ - nir_load_push_constant(b, components, \ - anv_drv_const_size(field) * 8, \ - nir_imm_int(b, 0), \ - .base = anv_drv_const_offset(field), \ - .range = components * anv_drv_const_size(field)) -/* Use load_uniform for indexed values since load_push_constant requires that - * the offset source is dynamically uniform in the subgroup which we cannot - * guarantee. + nir_load_push_data_intel(b, components, \ + anv_drv_const_size(field) * 8, \ + nir_imm_int(b, 0), \ + .base = anv_drv_const_offset(field), \ + .range = components * anv_drv_const_size(field)) +/* Use ACCESS_NON_UNIFORM for indexed values since load_push_constant requires + * that the offset source is dynamically uniform in the subgroup which we + * cannot guarantee. 
*/ #define anv_load_driver_uniform_indexed(b, components, field, idx) \ - nir_load_uniform(b, components, \ - anv_drv_const_size(field[0]) * 8, \ - nir_imul_imm(b, idx, \ - anv_drv_const_size(field[0])), \ - .base = anv_drv_const_offset(field), \ - .range = anv_drv_const_size(field)) + nir_load_push_data_intel(b, components, \ + anv_drv_const_size(field[0]) * 8, \ + nir_imul_imm(b, idx, \ + anv_drv_const_size(field[0])), \ + .base = anv_drv_const_offset(field), \ + .range = anv_drv_const_size(field), \ + .access = ACCESS_NON_UNIFORM) /* This map is represent a mapping where the key is the NIR * nir_intrinsic_resource_intel::block index. It allows mapping bindless UBOs diff --git a/src/intel/vulkan/anv_nir_compute_push_layout.c b/src/intel/vulkan/anv_nir_compute_push_layout.c index f99832d4e84..13781be4933 100644 --- a/src/intel/vulkan/anv_nir_compute_push_layout.c +++ b/src/intel/vulkan/anv_nir_compute_push_layout.c @@ -26,6 +26,94 @@ #include "compiler/brw/brw_nir.h" #include "util/mesa-sha1.h" +struct lower_to_push_data_intel_state { + const struct anv_pipeline_bind_map *bind_map; + const struct anv_pipeline_push_map *push_map; +}; + +static bool +lower_to_push_data_intel(nir_builder *b, + nir_intrinsic_instr *intrin, + void *data) +{ + const struct lower_to_push_data_intel_state *state = data; + /* With bindless shaders we load uniforms with SEND messages. All the push + * constants are located after the RT_DISPATCH_GLOBALS. We just need to add + * the offset to the address right after RT_DISPATCH_GLOBALS (see + * brw_nir_lower_rt_intrinsics.c). + */ + const unsigned base_offset = + brw_shader_stage_is_bindless(b->shader->info.stage) ? + 0 : state->bind_map->push_ranges[0].start * 32; + + switch (intrin->intrinsic) { + case nir_intrinsic_load_push_data_intel: { + nir_intrinsic_set_base(intrin, nir_intrinsic_base(intrin) - base_offset); + return true; + } + + case nir_intrinsic_load_push_constant: { + b->cursor = nir_before_instr(&intrin->instr); + nir_def *data = nir_load_push_data_intel( + b, + intrin->def.num_components, + intrin->def.bit_size, + intrin->src[0].ssa, + .base = nir_intrinsic_base(intrin) - base_offset, + .range = nir_intrinsic_range(intrin)); + nir_def_replace(&intrin->def, data); + return true; + } + + case nir_intrinsic_load_ubo: { + if (!brw_nir_ubo_surface_index_is_pushable(intrin->src[0]) || + !nir_src_is_const(intrin->src[1])) + return false; + + const int block = brw_nir_ubo_surface_index_get_push_block(intrin->src[0]); + const unsigned byte_offset = nir_src_as_uint(intrin->src[1]); + const unsigned num_components = + nir_def_last_component_read(&intrin->def) + 1; + const int bytes = num_components * (intrin->def.bit_size / 8); + + const struct anv_pipeline_binding *binding = + &state->push_map->block_to_descriptor[block]; + + uint32_t range_offset = 0; + const struct anv_push_range *push_range = NULL; + for (uint32_t i = 0; i < 4; i++) { + if (state->bind_map->push_ranges[i].set == binding->set && + state->bind_map->push_ranges[i].index == binding->index && + byte_offset >= state->bind_map->push_ranges[i].start * 32 && + (byte_offset + bytes) <= (state->bind_map->push_ranges[i].start + + state->bind_map->push_ranges[i].length) * 32) { + push_range = &state->bind_map->push_ranges[i]; + break; + } else { + range_offset += state->bind_map->push_ranges[i].length * 32; + } + } + + if (push_range == NULL) + return false; + + b->cursor = nir_before_instr(&intrin->instr); + nir_def *data = nir_load_push_data_intel( + b, + nir_def_last_component_read(&intrin->def) + 1, + 
intrin->def.bit_size, + nir_imm_int(b, 0), + .base = range_offset + byte_offset - push_range->start * 32, + .range = nir_intrinsic_range(intrin)); + nir_def_replace(&intrin->def, data); + return true; + } + + default: + return false; + } +} + bool anv_nir_compute_push_layout(nir_shader *nir, const struct anv_physical_device *pdevice, @@ -57,8 +145,8 @@ anv_nir_compute_push_layout(nir_shader *nir, has_const_ubo = true; break; - case nir_intrinsic_load_uniform: - case nir_intrinsic_load_push_constant: { + case nir_intrinsic_load_push_constant: + case nir_intrinsic_load_push_data_intel: { unsigned base = nir_intrinsic_base(intrin); unsigned range = nir_intrinsic_range(intrin); push_start = MIN2(push_start, base); @@ -80,8 +168,6 @@ anv_nir_compute_push_layout(nir_shader *nir, } } - const bool has_push_intrinsic = push_start <= push_end; - const bool push_ubo_ranges = has_const_ubo && nir->info.stage != MESA_SHADER_COMPUTE && !brw_shader_stage_requires_bindless_resources(nir->info.stage); @@ -143,18 +229,6 @@ anv_nir_compute_push_layout(nir_shader *nir, push_end = MAX2(push_end, tess_config_end); } - if (nir->info.stage == MESA_SHADER_COMPUTE && devinfo->verx10 < 125) { - /* For compute shaders, we always have to have the subgroup ID. The - * back-end compiler will "helpfully" add it for us in the last push - * constant slot. Yes, there is an off-by-one error here but that's - * because the back-end will add it so we want to claim the number of - * push constants one dword less than the full amount including - * gl_SubgroupId. - */ - assert(push_end <= anv_drv_const_offset(cs.subgroup_id)); - push_end = anv_drv_const_offset(cs.subgroup_id); - } - /* Align push_start down to a 32B (for 3DSTATE_CONSTANT) and make it no * larger than push_end (no push constants is indicated by push_start = * UINT_MAX). @@ -186,9 +260,20 @@ anv_nir_compute_push_layout(nir_shader *nir, /* For scalar, push data size needs to be aligned to a DWORD. */ const unsigned alignment = 4; - nir->num_uniforms = align(push_end - push_start, alignment); - prog_data->nr_params = nir->num_uniforms / 4; - prog_data->param = rzalloc_array(mem_ctx, uint32_t, prog_data->nr_params); + const unsigned push_size = align(push_end - push_start, alignment); + prog_data->push_sizes[0] = push_size; + + /* Fill the compute push constant layout (cross/per thread constants) for + * platforms pre Gfx12.5. + */ + if (nir->info.stage == MESA_SHADER_COMPUTE) { + const int subgroup_id_index = + push_end == (anv_drv_const_offset(cs.subgroup_id) + + anv_drv_const_size(cs.subgroup_id)) ? + (anv_drv_const_offset(cs.subgroup_id) - push_start) / 4 : -1; + struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data); + brw_cs_fill_push_const_info(devinfo, cs_prog_data, subgroup_id_index); + } struct anv_push_range push_constant_range = { .set = ANV_DESCRIPTOR_SET_PUSH_CONSTANTS, @@ -196,39 +281,6 @@ anv_nir_compute_push_layout(nir_shader *nir, .length = align(push_end - push_start, devinfo->grf_size) / 32, }; - if (has_push_intrinsic) { - nir_foreach_function_impl(impl, nir) { - nir_foreach_block(block, impl) { - nir_foreach_instr_safe(instr, block) { - if (instr->type != nir_instr_type_intrinsic) - continue; - - nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); - switch (intrin->intrinsic) { - case nir_intrinsic_load_uniform: - case nir_intrinsic_load_push_constant: { - /* With bindless shaders we load uniforms with SEND - * messages. All the push constants are located after the - * RT_DISPATCH_GLOBALS. 
-   if (has_push_intrinsic) {
-      nir_foreach_function_impl(impl, nir) {
-         nir_foreach_block(block, impl) {
-            nir_foreach_instr_safe(instr, block) {
-               if (instr->type != nir_instr_type_intrinsic)
-                  continue;
-
-               nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-               switch (intrin->intrinsic) {
-               case nir_intrinsic_load_uniform:
-               case nir_intrinsic_load_push_constant: {
-                  /* With bindless shaders we load uniforms with SEND
-                   * messages. All the push constants are located after the
-                   * RT_DISPATCH_GLOBALS. We just need to add the offset to
-                   * the address right after RT_DISPATCH_GLOBALS (see
-                   * brw_nir_lower_rt_intrinsics.c).
-                   */
-                  unsigned base_offset =
-                     brw_shader_stage_is_bindless(nir->info.stage) ? 0 : push_start;
-                  nir_intrinsic_set_base(intrin,
-                                         nir_intrinsic_base(intrin) -
-                                         base_offset);
-                  break;
-               }
-
-               default:
-                  break;
-               }
-            }
-         }
-      }
-   }
-
    /* When platforms support Mesh and the fragment shader is not fully linked
     * to the previous shader, payload format can change if the preceding
     * shader is mesh or not, this is an issue in particular for PrimitiveID
@@ -260,15 +312,17 @@ anv_nir_compute_push_layout(nir_shader *nir,
    unsigned n_push_ranges = 0;
 
    if (push_ubo_ranges) {
-      brw_nir_analyze_ubo_ranges(compiler, nir, prog_data->ubo_ranges);
+      struct brw_ubo_range ubo_ranges[4] = {};
+
+      brw_nir_analyze_ubo_ranges(compiler, nir, ubo_ranges);
 
       const unsigned max_push_regs = 64;
       unsigned total_push_regs = push_constant_range.length;
 
       for (unsigned i = 0; i < 4; i++) {
-         if (total_push_regs + prog_data->ubo_ranges[i].length > max_push_regs)
-            prog_data->ubo_ranges[i].length = max_push_regs - total_push_regs;
-         total_push_regs += prog_data->ubo_ranges[i].length;
+         if (total_push_regs + ubo_ranges[i].length > max_push_regs)
+            ubo_ranges[i].length = max_push_regs - total_push_regs;
+         total_push_regs += ubo_ranges[i].length;
       }
       assert(total_push_regs <= max_push_regs);
 
@@ -286,7 +340,7 @@ anv_nir_compute_push_layout(nir_shader *nir,
       const unsigned max_push_buffers = needs_padding_per_primitive ? 3 : 4;
 
       for (unsigned i = 0; i < 4; i++) {
-         struct brw_ubo_range *ubo_range = &prog_data->ubo_ranges[i];
+         struct brw_ubo_range *ubo_range = &ubo_ranges[i];
          if (ubo_range->length == 0)
            continue;
 
@@ -310,7 +364,7 @@ anv_nir_compute_push_layout(nir_shader *nir,
         /* We only bother to shader-zero pushed client UBOs */
         if (binding->set < MAX_SETS &&
             (robust_flags & BRW_ROBUSTNESS_UBO)) {
-            prog_data->robust_ubo_ranges |= (uint8_t) (1 << i);
+            prog_data->robust_ubo_ranges |= (uint8_t) (1 << (i + (push_size != 0)));
         }
      }
   } else if (push_constant_range.length > 0) {
@@ -340,8 +394,7 @@ anv_nir_compute_push_layout(nir_shader *nir,
         .start = 0,
         .length = 1,
      };
-      assert(prog_data->nr_params == 0);
-      prog_data->nr_params = 32 / 4;
+      prog_data->push_sizes[0] = 32;
   }
 
   if (needs_padding_per_primitive) {
@@ -355,21 +408,36 @@ anv_nir_compute_push_layout(nir_shader *nir,
 
   assert(n_push_ranges <= 4);
 
-   if (nir->info.stage == MESA_SHADER_TESS_CTRL && needs_dyn_tess_config) {
-      struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+   bool progress = nir_shader_intrinsics_pass(
+      nir, lower_to_push_data_intel,
+      nir_metadata_control_flow,
+      &(struct lower_to_push_data_intel_state) {
+         .bind_map = map,
+         .push_map = push_map,
+      });
 
-      const uint32_t tess_config_offset = anv_drv_const_offset(gfx.tess_config);
-      assert(tess_config_offset >= push_start);
-      tcs_prog_data->tess_config_param = (tess_config_offset - push_start) / 4;
-   }
-   if (nir->info.stage == MESA_SHADER_TESS_EVAL && push_info->separate_tessellation) {
-      struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
+   switch (nir->info.stage) {
+   case MESA_SHADER_TESS_CTRL:
+      if (needs_dyn_tess_config) {
+         struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
 
-      const uint32_t tess_config_offset = anv_drv_const_offset(gfx.tess_config);
-      assert(tess_config_offset >= push_start);
-      tes_prog_data->tess_config_param = (tess_config_offset - push_start) / 4;
-   }
-   if (nir->info.stage == MESA_SHADER_FRAGMENT) {
+         const uint32_t tess_config_offset = anv_drv_const_offset(gfx.tess_config);
+         assert(tess_config_offset >= push_start);
+         tcs_prog_data->tess_config_param = (tess_config_offset - push_start) / 4;
+      }
+      break;
+
+   case MESA_SHADER_TESS_EVAL:
+      if (push_info->separate_tessellation) {
+         struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
+
+         const uint32_t tess_config_offset = anv_drv_const_offset(gfx.tess_config);
+         assert(tess_config_offset >= push_start);
+         tes_prog_data->tess_config_param = (tess_config_offset - push_start) / 4;
+      }
+      break;
+
+   case MESA_SHADER_FRAGMENT: {
       struct brw_wm_prog_data *wm_prog_data =
         container_of(prog_data, struct brw_wm_prog_data, base);
@@ -380,7 +448,6 @@ anv_nir_compute_push_layout(nir_shader *nir,
         wm_prog_data->msaa_flags_param =
            (fs_msaa_flags_offset - push_start) / 4;
      }
-
      if (needs_wa_18019110168) {
         const uint32_t fs_per_prim_remap_offset =
            anv_drv_const_offset(gfx.fs_per_prim_remap_offset);
@@ -388,8 +455,16 @@ anv_nir_compute_push_layout(nir_shader *nir,
         wm_prog_data->per_primitive_remap_param =
            (fs_per_prim_remap_offset - push_start) / 4;
      }
+      break;
   }
+
+   default:
+      break;
+   }
+
+   for (uint32_t i = 0; i < 4; i++)
+      prog_data->push_sizes[i] = map->push_ranges[i].length * 32;
+
 #if 0
   fprintf(stderr, "stage=%s push ranges:\n", mesa_shader_stage_name(nir->info.stage));
   for (unsigned i = 0; i < ARRAY_SIZE(map->push_ranges); i++)
@@ -407,7 +482,7 @@ anv_nir_compute_push_layout(nir_shader *nir,
   _mesa_sha1_compute(map->push_ranges, sizeof(map->push_ranges),
                      map->push_sha1);
 
-   return false;
+   return progress;
 }
 
 void
@@ -416,10 +491,9 @@ anv_nir_validate_push_layout(const struct anv_physical_device *pdevice,
                              struct anv_pipeline_bind_map *map)
 {
 #ifndef NDEBUG
-   unsigned prog_data_push_size = align(prog_data->nr_params, pdevice->info.grf_size / 4) / 8;
-
+   unsigned prog_data_push_size = 0;
   for (unsigned i = 0; i < 4; i++)
-      prog_data_push_size += prog_data->ubo_ranges[i].length;
+      prog_data_push_size += DIV_ROUND_UP(prog_data->push_sizes[i], 32);
 
   unsigned bind_map_push_size = 0;
   for (unsigned i = 0; i < 4; i++) {
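The load_ubo case of lower_to_push_data_intel above maps a constant UBO offset into the packed push data. A worked example as an editor's sketch; the layout and numbers are hypothetical, not taken from the patch:

#include <assert.h>
#include <stdint.h>

/* Editor's illustration only. Assume bind_map slot 0 carries 64B of push
 * constants and slot 1 is a promoted UBO range starting at byte 128
 * (start = 4 in 32B registers). A 16B read at byte 160 of that UBO falls
 * inside slot 1 after skipping 64B of earlier ranges (range_offset).
 */
static uint32_t
example_pushed_ubo_base(void)
{
   const uint32_t range_offset = 2 * 32;  /* bytes of earlier push ranges    */
   const uint32_t range_start  = 4 * 32;  /* push_range->start * 32          */
   const uint32_t byte_offset  = 160;     /* constant offset of the UBO load */

   /* Same formula as the lowering: range_offset + byte_offset - start * 32 */
   const uint32_t base = range_offset + byte_offset - range_start;
   assert(base == 96);
   return base;
}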
diff --git a/src/intel/vulkan/anv_nir_lower_driver_values.c b/src/intel/vulkan/anv_nir_lower_driver_values.c
index 7284561eb33..1dda703c846 100644
--- a/src/intel/vulkan/anv_nir_lower_driver_values.c
+++ b/src/intel/vulkan/anv_nir_lower_driver_values.c
@@ -53,6 +53,22 @@ lower_base_workgroup_id(nir_builder *b, nir_intrinsic_instr *intrin)
    return true;
 }
 
+static bool
+lower_subgroup_id(nir_builder *b, nir_intrinsic_instr *intrin,
+                  const struct anv_physical_device *pdevice)
+{
+   if (pdevice->info.verx10 >= 125)
+      return false;
+
+   b->cursor = nir_before_instr(&intrin->instr);
+
+   nir_def *subgroup_id =
+      anv_load_driver_uniform(b, 1, cs.subgroup_id);
+   nir_def_replace(&intrin->def, subgroup_id);
+
+   return true;
+}
+
 static bool
 lower_ray_query_globals(nir_builder *b, nir_intrinsic_instr *intrin)
 {
@@ -72,6 +88,8 @@ lower_driver_values(nir_builder *b, nir_intrinsic_instr *intrin, void *data)
       return lower_load_constant(b, intrin);
    case nir_intrinsic_load_base_workgroup_id:
       return lower_base_workgroup_id(b, intrin);
+   case nir_intrinsic_load_subgroup_id:
+      return lower_subgroup_id(b, intrin, data);
    case nir_intrinsic_load_ray_query_global_intel:
       return lower_ray_query_globals(b, intrin);
    default:
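lower_subgroup_id above sources gl_SubgroupID from a driver constant on pre-Gfx12.5 compute shaders. A sketch of what this effectively emits, assuming anv_load_driver_uniform expands like the indexed macro shown earlier but with a constant zero offset and without ACCESS_NON_UNIFORM (that expansion is an assumption, not shown in the patch):

/* Editor's illustration only. */
static nir_def *
example_subgroup_id(nir_builder *b)
{
   return nir_load_push_data_intel(b, 1, 32, nir_imm_int(b, 0),
                                   .base = anv_drv_const_offset(cs.subgroup_id),
                                   .range = anv_drv_const_size(cs.subgroup_id));
}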
diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c
index bf60fb0d186..6be68bb441d 100644
--- a/src/intel/vulkan/anv_pipeline_cache.c
+++ b/src/intel/vulkan/anv_pipeline_cache.c
@@ -94,7 +94,6 @@ anv_shader_internal_create(struct anv_device *device,
                            prog_data_size);
    VK_MULTIALLOC_DECL(&ma, struct intel_shader_reloc, prog_data_relocs,
                       prog_data_in->num_relocs);
-   VK_MULTIALLOC_DECL(&ma, uint32_t, prog_data_param, prog_data_in->nr_params);
    VK_MULTIALLOC_DECL(&ma, void, code, kernel_size);
    VK_MULTIALLOC_DECL_SIZE(&ma, nir_xfb_info, xfb_info,
@@ -151,7 +150,6 @@ anv_shader_internal_create(struct anv_device *device,
       typed_memcpy(prog_data_relocs, prog_data_in->relocs,
                    prog_data_in->num_relocs);
       prog_data->relocs = prog_data_relocs;
-      prog_data->param = prog_data_param;
       shader->prog_data = prog_data;
       shader->prog_data_size = prog_data_size;
 
@@ -210,7 +208,6 @@ anv_shader_internal_serialize(struct vk_pipeline_cache_object *object,
    assert(shader->prog_data_size <= sizeof(prog_data));
    memcpy(&prog_data, shader->prog_data, shader->prog_data_size);
    prog_data.base.relocs = NULL;
-   prog_data.base.param = NULL;
    blob_write_bytes(blob, &prog_data, shader->prog_data_size);
 
    blob_write_bytes(blob, shader->prog_data->relocs,
diff --git a/src/intel/vulkan/anv_shader.c b/src/intel/vulkan/anv_shader.c
index 219cb5a7b4c..36d18a03456 100644
--- a/src/intel/vulkan/anv_shader.c
+++ b/src/intel/vulkan/anv_shader.c
@@ -111,7 +111,6 @@ anv_shader_serialize(struct vk_device *device,
    union brw_any_prog_data prog_data;
    memcpy(&prog_data, shader->prog_data, brw_prog_data_size(vk_shader->stage));
    prog_data.base.relocs = NULL;
-   prog_data.base.param = NULL;
    blob_write_bytes(blob, &prog_data,
                     brw_prog_data_size(vk_shader->stage));
 
@@ -584,9 +583,6 @@ anv_shader_create(struct anv_device *device,
    const uint32_t cmd_data_dwords = anv_genX(device->info, shader_cmd_size)(
       device, stage);
 
-   /* We never need this at runtime */
-   shader_data->prog_data.base.param = NULL;
-
    VK_MULTIALLOC(ma);
    VK_MULTIALLOC_DECL(&ma, struct anv_shader, shader, 1);
    VK_MULTIALLOC_DECL(&ma, uint32_t, cmd_data, cmd_data_dwords);
diff --git a/src/intel/vulkan/anv_shader_compile.c b/src/intel/vulkan/anv_shader_compile.c
index eaa4460fff7..fe88a91d06a 100644
--- a/src/intel/vulkan/anv_shader_compile.c
+++ b/src/intel/vulkan/anv_shader_compile.c
@@ -1473,8 +1473,6 @@ anv_shader_lower_nir(struct anv_device *device,
                 dynamic_descriptors_offsets,
                 &shader_data->bind_map, &shader_data->push_map, mem_ctx);
 
-   NIR_PASS(_, nir, anv_nir_lower_driver_values, pdevice);
-
    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ubo,
             anv_nir_ubo_addr_format(pdevice, shader_data->key.base.robust_flags));
    NIR_PASS(_, nir, nir_lower_explicit_io, nir_var_mem_ssbo,
@@ -1545,28 +1543,6 @@ anv_shader_lower_nir(struct anv_device *device,
       NIR_PASS(_, nir, nir_opt_dce);
    }
 
-   NIR_PASS(_, nir, anv_nir_update_resource_intel_block);
-
-   NIR_PASS(_, nir, anv_nir_compute_push_layout,
-            pdevice, shader_data->key.base.robust_flags,
-            &(struct anv_nir_push_layout_info) {
-               .separate_tessellation = (nir->info.stage == MESA_SHADER_TESS_CTRL &&
-                                         shader_data->key.tcs.separate_tess_vue_layout) ||
-                                        (nir->info.stage == MESA_SHADER_TESS_EVAL &&
-                                         shader_data->key.tes.separate_tess_vue_layout),
-               .fragment_dynamic = nir->info.stage == MESA_SHADER_FRAGMENT &&
-                                   brw_wm_prog_key_is_dynamic(&shader_data->key.wm),
-               .mesh_dynamic = nir->info.stage == MESA_SHADER_FRAGMENT &&
-                               shader_data->key.wm.mesh_input == INTEL_SOMETIMES,
-            },
-            &shader_data->key.base,
-            &shader_data->prog_data.base,
-            &shader_data->bind_map, &shader_data->push_map,
-            mem_ctx);
-
-   NIR_PASS(_, nir, anv_nir_lower_resource_intel, pdevice,
-            shader_data->bind_map.layout_type);
-
    if (mesa_shader_stage_uses_workgroup(nir->info.stage)) {
      NIR_PASS(_, nir, nir_lower_vars_to_explicit_types, nir_var_mem_shared,
               shared_type_info);
@@ -1597,6 +1573,30 @@ anv_shader_lower_nir(struct anv_device *device,
                  &shader_data->prog_data.cs);
    }
 
+   NIR_PASS(_, nir, anv_nir_lower_driver_values, pdevice);
+
+   NIR_PASS(_, nir, anv_nir_update_resource_intel_block);
+
+   NIR_PASS(_, nir, anv_nir_compute_push_layout,
+            pdevice, shader_data->key.base.robust_flags,
+            &(struct anv_nir_push_layout_info) {
+               .separate_tessellation = (nir->info.stage == MESA_SHADER_TESS_CTRL &&
+                                         shader_data->key.tcs.separate_tess_vue_layout) ||
+                                        (nir->info.stage == MESA_SHADER_TESS_EVAL &&
+                                         shader_data->key.tes.separate_tess_vue_layout),
+               .fragment_dynamic = nir->info.stage == MESA_SHADER_FRAGMENT &&
+                                   brw_wm_prog_key_is_dynamic(&shader_data->key.wm),
+               .mesh_dynamic = nir->info.stage == MESA_SHADER_FRAGMENT &&
+                               shader_data->key.wm.mesh_input == INTEL_SOMETIMES,
+            },
+            &shader_data->key.base,
+            &shader_data->prog_data.base,
+            &shader_data->bind_map, &shader_data->push_map,
+            mem_ctx);
+
+   NIR_PASS(_, nir, anv_nir_lower_resource_intel, pdevice,
+            shader_data->bind_map.layout_type);
+
    shader_data->push_desc_info.push_set_buffer =
       anv_nir_loads_push_desc_buffer(
          nir, set_layouts, set_layout_count, &shader_data->bind_map);
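The genX_cmd_draw.c hunk below asserts robust_ubo_ranges directly against the bind_map push-range slot index. An editor's sketch of the bit set by anv_nir_compute_push_layout above, assuming promoted UBO ranges are packed immediately after the push-constant range when one exists:

#include <stdint.h>

/* Editor's illustration only. */
static uint8_t
example_robust_bit(unsigned ubo_range_idx, unsigned push_size)
{
   /* Slot 0 holds the Vulkan push constants whenever push_size != 0, so UBO
    * range i lands in bind_map slot i + 1 in that case and slot i otherwise.
    */
   const unsigned bind_map_slot = ubo_range_idx + (push_size != 0);
   return (uint8_t)(1u << bind_map_slot);
}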
diff --git a/src/intel/vulkan/genX_cmd_draw.c b/src/intel/vulkan/genX_cmd_draw.c
index 64452a3c2c3..6130a6ff327 100644
--- a/src/intel/vulkan/genX_cmd_draw.c
+++ b/src/intel/vulkan/genX_cmd_draw.c
@@ -448,24 +448,16 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
          const struct anv_pipeline_bind_map *bind_map = &shader->bind_map;
          struct anv_push_constants *push = &gfx->base.push_constants;
 
-         unsigned ubo_range_index = 0;
          for (unsigned i = 0; i < 4; i++) {
             const struct anv_push_range *range = &bind_map->push_ranges[i];
             if (range->length == 0)
-               continue;
+               break;
 
             /* Skip any push ranges that were not promoted from UBOs */
-            if (range->set >= MAX_SETS) {
-               /* The indexing in prog_data->robust_ubo_ranges is based off
-                * prog_data->ubo_ranges which does not include the
-                * prog_data->nr_params (Vulkan push constants).
-                */
-               if (range->set != ANV_DESCRIPTOR_SET_PUSH_CONSTANTS)
-                  ubo_range_index++;
+            if (range->set >= MAX_SETS)
               continue;
-            }
 
-            assert(shader->prog_data->robust_ubo_ranges & (1 << ubo_range_index));
+            assert(shader->prog_data->robust_ubo_ranges & (1 << i));
 
            unsigned bound_size =
               get_push_range_bound_size(cmd_buffer, shader, range);
@@ -482,14 +474,12 @@ cmd_buffer_flush_gfx_push_constants(struct anv_cmd_buffer *cmd_buffer,
            }
 
            /* Update the pushed bound length constant if it changed */
-            if (range_mask != push->gfx.push_reg_mask[stage][ubo_range_index]) {
-               push->gfx.push_reg_mask[stage][ubo_range_index] = range_mask;
+            if (range_mask != push->gfx.push_reg_mask[stage][i]) {
+               push->gfx.push_reg_mask[stage][i] = range_mask;
               cmd_buffer->state.push_constants_dirty |=
                  mesa_to_vk_shader_stage(stage);
               gfx->base.push_constants_data_dirty = true;
            }
-
-            ubo_range_index++;
         }
      }
   }
diff --git a/src/intel/vulkan/genX_internal_kernels.c b/src/intel/vulkan/genX_internal_kernels.c
index e1de861a245..48059cb51c7 100644
--- a/src/intel/vulkan/genX_internal_kernels.c
+++ b/src/intel/vulkan/genX_internal_kernels.c
@@ -26,10 +26,10 @@
 
 #include "genxml/gen_macros.h"
 
-#define load_param(b, bit_size, struct_name, field_name) \
-   nir_load_uniform(b, 1, bit_size, nir_imm_int(b, 0), \
-                    .base = offsetof(struct_name, field_name), \
-                    .range = bit_size / 8)
+#define load_param(b, bit_size, struct_name, field_name)               \
+   nir_load_push_data_intel(b, 1, bit_size, nir_imm_int(b, 0),         \
+                            .base = offsetof(struct_name, field_name), \
+                            .range = bit_size / 8)
 
 static nir_def *
 load_fragment_index(nir_builder *b)
diff --git a/src/intel/vulkan/genX_shader.c b/src/intel/vulkan/genX_shader.c
index fbe5727256c..747427b8890 100644
--- a/src/intel/vulkan/genX_shader.c
+++ b/src/intel/vulkan/genX_shader.c
@@ -1064,9 +1064,7 @@ emit_ps_shader(struct anv_batch *batch,
    ps.SamplerCount = GFX_VER == 11 ? 0 : get_sampler_count(shader);
    ps.BindingTableEntryCount = shader->bind_map.surface_count;
 #if GFX_VER < 20
-   ps.PushConstantEnable =
-      wm_prog_data->base.nr_params > 0 ||
-      wm_prog_data->base.ubo_ranges[0].length;
+   ps.PushConstantEnable = wm_prog_data->base.push_sizes[0] > 0;
 #endif
 
    ps.MaximumNumberofThreadsPerPSD = devinfo->max_threads_per_psd - 1;
diff --git a/src/intel/vulkan/genX_simple_shader.c b/src/intel/vulkan/genX_simple_shader.c
index f124181a234..8e0aa1ab801 100644
--- a/src/intel/vulkan/genX_simple_shader.c
+++ b/src/intel/vulkan/genX_simple_shader.c
@@ -205,8 +205,7 @@ genX(emit_simpler_shader_init_fragment)(struct anv_simple_shader *state)
       ps.BindingTableEntryCount = GFX_VER == 9 ? 1 : 0;
 #if GFX_VER < 20
-      ps.PushConstantEnable = prog_data->base.nr_params > 0 ||
-                              prog_data->base.ubo_ranges[0].length;
+      ps.PushConstantEnable = prog_data->base.push_sizes[0] > 0;
 #endif
 
       ps.DispatchGRFStartRegisterForConstantSetupData0 =