diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index 3862d242b94..d5a82a74a02 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -211,6 +211,9 @@ struct fd_dev_info {
          * command buffers. We copy this dispatch as is.
          */
         bool cmdbuf_start_a725_quirk;
+
+        bool load_inline_uniforms_via_preamble_ldgk;
+        bool load_shader_consts_via_preamble;
      } a7xx;
 };
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index aca618b5367..3ce66e2523b 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -791,6 +791,8 @@ a7xx_740 = A7XXProps(
 a7xx_750 = A7XXProps(
         has_event_write_sample_count = True,
+        load_inline_uniforms_via_preamble_ldgk = True,
+        load_shader_consts_via_preamble = True,
     )
 
 a730_magic_regs = dict(
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index 1beb9becbc6..7fcf1b1ed11 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -210,6 +210,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
       compiler->has_fs_tex_prefetch = dev_info->a6xx.has_fs_tex_prefetch;
       compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk;
+      compiler->load_shader_consts_via_preamble = dev_info->a7xx.load_shader_consts_via_preamble;
    } else {
       compiler->max_const_pipeline = 512;
       compiler->max_const_geom = 512;
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index 7a550dcea08..001b6716900 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -249,6 +249,8 @@ struct ir3_compiler {
    bool has_fs_tex_prefetch;
 
    bool stsc_duplication_quirk;
+
+   bool load_shader_consts_via_preamble;
 };
 
 void ir3_compiler_destroy(struct ir3_compiler *compiler);
diff --git a/src/freedreno/ir3/ir3_cp.c b/src/freedreno/ir3/ir3_cp.c
index 4495f052036..d4165bb3461 100644
--- a/src/freedreno/ir3/ir3_cp.c
+++ b/src/freedreno/ir3/ir3_cp.c
@@ -160,6 +160,9 @@ static bool
 lower_immed(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr, unsigned n,
             struct ir3_register *reg, unsigned new_flags)
 {
+   if (ctx->shader->compiler->load_shader_consts_via_preamble)
+      return false;
+
    if (!(new_flags & IR3_REG_IMMED))
       return false;
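A note on the two flags introduced above: on a750 the CP no longer writes shader consts straight into the const register file; the shader preamble loads them itself (inline uniforms presumably via the ldg.k instruction, going by the first flag's name, everything else from driver-managed UBOs). That is also why lower_immed() in ir3_cp.c now bails out: there is no CP-patched const space left to demote immediates into. A rough sketch of the split these flags create on the emit side (emit_driver_ubo(), emit_consts_legacy() and struct emit_ctx are hypothetical stand-ins for the real emitters added later in this patch):

static void
emit_driver_params(const struct fd_dev_info *info, struct emit_ctx *ctx)
{
   if (info->a7xx.load_shader_consts_via_preamble)
      emit_driver_ubo(ctx);    /* data lands in a UBO; the preamble loads it */
   else
      emit_consts_legacy(ctx); /* CP_LOAD_STATE writes the const file */
}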
diff --git a/src/freedreno/ir3/ir3_legalize.c b/src/freedreno/ir3/ir3_legalize.c
index 990a6efacbc..f5f4fc6259a 100644
--- a/src/freedreno/ir3/ir3_legalize.c
+++ b/src/freedreno/ir3/ir3_legalize.c
@@ -77,6 +77,15 @@ apply_ss(struct ir3_instruction *instr,
    state->needs_ss_for_const = false;
 }
 
+static inline void
+apply_sy(struct ir3_instruction *instr,
+         struct ir3_legalize_state *state,
+         bool mergedregs)
+{
+   instr->flags |= IR3_INSTR_SY;
+   regmask_init(&state->needs_sy, mergedregs);
+}
+
 /* We want to evaluate each block from the position of any other
  * predecessor block, in order that the flags set are the union of
  * all possible program paths.
@@ -176,9 +185,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 
       if ((last_n && is_barrier(last_n)) || n->opc == OPC_SHPE) {
          apply_ss(n, state, mergedregs);
-
-         n->flags |= IR3_INSTR_SY;
-         regmask_init(&state->needs_sy, mergedregs);
+         apply_sy(n, state, mergedregs);
          last_input_needs_ss = false;
       }
 
@@ -211,12 +218,13 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
          }
 
          if (regmask_get(&state->needs_sy, reg)) {
-            n->flags |= IR3_INSTR_SY;
-            regmask_init(&state->needs_sy, mergedregs);
+            apply_sy(n, state, mergedregs);
+         }
+      } else if ((reg->flags & IR3_REG_CONST)) {
+         if (state->needs_ss_for_const) {
+            apply_ss(n, state, mergedregs);
+            last_input_needs_ss = false;
          }
-      } else if ((reg->flags & IR3_REG_CONST) && state->needs_ss_for_const) {
-         apply_ss(n, state, mergedregs);
-         last_input_needs_ss = false;
       }
    }
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 793d3084d07..46587509e0a 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -31,6 +31,59 @@
 #include "ir3_nir.h"
 #include "ir3_shader.h"
 
+nir_def *
+ir3_get_driver_ubo(nir_builder *b, struct ir3_driver_ubo *ubo)
+{
+   /* Pick a UBO index to use as our constant data. Skip UBO 0 since that's
+    * reserved for gallium's cb0.
+    */
+   if (ubo->idx == -1) {
+      if (b->shader->info.num_ubos == 0)
+         b->shader->info.num_ubos++;
+      ubo->idx = b->shader->info.num_ubos++;
+   } else {
+      assert(ubo->idx != 0);
+      /* The binning shader shares ir3_driver_ubo definitions but not shader info */
+      b->shader->info.num_ubos = MAX2(b->shader->info.num_ubos, ubo->idx + 1);
+   }
+
+   return nir_imm_int(b, ubo->idx);
+}
+
+nir_def *
+ir3_load_driver_ubo(nir_builder *b, unsigned components,
+                    struct ir3_driver_ubo *ubo,
+                    unsigned offset)
+{
+   ubo->size = MAX2(ubo->size, offset + components);
+
+   return nir_load_ubo(b, components, 32, ir3_get_driver_ubo(b, ubo),
+                       nir_imm_int(b, offset * sizeof(uint32_t)),
+                       .align_mul = 16,
+                       .align_offset = (offset % 4) * sizeof(uint32_t),
+                       .range_base = offset * sizeof(uint32_t),
+                       .range = components * sizeof(uint32_t));
+}
+
+nir_def *
+ir3_load_driver_ubo_indirect(nir_builder *b, unsigned components,
+                             struct ir3_driver_ubo *ubo,
+                             unsigned base, nir_def *offset,
+                             unsigned range)
+{
+   ubo->size = MAX2(ubo->size, base + components + range * 4);
+
+   return nir_load_ubo(b, components, 32, ir3_get_driver_ubo(b, ubo),
+                       nir_iadd(b, nir_imul24(b, offset, nir_imm_int(b, 16)),
+                                nir_imm_int(b, base * sizeof(uint32_t))),
+                       .align_mul = 16,
+                       .align_offset = (base % 4) * sizeof(uint32_t),
+                       .range_base = base * sizeof(uint32_t),
+                       .range = components * sizeof(uint32_t) +
+                                (range - 1) * 16);
+}
+
 static bool
 ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                              unsigned bit_size, unsigned num_components,
@@ -742,6 +795,9 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
 
    OPT(s, ir3_nir_opt_subgroups, so);
 
+   if (so->compiler->load_shader_consts_via_preamble)
+      progress |= OPT(s, ir3_nir_lower_driver_params_to_ubo, so);
+
    /* Do the preamble before analysing UBO ranges, because it's usually
     * higher-value and because it can result in eliminating some indirect UBO
     * accesses where otherwise we'd have to push the whole range. However we
@@ -844,6 +900,75 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
    ir3_setup_const_state(s, so, ir3_const_state(so));
 }
 
+bool
+ir3_get_driver_param_info(const nir_shader *shader, nir_intrinsic_instr *intr,
+                          struct driver_param_info *param_info)
+{
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_base_workgroup_id:
+      param_info->offset = IR3_DP_BASE_GROUP_X;
+      break;
+   case nir_intrinsic_load_num_workgroups:
+      param_info->offset = IR3_DP_NUM_WORK_GROUPS_X;
+      break;
+   case nir_intrinsic_load_workgroup_size:
+      param_info->offset = IR3_DP_LOCAL_GROUP_SIZE_X;
+      break;
+   case nir_intrinsic_load_subgroup_size:
+      assert(shader->info.stage == MESA_SHADER_COMPUTE ||
+             shader->info.stage == MESA_SHADER_FRAGMENT);
+      if (shader->info.stage == MESA_SHADER_COMPUTE) {
+         param_info->offset = IR3_DP_CS_SUBGROUP_SIZE;
+      } else {
+         param_info->offset = IR3_DP_FS_SUBGROUP_SIZE;
+      }
+      break;
+   case nir_intrinsic_load_subgroup_id_shift_ir3:
+      param_info->offset = IR3_DP_SUBGROUP_ID_SHIFT;
+      break;
+   case nir_intrinsic_load_work_dim:
+      param_info->offset = IR3_DP_WORK_DIM;
+      break;
+   case nir_intrinsic_load_base_vertex:
+   case nir_intrinsic_load_first_vertex:
+      param_info->offset = IR3_DP_VTXID_BASE;
+      break;
+   case nir_intrinsic_load_is_indexed_draw:
+      param_info->offset = IR3_DP_IS_INDEXED_DRAW;
+      break;
+   case nir_intrinsic_load_draw_id:
+      param_info->offset = IR3_DP_DRAWID;
+      break;
+   case nir_intrinsic_load_base_instance:
+      param_info->offset = IR3_DP_INSTID_BASE;
+      break;
+   case nir_intrinsic_load_user_clip_plane: {
+      uint32_t idx = nir_intrinsic_ucp_id(intr);
+      param_info->offset = IR3_DP_UCP0_X + 4 * idx;
+      break;
+   }
+   case nir_intrinsic_load_tess_level_outer_default:
+      param_info->offset = IR3_DP_HS_DEFAULT_OUTER_LEVEL_X;
+      break;
+   case nir_intrinsic_load_tess_level_inner_default:
+      param_info->offset = IR3_DP_HS_DEFAULT_INNER_LEVEL_X;
+      break;
+   case nir_intrinsic_load_frag_size_ir3:
+      param_info->offset = IR3_DP_FS_FRAG_SIZE;
+      break;
+   case nir_intrinsic_load_frag_offset_ir3:
+      param_info->offset = IR3_DP_FS_FRAG_OFFSET;
+      break;
+   case nir_intrinsic_load_frag_invocation_count:
+      param_info->offset = IR3_DP_FS_FRAG_INVOCATION_COUNT;
+      break;
+   default:
+      return false;
+   }
+
+   return true;
+}
+
 static void
 ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader,
                            struct ir3_const_state *layout)
 {
@@ -877,84 +1002,22 @@ ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader, st
             layout->image_dims.count += 3; /* three const per */
          }
          break;
-      case nir_intrinsic_load_base_vertex:
-      case nir_intrinsic_load_first_vertex:
-         layout->num_driver_params =
-            MAX2(layout->num_driver_params, IR3_DP_VTXID_BASE + 1);
-         break;
-      case nir_intrinsic_load_is_indexed_draw:
-         layout->num_driver_params =
-            MAX2(layout->num_driver_params, IR3_DP_IS_INDEXED_DRAW + 1);
-         break;
-      case nir_intrinsic_load_base_instance:
-         layout->num_driver_params =
-            MAX2(layout->num_driver_params, IR3_DP_INSTID_BASE + 1);
-         break;
-      case nir_intrinsic_load_user_clip_plane:
-         idx = nir_intrinsic_ucp_id(intr);
-         layout->num_driver_params = MAX2(layout->num_driver_params,
-                                          IR3_DP_UCP0_X + (idx + 1) * 4);
-         break;
-      case nir_intrinsic_load_num_workgroups:
-         layout->num_driver_params =
-            MAX2(layout->num_driver_params, IR3_DP_NUM_WORK_GROUPS_Z + 1);
-         break;
       case nir_intrinsic_load_workgroup_id:
         if (!compiler->has_shared_regfile) {
            layout->num_driver_params =
               MAX2(layout->num_driver_params, IR3_DP_WORKGROUP_ID_Z + 1);
         }
         break;
-      case nir_intrinsic_load_workgroup_size:
-         layout->num_driver_params = MAX2(layout->num_driver_params,
-                                          IR3_DP_LOCAL_GROUP_SIZE_Z + 1);
-         break;
-      case nir_intrinsic_load_base_workgroup_id:
-         layout->num_driver_params =
-            MAX2(layout->num_driver_params, IR3_DP_BASE_GROUP_Z + 1);
-         break;
-      case nir_intrinsic_load_subgroup_size: {
-         assert(shader->info.stage == MESA_SHADER_COMPUTE ||
-                shader->info.stage == MESA_SHADER_FRAGMENT);
-         enum ir3_driver_param size = shader->info.stage == MESA_SHADER_COMPUTE ?
-            IR3_DP_CS_SUBGROUP_SIZE : IR3_DP_FS_SUBGROUP_SIZE;
-         layout->num_driver_params =
-            MAX2(layout->num_driver_params, size + 1);
-         break;
-      }
-      case nir_intrinsic_load_subgroup_id_shift_ir3:
-         layout->num_driver_params =
-            MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_ID_SHIFT + 1);
-         break;
-      case nir_intrinsic_load_draw_id:
-         layout->num_driver_params =
-            MAX2(layout->num_driver_params, IR3_DP_DRAWID + 1);
-         break;
-      case nir_intrinsic_load_tess_level_outer_default:
-         layout->num_driver_params = MAX2(layout->num_driver_params,
-                                          IR3_DP_HS_DEFAULT_OUTER_LEVEL_W + 1);
-         break;
-      case nir_intrinsic_load_tess_level_inner_default:
-         layout->num_driver_params = MAX2(layout->num_driver_params,
-                                          IR3_DP_HS_DEFAULT_INNER_LEVEL_Y + 1);
-         break;
-      case nir_intrinsic_load_frag_size_ir3:
-         layout->num_driver_params = MAX2(layout->num_driver_params,
-                                          IR3_DP_FS_FRAG_SIZE + 2 +
-                                          (nir_intrinsic_range(intr) - 1) * 4);
-         break;
-      case nir_intrinsic_load_frag_offset_ir3:
-         layout->num_driver_params = MAX2(layout->num_driver_params,
-                                          IR3_DP_FS_FRAG_OFFSET + 2 +
-                                          (nir_intrinsic_range(intr) - 1) * 4);
-         break;
-      case nir_intrinsic_load_frag_invocation_count:
-         layout->num_driver_params = MAX2(layout->num_driver_params,
-                                          IR3_DP_FS_FRAG_INVOCATION_COUNT + 1);
-         break;
       default:
          break;
       }
+
+      struct driver_param_info param_info;
+      if (ir3_get_driver_param_info(shader, intr, &param_info)) {
+         layout->num_driver_params =
+            MAX2(layout->num_driver_params,
+                 param_info.offset + nir_intrinsic_dest_components(intr));
+      }
    }
 }
@@ -999,7 +1056,7 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
       const_state->preamble_size;
    unsigned ptrsz = ir3_pointer_size(compiler);
 
-   if (const_state->num_ubos > 0) {
+   if (const_state->num_ubos > 0 && compiler->gen < 6) {
       const_state->offsets.ubo = constoff;
       constoff += align(const_state->num_ubos * ptrsz, 4) / 4;
    }
@@ -1043,6 +1100,26 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
       constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
    }
 
+   if (!compiler->load_shader_consts_via_preamble) {
+      switch (v->type) {
+      case MESA_SHADER_TESS_CTRL:
+      case MESA_SHADER_TESS_EVAL:
+         const_state->offsets.primitive_param = constoff;
+         constoff += 2;
+
+         const_state->offsets.primitive_map = constoff;
+         break;
+      case MESA_SHADER_GEOMETRY:
+         const_state->offsets.primitive_param = constoff;
+         constoff += 1;
+
+         const_state->offsets.primitive_map = constoff;
+         break;
+      default:
+         break;
+      }
+   }
+
    switch (v->type) {
    case MESA_SHADER_VERTEX:
       const_state->offsets.primitive_param = constoff;
@@ -1050,17 +1127,9 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
       break;
    case MESA_SHADER_TESS_CTRL:
    case MESA_SHADER_TESS_EVAL:
-      const_state->offsets.primitive_param = constoff;
-      constoff += 2;
-
-      const_state->offsets.primitive_map = constoff;
       constoff += DIV_ROUND_UP(v->input_size, 4);
       break;
    case MESA_SHADER_GEOMETRY:
-      const_state->offsets.primitive_param = constoff;
-      constoff += 1;
-
-      const_state->offsets.primitive_map = constoff;
       constoff += DIV_ROUND_UP(v->input_size, 4);
       break;
    default:
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index d311096c2d3..af1ad07d93d 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -42,6 +42,8 @@ bool ir3_nir_lower_load_barycentric_at_sample(nir_shader *shader);
 bool ir3_nir_lower_load_barycentric_at_offset(nir_shader *shader);
 bool ir3_nir_lower_push_consts_to_preamble(nir_shader *nir,
                                            struct ir3_shader_variant *v);
+bool ir3_nir_lower_driver_params_to_ubo(nir_shader *nir,
+                                        struct ir3_shader_variant *v);
 bool ir3_nir_move_varying_inputs(nir_shader *shader);
 int ir3_nir_coord_offset(nir_def *ssa);
 bool ir3_nir_lower_tex_prefetch(nir_shader *shader);
@@ -88,6 +90,23 @@ nir_def *ir3_nir_try_propagate_bit_shift(nir_builder *b,
 
 bool ir3_nir_opt_subgroups(nir_shader *nir, struct ir3_shader_variant *v);
 
+nir_def *ir3_get_driver_ubo(nir_builder *b, struct ir3_driver_ubo *ubo);
+nir_def *ir3_load_driver_ubo(nir_builder *b, unsigned components,
+                             struct ir3_driver_ubo *ubo,
+                             unsigned offset);
+nir_def *ir3_load_driver_ubo_indirect(nir_builder *b, unsigned components,
+                                      struct ir3_driver_ubo *ubo,
+                                      unsigned base, nir_def *offset,
+                                      unsigned range);
+
+struct driver_param_info {
+   uint32_t offset;
+};
+
+bool ir3_get_driver_param_info(const nir_shader *shader,
+                               nir_intrinsic_instr *intr,
+                               struct driver_param_info *param_info);
+
 static inline nir_intrinsic_instr *
 ir3_bindless_resource(nir_src src)
 {
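Taken together, ir3_get_driver_ubo()/ir3_load_driver_ubo() give lowering passes a one-call way to move a value out of the const file and into a driver-managed UBO. A minimal usage sketch (hypothetical pass; load_first_vertex is only an example intrinsic here, and `my_ubo` would live in ir3_const_state in practice):

static bool
lower_my_param(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
   struct ir3_driver_ubo *my_ubo = data; /* idx must start out as -1 */

   if (intr->intrinsic != nir_intrinsic_load_first_vertex)
      return false;

   b->cursor = nir_before_instr(&intr->instr);

   /* Allocates a UBO index on first use, bumps my_ubo->size, and emits a
    * nir_load_ubo with the align/range info the UBO analysis expects. */
   nir_def *val = ir3_load_driver_ubo(b, 1, my_ubo, 8 /* dword offset */);

   nir_def_rewrite_uses(&intr->def, val);
   nir_instr_remove(&intr->instr);
   return true;
}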
diff --git a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
index b2a849666dd..f364c7cdf9d 100644
--- a/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
+++ b/src/freedreno/ir3/ir3_nir_analyze_ubo_ranges.c
@@ -337,13 +337,18 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
 }
 
 static bool
-copy_ubo_to_uniform(nir_shader *nir, const struct ir3_const_state *const_state)
+copy_ubo_to_uniform(nir_shader *nir, const struct ir3_const_state *const_state,
+                    bool const_data_via_cp)
 {
    const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;
 
-   if (state->num_enabled == 0 ||
-       (state->num_enabled == 1 && !state->range[0].ubo.bindless &&
-        state->range[0].ubo.block == const_state->constant_data_ubo))
+   if (state->num_enabled == 0)
+      return false;
+
+   if (state->num_enabled == 1 &&
+       !state->range[0].ubo.bindless &&
+       state->range[0].ubo.block == const_state->consts_ubo.idx &&
+       const_data_via_cp)
       return false;
 
    nir_function_impl *preamble = nir_shader_get_preamble(nir);
@@ -358,7 +363,8 @@ copy_ubo_to_uniform(nir_shader *nir, const struct ir3_const_state *const_state,
        * the CP do it for us.
        */
       if (!range->ubo.bindless &&
-          range->ubo.block == const_state->constant_data_ubo)
+          range->ubo.block == const_state->consts_ubo.idx &&
+          const_data_via_cp)
          continue;
 
       nir_def *ubo = nir_imm_int(b, range->ubo.block);
@@ -502,7 +508,8 @@ ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v)
    nir->info.num_ubos = num_ubos;
 
    if (compiler->has_preamble && push_ubos)
-      progress |= copy_ubo_to_uniform(nir, const_state);
+      progress |= copy_ubo_to_uniform(
+         nir, const_state, !compiler->load_shader_consts_via_preamble);
 
    return progress;
 }
@@ -584,15 +591,6 @@ ir3_nir_lower_load_const_instr(nir_builder *b, nir_instr *in_instr, void *data)
    struct ir3_const_state *const_state = data;
    nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr);
 
-   /* Pick a UBO index to use as our constant data. Skip UBO 0 since that's
-    * reserved for gallium's cb0.
-    */
-   if (const_state->constant_data_ubo == -1) {
-      if (b->shader->info.num_ubos == 0)
-         b->shader->info.num_ubos++;
-      const_state->constant_data_ubo = b->shader->info.num_ubos++;
-   }
-
    unsigned num_components = instr->num_components;
    unsigned bit_size = instr->def.bit_size;
    if (instr->def.bit_size == 16) {
@@ -606,7 +604,7 @@ ir3_nir_lower_load_const_instr(nir_builder *b, nir_instr *in_instr, void *data)
       bit_size = 32;
    }
    unsigned base = nir_intrinsic_base(instr);
-   nir_def *index = nir_imm_int(b, const_state->constant_data_ubo);
+   nir_def *index = ir3_get_driver_ubo(b, &const_state->consts_ubo);
    nir_def *offset =
       nir_iadd_imm(b, instr->src[0].ssa, base);
@@ -640,8 +638,6 @@ ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v)
 {
    struct ir3_const_state *const_state = ir3_const_state(v);
 
-   const_state->constant_data_ubo = -1;
-
    bool progress = nir_shader_lower_instructions(
       nir, ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr,
       const_state);
diff --git a/src/freedreno/ir3/ir3_nir_lower_driver_params_to_ubo.c b/src/freedreno/ir3/ir3_nir_lower_driver_params_to_ubo.c
new file mode 100644
index 00000000000..e650dc4c83d
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir_lower_driver_params_to_ubo.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright © 2023 Igalia S.L.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "util/u_math.h"
+#include "ir3_compiler.h"
+#include "ir3_nir.h"
+
+static bool
+lower_driver_param_to_ubo(nir_builder *b, nir_intrinsic_instr *intr, void *in)
+{
+   struct ir3_const_state *const_state = in;
+
+   if (b->shader->info.stage == MESA_SHADER_VERTEX)
+      return false;
+
+   unsigned components = nir_intrinsic_dest_components(intr);
+
+   b->cursor = nir_before_instr(&intr->instr);
+
+   nir_def *result;
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_primitive_location_ir3:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_map_ubo,
+                                   nir_intrinsic_driver_location(intr));
+      break;
+   case nir_intrinsic_load_vs_primitive_stride_ir3:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_param_ubo, 0);
+      break;
+   case nir_intrinsic_load_vs_vertex_stride_ir3:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_param_ubo, 1);
+      break;
+   case nir_intrinsic_load_hs_patch_stride_ir3:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_param_ubo, 2);
+      break;
+   case nir_intrinsic_load_patch_vertices_in:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_param_ubo, 3);
+      break;
+   case nir_intrinsic_load_tess_param_base_ir3:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_param_ubo, 4);
+      break;
+   case nir_intrinsic_load_tess_factor_base_ir3:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_param_ubo, 6);
+      break;
+   default: {
+      struct driver_param_info param_info;
+      if (!ir3_get_driver_param_info(b->shader, intr, &param_info))
+         return false;
+
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->driver_params_ubo,
+                                   param_info.offset);
+   }
+   }
+
+   nir_instr_remove(&intr->instr);
+   nir_def_rewrite_uses(&intr->def, result);
+
+   return true;
+}
+
+bool
+ir3_nir_lower_driver_params_to_ubo(nir_shader *nir,
+                                   struct ir3_shader_variant *v)
+{
+   bool result = nir_shader_intrinsics_pass(
+      nir, lower_driver_param_to_ubo,
+      nir_metadata_block_index | nir_metadata_dominance, ir3_const_state(v));
+
+   return result;
+}
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c
index d4c64e242cb..bf7f6a7c71f 100644
--- a/src/freedreno/ir3/ir3_shader.c
+++ b/src/freedreno/ir3/ir3_shader.c
@@ -303,6 +303,10 @@ alloc_variant(struct ir3_shader *shader, const struct ir3_shader_key *key,
    if (!v->binning_pass) {
       v->const_state = rzalloc_size(v, sizeof(*v->const_state));
       v->const_state->push_consts_type = shader->options.push_consts_type;
+      v->const_state->consts_ubo.idx = -1;
+      v->const_state->driver_params_ubo.idx = -1;
+      v->const_state->primitive_map_ubo.idx = -1;
+      v->const_state->primitive_param_ubo.idx = -1;
    }
 
    return v;
diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h
index 2f658cf521d..8f087f2dcc2 100644
--- a/src/freedreno/ir3/ir3_shader.h
+++ b/src/freedreno/ir3/ir3_shader.h
@@ -154,6 +154,16 @@ enum ir3_push_consts_type {
    IR3_PUSH_CONSTS_SHARED_PREAMBLE,
 };
 
+/* This represents an internal UBO filled out by the driver. There are a few
+ * common UBOs that must be filled out identically by all drivers, for example
+ * for shader linkage, but drivers can also add their own that they manage
+ * themselves.
+ */
+struct ir3_driver_ubo {
+   int32_t idx;
+   uint32_t size;
+};
+
 /**
  * Describes the layout of shader consts in the const register file.
  *
@@ -186,8 +196,9 @@ struct ir3_const_state {
    unsigned num_ubos;
    unsigned num_driver_params; /* scalar */
 
-   /* UBO that should be mapped to the NIR shader's constant_data (or -1). */
-   int32_t constant_data_ubo;
+   struct ir3_driver_ubo consts_ubo;
+   struct ir3_driver_ubo driver_params_ubo;
+   struct ir3_driver_ubo primitive_map_ubo, primitive_param_ubo;
 
    struct {
       /* user const start at zero */
diff --git a/src/freedreno/ir3/meson.build b/src/freedreno/ir3/meson.build
index 6edae70e6dd..3578bbaf9b8 100644
--- a/src/freedreno/ir3/meson.build
+++ b/src/freedreno/ir3/meson.build
@@ -93,6 +93,7 @@ libfreedreno_ir3_files = files(
   'ir3_nir.h',
   'ir3_nir_analyze_ubo_ranges.c',
   'ir3_nir_lower_64b.c',
+  'ir3_nir_lower_driver_params_to_ubo.c',
   'ir3_nir_lower_load_barycentric_at_sample.c',
   'ir3_nir_lower_load_barycentric_at_offset.c',
   'ir3_nir_lower_push_consts_to_preamble.c',
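Before any of this runs, every ir3_driver_ubo has to start out unallocated; the -1 sentinel is what lets ir3_get_driver_ubo() assign indices lazily on first use. A minimal sketch of that invariant (illustrative only; the patch does this field-by-field in alloc_variant() above and again in tu_shader_init() later):

struct ir3_const_state cs = {
   .consts_ubo          = { .idx = -1 },
   .driver_params_ubo   = { .idx = -1 },
   .primitive_map_ubo   = { .idx = -1 },
   .primitive_param_ubo = { .idx = -1 },
};

After ir3_nir_lower_driver_params_to_ubo() has run, a load_base_vertex intrinsic, for example, has become a nir_load_ubo from driver_params_ubo at dword IR3_DP_VTXID_BASE, and driver_params_ubo.size covers the highest dword read.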
diff --git a/src/freedreno/vulkan/tu_cmd_buffer.cc b/src/freedreno/vulkan/tu_cmd_buffer.cc
index 1141edfab46..6fe343645c7 100644
--- a/src/freedreno/vulkan/tu_cmd_buffer.cc
+++ b/src/freedreno/vulkan/tu_cmd_buffer.cc
@@ -4326,6 +4326,8 @@ tu6_emit_inline_ubo(struct tu_cs *cs,
                     gl_shader_stage type,
                     struct tu_descriptor_state *descriptors)
 {
+   assert(const_state->num_inline_ubos == 0 || !cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble);
+
    /* Emit loads of inline uniforms. These load directly from the uniform's
    * storage space inside the descriptor set.
    */
@@ -4578,7 +4580,7 @@ fs_params_offset(struct tu_cmd_buffer *cmd)
       &cmd->state.program.link[MESA_SHADER_FRAGMENT];
    const struct ir3_const_state *const_state = &link->const_state;
 
-   if (const_state->num_driver_params <= IR3_DP_FS_DYNAMIC)
+   if (const_state->num_driver_params < IR3_DP_FS_DYNAMIC)
       return 0;
 
    if (const_state->offsets.driver_param + IR3_DP_FS_DYNAMIC / 4 >= link->constlen)
@@ -5412,102 +5414,193 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
    const struct tu_shader *shader = cmd->state.shaders[MESA_SHADER_COMPUTE];
    const struct ir3_shader_variant *variant = shader->variant;
    const struct ir3_const_state *const_state = variant->const_state;
-   uint32_t offset = const_state->offsets.driver_param;
    unsigned subgroup_size = variant->info.subgroup_size;
    unsigned subgroup_shift = util_logbase2(subgroup_size);
 
-   if (variant->constlen <= offset)
-      return;
+   if (cmd->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
+      uint32_t num_consts = const_state->driver_params_ubo.size;
+      if (num_consts == 0)
+         return;
 
-   uint32_t num_consts = MIN2(const_state->num_driver_params,
-                              (variant->constlen - offset) * 4);
+      bool direct_indirect_load =
+         !(info->indirect_offset & 0xf) &&
+         !(info->indirect && num_consts > IR3_DP_BASE_GROUP_X);
 
-   if (!info->indirect) {
-      uint32_t driver_params[12] = {
-         [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
-         [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
-         [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
-         [IR3_DP_WORK_DIM] = 0,
-         [IR3_DP_BASE_GROUP_X] = info->offsets[0],
-         [IR3_DP_BASE_GROUP_Y] = info->offsets[1],
-         [IR3_DP_BASE_GROUP_Z] = info->offsets[2],
-         [IR3_DP_CS_SUBGROUP_SIZE] = subgroup_size,
-         [IR3_DP_LOCAL_GROUP_SIZE_X] = 0,
-         [IR3_DP_LOCAL_GROUP_SIZE_Y] = 0,
-         [IR3_DP_LOCAL_GROUP_SIZE_Z] = 0,
-         [IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
-      };
+      uint64_t iova = 0;
 
-      assert(num_consts <= ARRAY_SIZE(driver_params));
+      if (!info->indirect) {
+         uint32_t driver_params[12] = {
+            [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
+            [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
+            [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
+            [IR3_DP_WORK_DIM] = 0,
+            [IR3_DP_BASE_GROUP_X] = info->offsets[0],
+            [IR3_DP_BASE_GROUP_Y] = info->offsets[1],
+            [IR3_DP_BASE_GROUP_Z] = info->offsets[2],
+            [IR3_DP_CS_SUBGROUP_SIZE] = subgroup_size,
+            [IR3_DP_LOCAL_GROUP_SIZE_X] = 0,
+            [IR3_DP_LOCAL_GROUP_SIZE_Y] = 0,
+            [IR3_DP_LOCAL_GROUP_SIZE_Z] = 0,
+            [IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
+         };
 
-      /* push constants */
-      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
-      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
-                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
-                 CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
-      tu_cs_emit(cs, 0);
-      tu_cs_emit(cs, 0);
-      uint32_t i;
-      for (i = 0; i < num_consts; i++)
-         tu_cs_emit(cs, driver_params[i]);
-   } else if (!(info->indirect_offset & 0xf)) {
-      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
-      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
-                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-                 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
-                 CP_LOAD_STATE6_0_NUM_UNIT(1));
-      tu_cs_emit_qw(cs, info->indirect->iova + info->indirect_offset);
-   } else {
-      /* Vulkan guarantees only 4 byte alignment for indirect_offset.
-       * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.
-       */
+         assert(num_consts <= ARRAY_SIZE(driver_params));
 
-      uint64_t indirect_iova = info->indirect->iova + info->indirect_offset;
+         struct tu_cs_memory consts;
+         uint32_t consts_vec4 = DIV_ROUND_UP(num_consts, 4);
+         VkResult result = tu_cs_alloc(&cmd->sub_cs, consts_vec4, 4, &consts);
+         if (result != VK_SUCCESS) {
+            vk_command_buffer_set_error(&cmd->vk, result);
+            return;
+         }
+         memcpy(consts.map, driver_params, num_consts * sizeof(uint32_t));
+         iova = consts.iova;
+      } else if (direct_indirect_load) {
+         iova = info->indirect->iova + info->indirect_offset;
+      } else {
+         /* Vulkan guarantees only 4 byte alignment for indirect_offset.
+          * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.
+          */
 
-      for (uint32_t i = 0; i < 3; i++) {
-         tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
-         tu_cs_emit(cs, 0);
-         tu_cs_emit_qw(cs, global_iova_arr(cmd, cs_indirect_xyz, i));
-         tu_cs_emit_qw(cs, indirect_iova + i * 4);
+         uint64_t indirect_iova = info->indirect->iova + info->indirect_offset;
+
+         for (uint32_t i = 0; i < 3; i++) {
+            tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
+            tu_cs_emit(cs, 0);
+            tu_cs_emit_qw(cs, global_iova_arr(cmd, cs_indirect_xyz, i));
+            tu_cs_emit_qw(cs, indirect_iova + i * sizeof(uint32_t));
+         }
+
+         /* Fill out IR3_DP_CS_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for
+          * indirect dispatch.
+          */
+         if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
+            uint32_t indirect_driver_params[8] = {
+               0, 0, 0, subgroup_size,
+               0, 0, 0, subgroup_shift,
+            };
+            bool emit_local = num_consts > IR3_DP_LOCAL_GROUP_SIZE_X;
+            uint32_t emit_size = emit_local ? 8 : 4;
+
+            tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + emit_size);
+            tu_cs_emit_qw(cs, global_iova_arr(cmd, cs_indirect_xyz, 0) + 4 * sizeof(uint32_t));
+            for (uint32_t i = 0; i < emit_size; i++) {
+               tu_cs_emit(cs, indirect_driver_params[i]);
+            }
+         }
+
+         tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
+         tu_emit_event_write(cmd, cs, FD_CACHE_INVALIDATE);
+
+         iova = global_iova(cmd, cs_indirect_xyz[0]);
       }
 
-      tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
-      tu_emit_event_write(cmd, cs, FD_CACHE_INVALIDATE);
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 5);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(const_state->driver_params_ubo.idx) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      int size_vec4s = DIV_ROUND_UP(num_consts, 4);
+      tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
 
-      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
-      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
-                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-                 CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
-                 CP_LOAD_STATE6_0_NUM_UNIT(1));
-      tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0]));
-   }
+   } else {
+      uint32_t offset = const_state->offsets.driver_param;
+      if (variant->constlen <= offset)
+         return;
 
-   /* Fill out IR3_DP_CS_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for
-    * indirect dispatch.
-    */
-   if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
-      bool emit_local = num_consts > IR3_DP_LOCAL_GROUP_SIZE_X;
-      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7 + (emit_local ? 4 : 0));
-      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) |
-                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
-                 CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4));
-      tu_cs_emit_qw(cs, 0);
-      tu_cs_emit(cs, 0); /* BASE_GROUP_X */
-      tu_cs_emit(cs, 0); /* BASE_GROUP_Y */
-      tu_cs_emit(cs, 0); /* BASE_GROUP_Z */
-      tu_cs_emit(cs, subgroup_size);
-      if (emit_local) {
-         assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4));
-         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */
-         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */
-         tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */
-         tu_cs_emit(cs, subgroup_shift);
+      uint32_t num_consts = MIN2(const_state->num_driver_params,
+                                 (variant->constlen - offset) * 4);
+
+      if (!info->indirect) {
+         uint32_t driver_params[12] = {
+            [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
+            [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
+            [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
+            [IR3_DP_WORK_DIM] = 0,
+            [IR3_DP_BASE_GROUP_X] = info->offsets[0],
+            [IR3_DP_BASE_GROUP_Y] = info->offsets[1],
+            [IR3_DP_BASE_GROUP_Z] = info->offsets[2],
+            [IR3_DP_CS_SUBGROUP_SIZE] = subgroup_size,
+            [IR3_DP_LOCAL_GROUP_SIZE_X] = 0,
+            [IR3_DP_LOCAL_GROUP_SIZE_Y] = 0,
+            [IR3_DP_LOCAL_GROUP_SIZE_Z] = 0,
+            [IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
+         };
+
+         assert(num_consts <= ARRAY_SIZE(driver_params));
+
+         /* push constants */
+         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
+         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
+                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                    CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+                    CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
+         tu_cs_emit(cs, 0);
+         tu_cs_emit(cs, 0);
+         uint32_t i;
+         for (i = 0; i < num_consts; i++)
+            tu_cs_emit(cs, driver_params[i]);
+      } else if (!(info->indirect_offset & 0xf)) {
+         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
+         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
+                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+                    CP_LOAD_STATE6_0_NUM_UNIT(1));
+         tu_cs_emit_qw(cs, info->indirect->iova + info->indirect_offset);
+      } else {
+         /* Vulkan guarantees only 4 byte alignment for indirect_offset.
+          * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.
+          */
+
+         uint64_t indirect_iova = info->indirect->iova + info->indirect_offset;
+
+         for (uint32_t i = 0; i < 3; i++) {
+            tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
+            tu_cs_emit(cs, 0);
+            tu_cs_emit_qw(cs, global_iova_arr(cmd, cs_indirect_xyz, i));
+            tu_cs_emit_qw(cs, indirect_iova + i * 4);
+         }
+
+         tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
+         tu_emit_event_write(cmd, cs, FD_CACHE_INVALIDATE);
+
+         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
+         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
+                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+                    CP_LOAD_STATE6_0_NUM_UNIT(1));
+         tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0]));
+      }
+
+      /* Fill out IR3_DP_CS_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for
+       * indirect dispatch.
+       */
+      if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
+         bool emit_local = num_consts > IR3_DP_LOCAL_GROUP_SIZE_X;
+         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7 + (emit_local ? 4 : 0));
+         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) |
+                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                    CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+                    CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4));
+         tu_cs_emit_qw(cs, 0);
+         tu_cs_emit(cs, 0); /* BASE_GROUP_X */
+         tu_cs_emit(cs, 0); /* BASE_GROUP_Y */
+         tu_cs_emit(cs, 0); /* BASE_GROUP_Z */
+         tu_cs_emit(cs, subgroup_size);
+         if (emit_local) {
+            assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4));
+            tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */
+            tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */
+            tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */
+            tu_cs_emit(cs, subgroup_shift);
+         }
       }
    }
 }
diff --git a/src/freedreno/vulkan/tu_cs.cc b/src/freedreno/vulkan/tu_cs.cc
index 8fb6d3204b4..a52109bb396 100644
--- a/src/freedreno/vulkan/tu_cs.cc
+++ b/src/freedreno/vulkan/tu_cs.cc
@@ -507,6 +507,26 @@ tu_cs_reset(struct tu_cs *cs)
    cs->entry_count = 0;
 }
 
+uint64_t
+tu_cs_emit_data_nop(struct tu_cs *cs,
+                    const uint32_t *data,
+                    uint32_t size,
+                    uint32_t align_dwords)
+{
+   uint32_t total_size = size + (align_dwords - 1);
+   tu_cs_emit_pkt7(cs, CP_NOP, total_size);
+
+   uint64_t iova = tu_cs_get_cur_iova(cs);
+   uint64_t iova_aligned = align64(iova, align_dwords * sizeof(uint32_t));
+   size_t offset = (iova_aligned - iova) / sizeof(uint32_t);
+   cs->cur += offset;
+   memcpy(cs->cur, data, size * sizeof(uint32_t));
+
+   cs->cur += total_size - offset;
+
+   return iova + offset * sizeof(uint32_t);
+}
+
 void
 tu_cs_emit_debug_string(struct tu_cs *cs, const char *string, int len)
 {
diff --git a/src/freedreno/vulkan/tu_cs.h b/src/freedreno/vulkan/tu_cs.h
index 0d99fe7b1c5..89e28e83f91 100644
--- a/src/freedreno/vulkan/tu_cs.h
+++ b/src/freedreno/vulkan/tu_cs.h
@@ -480,6 +480,12 @@ tu_cond_exec_end(struct tu_cs *cs)
    }
 }
 
+uint64_t
+tu_cs_emit_data_nop(struct tu_cs *cs,
+                    const uint32_t *data,
+                    uint32_t size,
+                    uint32_t align_dwords);
+
 /* Temporary struct for tracking a register state to be written, used by
  * a6xx-pack.h and tu_cs_emit_regs()
  */
diff --git a/src/freedreno/vulkan/tu_device.cc b/src/freedreno/vulkan/tu_device.cc
index d3ade679798..788e0c2ca57 100644
--- a/src/freedreno/vulkan/tu_device.cc
+++ b/src/freedreno/vulkan/tu_device.cc
@@ -231,7 +231,7 @@ get_device_extensions(const struct tu_physical_device *device,
       .EXT_extended_dynamic_state3 = true,
       .EXT_external_memory_dma_buf = true,
       .EXT_filter_cubic = device->info->a6xx.has_tex_filter_cubic,
-      .EXT_fragment_density_map = true,
+      .EXT_fragment_density_map = !device->info->a7xx.load_shader_consts_via_preamble,
       .EXT_global_priority = true,
       .EXT_global_priority_query = true,
       .EXT_graphics_pipeline_library = true,
diff --git a/src/freedreno/vulkan/tu_device.h b/src/freedreno/vulkan/tu_device.h
index e27cba5ca61..ffa680bba95 100644
--- a/src/freedreno/vulkan/tu_device.h
+++ b/src/freedreno/vulkan/tu_device.h
@@ -196,7 +196,7 @@ struct tu6_global
       uint32_t pad[7];
    } flush_base[4];
 
-   alignas(16) uint32_t cs_indirect_xyz[3];
+   alignas(16) uint32_t cs_indirect_xyz[12];
 
    volatile uint32_t vtx_stats_query_not_running;
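tu_cs_emit_data_nop() hides a payload inside a CP_NOP packet and hands back an aligned iova that CP_LOAD_STATE6 can then point at as a UBO base. A standalone model of the padding math, for illustration (it assumes the same align64 semantics as the code above):

#include <stdint.h>

static uint32_t
nop_padding_dwords(uint64_t iova, uint32_t align_dwords)
{
   /* Round the post-header iova up to the requested dword alignment. */
   uint64_t align_bytes  = align_dwords * sizeof(uint32_t);
   uint64_t iova_aligned = (iova + align_bytes - 1) & ~(align_bytes - 1);
   return (uint32_t)((iova_aligned - iova) / sizeof(uint32_t));
}

For example, nop_padding_dwords(0x1008, 4) == 2: two NOP dwords are skipped, the payload is memcpy'd at 0x1010, and that aligned address is what the function returns; total_size = size + (align_dwords - 1) reserves enough room for the worst case.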
diff --git a/src/freedreno/vulkan/tu_pipeline.cc b/src/freedreno/vulkan/tu_pipeline.cc
index 7d445c293e8..475762b0985 100644
--- a/src/freedreno/vulkan/tu_pipeline.cc
+++ b/src/freedreno/vulkan/tu_pipeline.cc
@@ -408,22 +408,53 @@ tu6_emit_dynamic_offset(struct tu_cs *cs,
                         const struct tu_program_state *program)
 {
    const struct tu_physical_device *phys_dev = cs->device->physical_device;
-   if (!xs || shader->const_state.dynamic_offset_loc == UINT32_MAX)
+
+   if (!xs)
       return;
 
-   tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets);
-   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) |
-              CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-              CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-              CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
-              CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4)));
-   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
-   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+   if (cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
+      if (shader->const_state.dynamic_offsets_ubo.size == 0)
+         return;
 
-   for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
-      unsigned dynamic_offset_start =
-         program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
-      tu_cs_emit(cs, dynamic_offset_start);
+      uint32_t offsets[MAX_SETS];
+      for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
+         unsigned dynamic_offset_start =
+            program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
+         offsets[i] = dynamic_offset_start;
+      }
+
+      /* A7XX TODO: Emit data via sub_cs instead of NOP */
+      uint64_t iova = tu_cs_emit_data_nop(cs, offsets, phys_dev->usable_sets, 4);
+      uint32_t offset = shader->const_state.dynamic_offsets_ubo.idx;
+
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 5);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      int size_vec4s = DIV_ROUND_UP(phys_dev->usable_sets, 4);
+      tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
+   } else {
+      if (shader->const_state.dynamic_offset_loc == UINT32_MAX)
+         return;
+
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4)));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+
+      for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
+         unsigned dynamic_offset_start =
+            program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
+         tu_cs_emit(cs, dynamic_offset_start);
+      }
    }
 }
@@ -569,24 +600,77 @@ tu6_setup_streamout(struct tu_cs *cs,
    }
 }
 
+enum tu_geom_consts_type
+{
+   TU_CONSTS_PRIMITIVE_MAP,
+   TU_CONSTS_PRIMITIVE_PARAM,
+};
+
 static void
-tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base,
-               enum a6xx_state_block block, uint32_t offset,
-               uint32_t size, const uint32_t *dwords) {
+tu6_emit_const(struct tu_cs *cs, uint32_t opcode, enum tu_geom_consts_type type,
+               const struct ir3_const_state *const_state,
+               unsigned constlen, enum a6xx_state_block block,
+               uint32_t offset, uint32_t size, const uint32_t *dwords) {
    assert(size % 4 == 0);
-
-   tu_cs_emit_pkt7(cs, opcode, 3 + size);
-   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
-              CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-              CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-              CP_LOAD_STATE6_0_STATE_BLOCK(block) |
-              CP_LOAD_STATE6_0_NUM_UNIT(size / 4));
-
-   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
-   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
    dwords = (uint32_t *)&((uint8_t *)dwords)[offset];
 
-   tu_cs_emit_array(cs, dwords, size);
+   if (block == SB6_VS_SHADER || !cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
+      uint32_t base;
+      switch (type) {
+      case TU_CONSTS_PRIMITIVE_MAP:
+         base = const_state->offsets.primitive_map;
+         break;
+      case TU_CONSTS_PRIMITIVE_PARAM:
+         base = const_state->offsets.primitive_param;
+         break;
+      default:
+         unreachable("bad consts type");
+      }
+
+      int32_t adjusted_size = MIN2(base * 4 + size, constlen * 4) - base * 4;
+      if (adjusted_size <= 0)
+         return;
+
+      tu_cs_emit_pkt7(cs, opcode, 3 + adjusted_size);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(block) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(adjusted_size / 4));
+
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+
+      tu_cs_emit_array(cs, dwords, adjusted_size);
+   } else {
+      uint32_t base;
+      switch (type) {
+      case TU_CONSTS_PRIMITIVE_MAP:
+         base = const_state->primitive_map_ubo.idx;
+         break;
+      case TU_CONSTS_PRIMITIVE_PARAM:
+         base = const_state->primitive_param_ubo.idx;
+         break;
+      default:
+         unreachable("bad consts type");
+      }
+      if (base == -1)
+         return;
+
+      /* A7XX TODO: Emit data via sub_cs instead of NOP */
+      uint64_t iova = tu_cs_emit_data_nop(cs, dwords, size, 4);
+
+      tu_cs_emit_pkt7(cs, opcode, 5);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(block) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      int size_vec4s = DIV_ROUND_UP(size, 4);
+      tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
+   }
 }
 
 static void
@@ -596,15 +680,13 @@ tu6_emit_link_map(struct tu_cs *cs,
                   enum a6xx_state_block sb)
 {
    const struct ir3_const_state *const_state = ir3_const_state(consumer);
-   uint32_t base = const_state->offsets.primitive_map;
-   int size = DIV_ROUND_UP(consumer->input_size, 4);
+   uint32_t size = ALIGN(consumer->input_size, 4);
 
-   size = (MIN2(size + base, consumer->constlen) - base) * 4;
-   if (size <= 0)
+   if (size == 0)
       return;
 
-   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, sb, 0, size,
-                  producer->output_loc);
+   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_MAP,
+                  const_state, consumer->constlen, sb, 0, size, producer->output_loc);
 }
 
 static int
@@ -992,8 +1074,8 @@ tu6_emit_vs_params(struct tu_cs *cs,
       0,
       0,
    };
-   uint32_t vs_base = const_state->offsets.primitive_param;
-   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0,
+   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
+                  const_state, constlen, SB6_VS_SHADER, 0,
                   ARRAY_SIZE(vs_params), vs_params);
 }
 
@@ -1018,6 +1100,8 @@ static const enum mesa_vk_dynamic_graphics_state tu_patch_control_points_state[]
    MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS,
 };
 
+#define HS_PARAMS_SIZE 8
+
 template <chip CHIP>
 static unsigned
 tu6_patch_control_points_size(struct tu_device *dev,
@@ -1027,10 +1111,17 @@ tu6_patch_control_points_size(struct tu_device *dev,
                               const struct tu_program_state *program,
                               uint32_t patch_control_points)
 {
-#define EMIT_CONST_DWORDS(const_dwords) (4 + const_dwords)
-   return EMIT_CONST_DWORDS(4) +
-      EMIT_CONST_DWORDS(program->hs_param_dwords) + 2 + 2 + 2;
+   if (dev->physical_device->info->a7xx.load_shader_consts_via_preamble) {
+#define EMIT_CONST_DWORDS(const_dwords) (5 + const_dwords + 4)
+      return EMIT_CONST_DWORDS(4) +
+         EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
 #undef EMIT_CONST_DWORDS
+   } else {
+#define EMIT_CONST_DWORDS(const_dwords) (4 + const_dwords)
+      return EMIT_CONST_DWORDS(4) +
+         EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
+#undef EMIT_CONST_DWORDS
+   }
 }
 
 template <chip CHIP>
@@ -1056,7 +1147,7 @@ tu6_emit_patch_control_points(struct tu_cs *cs,
    uint64_t tess_factor_iova, tess_param_iova;
    tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);
 
-   uint32_t hs_params[8] = {
+   uint32_t hs_params[HS_PARAMS_SIZE] = {
      vs->variant->output_size * patch_control_points * 4, /* hs primitive stride */
      vs->variant->output_size * 4,                        /* hs vertex stride */
      tcs->variant->output_size,
@@ -1069,9 +1160,10 @@ tu6_emit_patch_control_points(struct tu_cs *cs,
 
    const struct ir3_const_state *hs_const =
       &program->link[MESA_SHADER_TESS_CTRL].const_state;
-   uint32_t hs_base = hs_const->offsets.primitive_param;
-   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0,
-                  program->hs_param_dwords, hs_params);
+   unsigned hs_constlen = program->link[MESA_SHADER_TESS_CTRL].constlen;
+   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
+                  hs_const, hs_constlen, SB6_HS_SHADER, 0,
+                  ARRAY_SIZE(hs_params), hs_params);
 
    uint32_t patch_local_mem_size_16b =
       patch_control_points * vs->variant->output_size / 4;
@@ -1146,10 +1238,9 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs,
         tess_factor_iova >> 32,
      };
 
-      uint32_t ds_base = ds->const_state->offsets.primitive_param;
-      uint32_t ds_param_dwords = MIN2((ds->constlen - ds_base) * 4, ARRAY_SIZE(ds_params));
-      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0,
-                     ds_param_dwords, ds_params);
+      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
+                     ds->const_state, ds->constlen, SB6_DS_SHADER, 0,
+                     ARRAY_SIZE(ds_params), ds_params);
    }
 
    if (gs) {
@@ -1160,8 +1251,8 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs,
         0,
         0,
      };
-      uint32_t gs_base = gs->const_state->offsets.primitive_param;
-      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0,
+      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
+                     gs->const_state, gs->constlen, SB6_GS_SHADER, 0,
                      ARRAY_SIZE(gs_params), gs_params);
    }
 }
@@ -2145,15 +2236,6 @@ tu_emit_program_state(struct tu_cs *sub_cs,
       tu6_emit_vpc(&prog_cs, vs, hs, ds, gs, fs);
    prog->vpc_state = tu_cs_end_draw_state(sub_cs, &prog_cs);
 
-   if (hs) {
-      const struct ir3_const_state *hs_const =
-         &prog->link[MESA_SHADER_TESS_CTRL].const_state;
-      unsigned hs_constlen =
-         prog->link[MESA_SHADER_TESS_CTRL].constlen;
-      uint32_t hs_base = hs_const->offsets.primitive_param;
-      prog->hs_param_dwords = MIN2((hs_constlen - hs_base) * 4, 8);
-   }
-
    const struct ir3_shader_variant *last_shader;
    if (gs)
       last_shader = gs;
diff --git a/src/freedreno/vulkan/tu_pipeline.h b/src/freedreno/vulkan/tu_pipeline.h
index a99675ccd4c..0a932131e6b 100644
--- a/src/freedreno/vulkan/tu_pipeline.h
+++ b/src/freedreno/vulkan/tu_pipeline.h
@@ -93,8 +93,6 @@ struct tu_program_state
    struct tu_draw_state vpc_state;
    struct tu_draw_state fs_state;
 
-   uint32_t hs_param_dwords;
-
    struct tu_push_constant_range shared_consts;
 
    struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];
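The ST6_UBO emission sequence reappears several times in this patch (dynamic offsets and geometry consts above, CS and FS driver params elsewhere). A sketch of it factored into one helper (hypothetical, not part of the patch) makes the packet layout easier to see: a single UBO descriptor (NUM_UNIT(1)) whose two-dword payload is the data iova with the size, in vec4 units, packed into the high dword:

static void
emit_driver_ubo_descriptor(struct tu_cs *cs, uint32_t opcode,
                           enum a6xx_state_block block, uint32_t ubo_idx,
                           uint32_t size_dwords, uint64_t iova)
{
   tu_cs_emit_pkt7(cs, opcode, 5);
   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(ubo_idx) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(block) |
                  CP_LOAD_STATE6_0_NUM_UNIT(1));
   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
   int size_vec4s = DIV_ROUND_UP(size_dwords, 4);
   tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
}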
diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc
index f72f2bd5cf5..6aa4b57f2d1 100644
--- a/src/freedreno/vulkan/tu_shader.cc
+++ b/src/freedreno/vulkan/tu_shader.cc
@@ -97,6 +97,13 @@ tu_spirv_to_nir(struct tu_device *dev,
    if (result != VK_SUCCESS)
       return NULL;
 
+   /* ir3 uses num_ubos and num_ssbos to track the number of *bindful*
+    * UBOs/SSBOs, but spirv_to_nir sets them to the total number of objects
+    * which is useless for us, so reset them here.
+    */
+   nir->info.num_ubos = 0;
+   nir->info.num_ssbos = 0;
+
    if (TU_DEBUG(NIR)) {
       fprintf(stderr, "translated nir:\n");
       nir_print_shader(nir, stderr);
@@ -175,6 +182,7 @@ lower_vulkan_resource_index(struct tu_device *dev, nir_builder *b,
                             struct tu_shader *shader,
                             const struct tu_pipeline_layout *layout)
 {
+   struct ir3_compiler *compiler = dev->compiler;
    nir_def *vulkan_idx = instr->src[0].ssa;
 
    unsigned set = nir_intrinsic_desc_set(instr);
@@ -209,9 +217,15 @@ lower_vulkan_resource_index(struct tu_device *dev, nir_builder *b,
        * get it from the const file instead.
        */
       base = nir_imm_int(b, binding_layout->dynamic_offset_offset / (4 * A6XX_TEX_CONST_DWORDS));
-      nir_def *dynamic_offset_start =
-         nir_load_uniform(b, 1, 32, nir_imm_int(b, 0),
-                          .base = shader->const_state.dynamic_offset_loc + set);
+      nir_def *dynamic_offset_start;
+      if (compiler->load_shader_consts_via_preamble) {
+         dynamic_offset_start =
+            ir3_load_driver_ubo(b, 1, &shader->const_state.dynamic_offsets_ubo, set);
+      } else {
+         dynamic_offset_start =
+            nir_load_uniform(b, 1, 32, nir_imm_int(b, 0),
+                             .base = shader->const_state.dynamic_offset_loc + set);
+      }
       base = nir_iadd(b, base, dynamic_offset_start);
    } else {
      base = nir_imm_int(b, (offset +
@@ -271,7 +285,7 @@ lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin)
    nir_instr_remove(&intrin->instr);
 }
 
-static void
+static bool
 lower_ssbo_ubo_intrinsic(struct tu_device *dev,
                          nir_builder *b, nir_intrinsic_instr *intrin)
 {
@@ -291,6 +305,10 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
       buffer_src = 0;
    }
 
+   /* Don't lower non-bindless UBO loads of driver params */
+   if (intrin->src[buffer_src].ssa->num_components == 1)
+      return false;
+
    nir_scalar scalar_idx = nir_scalar_resolved(intrin->src[buffer_src].ssa, 0);
    nir_def *descriptor_idx = nir_channel(b, intrin->src[buffer_src].ssa, 1);
@@ -310,7 +328,7 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
       nir_def *bindless =
         nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = nir_scalar_as_uint(scalar_idx));
       nir_src_rewrite(&intrin->src[buffer_src], bindless);
-      return;
+      return true;
    }
 
    nir_def *base_idx = nir_channel(b, scalar_idx.def, scalar_idx.comp);
@@ -361,6 +379,7 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
    if (info->has_dest)
       nir_def_rewrite_uses(&intrin->def, result);
    nir_instr_remove(&intrin->instr);
+   return true;
 }
 
 static nir_def *
@@ -461,8 +480,7 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
    case nir_intrinsic_ssbo_atomic:
    case nir_intrinsic_ssbo_atomic_swap:
    case nir_intrinsic_get_ssbo_size:
-      lower_ssbo_ubo_intrinsic(dev, b, instr);
-      return true;
+      return lower_ssbo_ubo_intrinsic(dev, b, instr);
 
    case nir_intrinsic_image_deref_load:
    case nir_intrinsic_image_deref_store:
@@ -473,6 +491,37 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
       lower_image_deref(dev, b, instr, shader, layout);
       return true;
 
+   case nir_intrinsic_load_frag_size_ir3:
+   case nir_intrinsic_load_frag_offset_ir3: {
+      if (!dev->compiler->load_shader_consts_via_preamble)
+         return false;
+
+      enum ir3_driver_param param =
+         instr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
+         IR3_DP_FS_FRAG_SIZE : IR3_DP_FS_FRAG_OFFSET;
+
+      nir_def *view = instr->src[0].ssa;
+      nir_def *result =
+         ir3_load_driver_ubo_indirect(b, 2, &shader->const_state.fdm_ubo,
+                                      param, view, nir_intrinsic_range(instr));
+
+      nir_def_rewrite_uses(&instr->def, result);
+      nir_instr_remove(&instr->instr);
+      return true;
+   }
+   case nir_intrinsic_load_frag_invocation_count: {
+      if (!dev->compiler->load_shader_consts_via_preamble)
+         return false;
+
+      nir_def *result =
+         ir3_load_driver_ubo(b, 1, &shader->const_state.fdm_ubo,
+                             IR3_DP_FS_FRAG_INVOCATION_COUNT);
+
+      nir_def_rewrite_uses(&instr->def, result);
+      nir_instr_remove(&instr->instr);
+      return true;
+   }
+
    default:
       return false;
    }
@@ -1219,6 +1268,7 @@ tu6_emit_xs(struct tu_cs *cs,
 
    unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);
    if (immediate_size > 0) {
+      assert(!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble);
       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
@@ -1231,13 +1281,14 @@ tu6_emit_xs(struct tu_cs *cs,
       tu_cs_emit_array(cs, const_state->immediates, immediate_size);
    }
 
-   if (const_state->constant_data_ubo != -1) {
+   if (const_state->consts_ubo.idx != -1) {
       uint64_t iova = binary_iova + xs->info.constant_data_offset;
+      uint32_t offset = const_state->consts_ubo.idx;
 
       /* Upload UBO state for the constant data. */
       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
       tu_cs_emit(cs,
-                 CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) |
+                 CP_LOAD_STATE6_0_DST_OFF(offset) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
@@ -1252,30 +1303,50 @@ tu6_emit_xs(struct tu_cs *cs,
       /* Upload the constant data to the const file if needed.
       */
       const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;
 
-      for (int i = 0; i < ubo_state->num_enabled; i++) {
-         if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo ||
-             ubo_state->range[i].ubo.bindless) {
-            continue;
+      if (!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
+         for (int i = 0; i < ubo_state->num_enabled; i++) {
+            if (ubo_state->range[i].ubo.block != offset ||
+                ubo_state->range[i].ubo.bindless) {
+               continue;
+            }
+
+            uint32_t start = ubo_state->range[i].start;
+            uint32_t end = ubo_state->range[i].end;
+            uint32_t size = MIN2(end - start,
+                                 (16 * xs->constlen) - ubo_state->range[i].offset);
+
+            tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
+            tu_cs_emit(cs,
+                       CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
+                       CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                       CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+                       CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                       CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
+            tu_cs_emit_qw(cs, iova + start);
         }
-
-         uint32_t start = ubo_state->range[i].start;
-         uint32_t end = ubo_state->range[i].end;
-         uint32_t size = MIN2(end - start,
-                              (16 * xs->constlen) - ubo_state->range[i].offset);
-
-         tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
-         tu_cs_emit(cs,
-                    CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
-                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
-                    CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
-         tu_cs_emit_qw(cs, iova + start);
       }
    }
 
    /* emit statically-known FS driver param */
-   if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) {
+   if (stage == MESA_SHADER_FRAGMENT && const_state->driver_params_ubo.size > 0) {
+      uint32_t data[4] = {xs->info.double_threadsize ? 128 : 64, 0, 0, 0};
+      uint32_t size = ARRAY_SIZE(data);
+
+      /* A7XX TODO: Emit data via sub_cs instead of NOP */
+      uint64_t iova = tu_cs_emit_data_nop(cs, data, size, 4);
+      uint32_t base = const_state->driver_params_ubo.idx;
+
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      int size_vec4s = DIV_ROUND_UP(size, 4);
+      tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
+   } else if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) {
       uint32_t base = const_state->offsets.driver_param;
       int32_t size = DIV_ROUND_UP(MAX2(const_state->num_driver_params, 4), 4);
       size = MAX2(MIN2(size + base, xs->constlen) - base, 0);
@@ -2176,6 +2247,9 @@ tu_shader_init(struct tu_device *dev, const void *key_data, size_t key_size)
    vk_pipeline_cache_object_init(&dev->vk, &shader->base,
                                  &tu_shader_ops, obj_key_data, key_size);
 
+   shader->const_state.fdm_ubo.idx = -1;
+   shader->const_state.dynamic_offsets_ubo.idx = -1;
+
    return shader;
 }
diff --git a/src/freedreno/vulkan/tu_shader.h b/src/freedreno/vulkan/tu_shader.h
index 495c42d962e..029bf992ba4 100644
--- a/src/freedreno/vulkan/tu_shader.h
+++ b/src/freedreno/vulkan/tu_shader.h
@@ -49,6 +49,9 @@ struct tu_const_state
    uint32_t dynamic_offset_loc;
    unsigned num_inline_ubos;
    struct tu_inline_ubo ubos[MAX_INLINE_UBOS];
+
+   struct ir3_driver_ubo fdm_ubo;
+   struct ir3_driver_ubo dynamic_offsets_ubo;
 };
 
 struct tu_shader
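On the fdm_ubo addressing implied by the ir3_load_driver_ubo_indirect() call in lower_intrinsic() above: the base is the driver-param dword and the dynamic view index is scaled by 16 bytes, i.e. one vec4 slot per view. A small model of the resulting byte offset (illustrative only, derived from the nir_imul24/nir_iadd expression in ir3_load_driver_ubo_indirect()):

#include <stdint.h>

static uint32_t
fdm_ubo_byte_offset(uint32_t param_dword, uint32_t view)
{
   /* view * 16 + param_dword * 4, matching
    * nir_iadd(nir_imul24(offset, 16), base * sizeof(uint32_t)). */
   return view * 16 + param_dword * (uint32_t)sizeof(uint32_t);
}

So frag_size for view v would live at dwords IR3_DP_FS_FRAG_SIZE + 4*v and IR3_DP_FS_FRAG_SIZE + 4*v + 1.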
diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_const.cc b/src/gallium/drivers/freedreno/a6xx/fd6_const.cc
index f689e55c32e..ff9e51f51d7 100644
--- a/src/gallium/drivers/freedreno/a6xx/fd6_const.cc
+++ b/src/gallium/drivers/freedreno/a6xx/fd6_const.cc
@@ -208,7 +208,7 @@ fd6_emit_ubos(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
    for (int i = 0; i < num_ubos; i++) {
       /* NIR constant data is packed into the end of the shader.
       */
-      if (i == const_state->constant_data_ubo) {
+      if (i == const_state->consts_ubo.idx) {
          int size_vec4s = DIV_ROUND_UP(v->constant_data_size, 16);
          OUT_RELOC(ring, v->bo, v->info.constant_data_offset,
                    (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32, 0);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_const.h b/src/gallium/drivers/freedreno/ir3/ir3_const.h
index 922601d7f44..fab31130616 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_const.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_const.h
@@ -119,7 +119,7 @@ ir3_emit_constant_data(const struct ir3_shader_variant *v,
    for (unsigned i = 0; i < state->num_enabled; i++) {
       unsigned ubo = state->range[i].ubo.block;
-      if (ubo != const_state->constant_data_ubo)
+      if (ubo != const_state->consts_ubo.idx)
          continue;
 
       uint32_t size = state->range[i].end - state->range[i].start;
@@ -161,7 +161,7 @@ ir3_emit_user_consts(const struct ir3_shader_variant *v,
       assert(!state->range[i].ubo.bindless);
       unsigned ubo = state->range[i].ubo.block;
       if (!(constbuf->enabled_mask & (1 << ubo)) ||
-          ubo == const_state->constant_data_ubo) {
+          ubo == const_state->consts_ubo.idx) {
          continue;
       }
       struct pipe_constant_buffer *cb = &constbuf->cb[ubo];
@@ -218,7 +218,7 @@ ir3_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
    struct fd_bo *bos[params];
 
    for (uint32_t i = 0; i < params; i++) {
-      if (i == const_state->constant_data_ubo) {
+      if (i == const_state->consts_ubo.idx) {
          bos[i] = v->bo;
          offsets[i] = v->info.constant_data_offset;
          continue;