turnip,ir3/a750: Implement consts loading via preamble

A750 expects driver params to be loaded through the preamble. The old path
does work, but has issues when the same LOAD_STATE is reused across several
draw calls (it seems that the LOAD_STATE is executed only for the first
draw call).

To solve this, we now lower driver params to UBOs and let NIR deal with them.
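
At the NIR level, the lowering replaces each driver-param intrinsic with a
load_ubo from an internal driver-params UBO. A condensed, illustrative sketch
of the default case of the new pass (the complete version is
ir3_nir_lower_driver_params_to_ubo.c in the diff below; the function name here
is hypothetical), using only helpers added by this change:

  /* Sketch only: the real pass also routes geometry/tessellation linkage
   * through the primitive_map/primitive_param UBOs. */
  static bool
  lower_one_driver_param(nir_builder *b, nir_intrinsic_instr *intr,
                         struct ir3_const_state *const_state)
  {
     struct driver_param_info info;
     if (!ir3_get_driver_param_info(b->shader, intr, &info))
        return false;

     b->cursor = nir_before_instr(&intr->instr);
     /* Load the value from the driver-params UBO at the param's dword offset. */
     nir_def *val = ir3_load_driver_ubo(b, nir_intrinsic_dest_components(intr),
                                        &const_state->driver_params_ubo,
                                        info.offset);
     nir_def_rewrite_uses(&intr->def, val);
     nir_instr_remove(&intr->instr);
     return true;
  }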

Notes:
- VS params are still loaded via the old path, since the blob does the same
  and no issues have been observed there.
- FDM is not supported at the moment.
- For now, driver params data is emitted via CP_NOP packets, because it's
  tricky to allocate space for the data at this point (it is emitted when we
  are already in sub_cs); see the usage sketch below.
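
To illustrate the CP_NOP note above: the payload dwords are embedded in a
CP_NOP packet in the command stream being recorded, and the returned GPU
address is then bound as a UBO with an ST6_UBO CP_LOAD_STATE. A minimal usage
sketch of the helper added below (tu_cs_emit_data_nop); cs, stage and ubo_idx
are placeholders, not the exact emit paths of this change:

  uint32_t data[4] = { 1, 2, 3, 4 };
  uint64_t iova = tu_cs_emit_data_nop(cs, data, ARRAY_SIZE(data), 4);

  tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
  tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(ubo_idx) |
                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
                 CP_LOAD_STATE6_0_NUM_UNIT(1));
  tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
  tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
  tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(1) << 32));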

Co-Authored-By: Connor Abbott <cwabbott0@gmail.com>
Signed-off-by: Danylo Piliaiev <dpiliaiev@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26934>
Danylo Piliaiev, 2023-11-24 15:27:19 +01:00 (committed by Marge Bot)
commit 76e417ca59 (parent 7429ca3115)
24 changed files with 765 additions and 287 deletions


@@ -211,6 +211,9 @@ struct fd_dev_info {
        * command buffers. We copy this dispatch as is.
        */
       bool cmdbuf_start_a725_quirk;
+
+      bool load_inline_uniforms_via_preamble_ldgk;
+      bool load_shader_consts_via_preamble;
    } a7xx;
 };


@@ -791,6 +791,8 @@ a7xx_740 = A7XXProps(
 a7xx_750 = A7XXProps(
     has_event_write_sample_count = True,
+    load_inline_uniforms_via_preamble_ldgk = True,
+    load_shader_consts_via_preamble = True,
 )
 a730_magic_regs = dict(


@@ -210,6 +210,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
       compiler->has_fs_tex_prefetch = dev_info->a6xx.has_fs_tex_prefetch;
       compiler->stsc_duplication_quirk = dev_info->a7xx.stsc_duplication_quirk;
+      compiler->load_shader_consts_via_preamble = dev_info->a7xx.load_shader_consts_via_preamble;
    } else {
       compiler->max_const_pipeline = 512;
       compiler->max_const_geom = 512;


@@ -249,6 +249,8 @@ struct ir3_compiler {
    bool has_fs_tex_prefetch;
    bool stsc_duplication_quirk;
+
+   bool load_shader_consts_via_preamble;
 };

 void ir3_compiler_destroy(struct ir3_compiler *compiler);


@@ -160,6 +160,9 @@ static bool
 lower_immed(struct ir3_cp_ctx *ctx, struct ir3_instruction *instr, unsigned n,
             struct ir3_register *reg, unsigned new_flags)
 {
+   if (ctx->shader->compiler->load_shader_consts_via_preamble)
+      return false;
+
    if (!(new_flags & IR3_REG_IMMED))
       return false;


@@ -77,6 +77,15 @@
    state->needs_ss_for_const = false;
 }

+static inline void
+apply_sy(struct ir3_instruction *instr,
+         struct ir3_legalize_state *state,
+         bool mergedregs)
+{
+   instr->flags |= IR3_INSTR_SY;
+   regmask_init(&state->needs_sy, mergedregs);
+}
+
 /* We want to evaluate each block from the position of any other
  * predecessor block, in order that the flags set are the union of
  * all possible program paths.
@@ -176,9 +185,7 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
       if ((last_n && is_barrier(last_n)) || n->opc == OPC_SHPE) {
          apply_ss(n, state, mergedregs);
-         n->flags |= IR3_INSTR_SY;
-         regmask_init(&state->needs_sy, mergedregs);
+         apply_sy(n, state, mergedregs);
          last_input_needs_ss = false;
       }
@@ -211,12 +218,13 @@ legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
         }

         if (regmask_get(&state->needs_sy, reg)) {
-           n->flags |= IR3_INSTR_SY;
-           regmask_init(&state->needs_sy, mergedregs);
-        } else if ((reg->flags & IR3_REG_CONST)) {
-           if (state->needs_ss_for_const) {
-              apply_ss(n, state, mergedregs);
-              last_input_needs_ss = false;
-           }
+           apply_sy(n, state, mergedregs);
+        } else if ((reg->flags & IR3_REG_CONST) && state->needs_ss_for_const) {
+           apply_ss(n, state, mergedregs);
+           last_input_needs_ss = false;
         }
      }


@@ -31,6 +31,59 @@
 #include "ir3_nir.h"
 #include "ir3_shader.h"

+nir_def *
+ir3_get_driver_ubo(nir_builder *b, struct ir3_driver_ubo *ubo)
+{
+   /* Pick a UBO index to use as our constant data. Skip UBO 0 since that's
+    * reserved for gallium's cb0.
+    */
+   if (ubo->idx == -1) {
+      if (b->shader->info.num_ubos == 0)
+         b->shader->info.num_ubos++;
+      ubo->idx = b->shader->info.num_ubos++;
+   } else {
+      assert(ubo->idx != 0);
+      /* Binning shader shared ir3_driver_ubo definitions but not shader info */
+      b->shader->info.num_ubos = MAX2(b->shader->info.num_ubos, ubo->idx + 1);
+   }
+
+   return nir_imm_int(b, ubo->idx);
+}
+
+nir_def *
+ir3_load_driver_ubo(nir_builder *b, unsigned components,
+                    struct ir3_driver_ubo *ubo,
+                    unsigned offset)
+{
+   ubo->size = MAX2(ubo->size, offset + components);
+
+   return nir_load_ubo(b, components, 32, ir3_get_driver_ubo(b, ubo),
+                       nir_imm_int(b, offset * sizeof(uint32_t)),
+                       .align_mul = 16,
+                       .align_offset = (offset % 4) * sizeof(uint32_t),
+                       .range_base = offset * sizeof(uint32_t),
+                       .range = components * sizeof(uint32_t));
+}
+
+nir_def *
+ir3_load_driver_ubo_indirect(nir_builder *b, unsigned components,
+                             struct ir3_driver_ubo *ubo,
+                             unsigned base, nir_def *offset,
+                             unsigned range)
+{
+   ubo->size = MAX2(ubo->size, base + components + range * 4);
+
+   return nir_load_ubo(b, components, 32, ir3_get_driver_ubo(b, ubo),
+                       nir_iadd(b, nir_imul24(b, offset, nir_imm_int(b, 16)),
+                                nir_imm_int(b, base * sizeof(uint32_t))),
+                       .align_mul = 16,
+                       .align_offset = (base % 4) * sizeof(uint32_t),
+                       .range_base = base * sizeof(uint32_t),
+                       .range = components * sizeof(uint32_t) +
+                                (range - 1) * 16);
+}
+
 static bool
 ir3_nir_should_vectorize_mem(unsigned align_mul, unsigned align_offset,
                              unsigned bit_size, unsigned num_components,
@@ -742,6 +795,9 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
    OPT(s, ir3_nir_opt_subgroups, so);

+   if (so->compiler->load_shader_consts_via_preamble)
+      progress |= OPT(s, ir3_nir_lower_driver_params_to_ubo, so);
+
    /* Do the preamble before analysing UBO ranges, because it's usually
     * higher-value and because it can result in eliminating some indirect UBO
     * accesses where otherwise we'd have to push the whole range. However we
@@ -844,6 +900,75 @@ ir3_nir_lower_variant(struct ir3_shader_variant *so, nir_shader *s)
       ir3_setup_const_state(s, so, ir3_const_state(so));
 }

+bool
+ir3_get_driver_param_info(const nir_shader *shader, nir_intrinsic_instr *intr,
+                          struct driver_param_info *param_info)
+{
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_base_workgroup_id:
+      param_info->offset = IR3_DP_BASE_GROUP_X;
+      break;
+   case nir_intrinsic_load_num_workgroups:
+      param_info->offset = IR3_DP_NUM_WORK_GROUPS_X;
+      break;
+   case nir_intrinsic_load_workgroup_size:
+      param_info->offset = IR3_DP_LOCAL_GROUP_SIZE_X;
+      break;
+   case nir_intrinsic_load_subgroup_size:
+      assert(shader->info.stage == MESA_SHADER_COMPUTE ||
+             shader->info.stage == MESA_SHADER_FRAGMENT);
+      if (shader->info.stage == MESA_SHADER_COMPUTE) {
+         param_info->offset = IR3_DP_CS_SUBGROUP_SIZE;
+      } else {
+         param_info->offset = IR3_DP_FS_SUBGROUP_SIZE;
+      }
+      break;
+   case nir_intrinsic_load_subgroup_id_shift_ir3:
+      param_info->offset = IR3_DP_SUBGROUP_ID_SHIFT;
+      break;
+   case nir_intrinsic_load_work_dim:
+      param_info->offset = IR3_DP_WORK_DIM;
+      break;
+   case nir_intrinsic_load_base_vertex:
+   case nir_intrinsic_load_first_vertex:
+      param_info->offset = IR3_DP_VTXID_BASE;
+      break;
+   case nir_intrinsic_load_is_indexed_draw:
+      param_info->offset = IR3_DP_IS_INDEXED_DRAW;
+      break;
+   case nir_intrinsic_load_draw_id:
+      param_info->offset = IR3_DP_DRAWID;
+      break;
+   case nir_intrinsic_load_base_instance:
+      param_info->offset = IR3_DP_INSTID_BASE;
+      break;
+   case nir_intrinsic_load_user_clip_plane: {
+      uint32_t idx = nir_intrinsic_ucp_id(intr);
+      param_info->offset = IR3_DP_UCP0_X + 4 * idx;
+      break;
+   }
+   case nir_intrinsic_load_tess_level_outer_default:
+      param_info->offset = IR3_DP_HS_DEFAULT_OUTER_LEVEL_X;
+      break;
+   case nir_intrinsic_load_tess_level_inner_default:
+      param_info->offset = IR3_DP_HS_DEFAULT_INNER_LEVEL_X;
+      break;
+   case nir_intrinsic_load_frag_size_ir3:
+      param_info->offset = IR3_DP_FS_FRAG_SIZE;
+      break;
+   case nir_intrinsic_load_frag_offset_ir3:
+      param_info->offset = IR3_DP_FS_FRAG_OFFSET;
+      break;
+   case nir_intrinsic_load_frag_invocation_count:
+      param_info->offset = IR3_DP_FS_FRAG_INVOCATION_COUNT;
+      break;
+   default:
+      return false;
+   }
+
+   return true;
+}
+
 static void
 ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader, struct ir3_const_state *layout)
 {
@@ -877,84 +1002,16 @@ ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader, st
                layout->image_dims.count += 3; /* three const per */
             }
             break;
-         case nir_intrinsic_load_base_vertex:
-         case nir_intrinsic_load_first_vertex:
-            layout->num_driver_params =
-               MAX2(layout->num_driver_params, IR3_DP_VTXID_BASE + 1);
-            break;
-         case nir_intrinsic_load_is_indexed_draw:
-            layout->num_driver_params =
-               MAX2(layout->num_driver_params, IR3_DP_IS_INDEXED_DRAW + 1);
-            break;
-         case nir_intrinsic_load_base_instance:
-            layout->num_driver_params =
-               MAX2(layout->num_driver_params, IR3_DP_INSTID_BASE + 1);
-            break;
-         case nir_intrinsic_load_user_clip_plane:
-            idx = nir_intrinsic_ucp_id(intr);
-            layout->num_driver_params = MAX2(layout->num_driver_params,
-                                             IR3_DP_UCP0_X + (idx + 1) * 4);
-            break;
-         case nir_intrinsic_load_num_workgroups:
-            layout->num_driver_params =
-               MAX2(layout->num_driver_params, IR3_DP_NUM_WORK_GROUPS_Z + 1);
-            break;
-         case nir_intrinsic_load_workgroup_id:
-            if (!compiler->has_shared_regfile) {
-               layout->num_driver_params =
-                  MAX2(layout->num_driver_params, IR3_DP_WORKGROUP_ID_Z + 1);
-            }
-            break;
-         case nir_intrinsic_load_workgroup_size:
-            layout->num_driver_params = MAX2(layout->num_driver_params,
-                                             IR3_DP_LOCAL_GROUP_SIZE_Z + 1);
-            break;
-         case nir_intrinsic_load_base_workgroup_id:
-            layout->num_driver_params =
-               MAX2(layout->num_driver_params, IR3_DP_BASE_GROUP_Z + 1);
-            break;
-         case nir_intrinsic_load_subgroup_size: {
-            assert(shader->info.stage == MESA_SHADER_COMPUTE ||
-                   shader->info.stage == MESA_SHADER_FRAGMENT);
-            enum ir3_driver_param size = shader->info.stage == MESA_SHADER_COMPUTE ?
-               IR3_DP_CS_SUBGROUP_SIZE : IR3_DP_FS_SUBGROUP_SIZE;
-            layout->num_driver_params =
-               MAX2(layout->num_driver_params, size + 1);
-            break;
-         }
-         case nir_intrinsic_load_subgroup_id_shift_ir3:
-            layout->num_driver_params =
-               MAX2(layout->num_driver_params, IR3_DP_SUBGROUP_ID_SHIFT + 1);
-            break;
-         case nir_intrinsic_load_draw_id:
-            layout->num_driver_params =
-               MAX2(layout->num_driver_params, IR3_DP_DRAWID + 1);
-            break;
-         case nir_intrinsic_load_tess_level_outer_default:
-            layout->num_driver_params = MAX2(layout->num_driver_params,
-                                             IR3_DP_HS_DEFAULT_OUTER_LEVEL_W + 1);
-            break;
-         case nir_intrinsic_load_tess_level_inner_default:
-            layout->num_driver_params = MAX2(layout->num_driver_params,
-                                             IR3_DP_HS_DEFAULT_INNER_LEVEL_Y + 1);
-            break;
-         case nir_intrinsic_load_frag_size_ir3:
-            layout->num_driver_params = MAX2(layout->num_driver_params,
-                                             IR3_DP_FS_FRAG_SIZE + 2 +
-                                             (nir_intrinsic_range(intr) - 1) * 4);
-            break;
-         case nir_intrinsic_load_frag_offset_ir3:
-            layout->num_driver_params = MAX2(layout->num_driver_params,
-                                             IR3_DP_FS_FRAG_OFFSET + 2 +
-                                             (nir_intrinsic_range(intr) - 1) * 4);
-            break;
-         case nir_intrinsic_load_frag_invocation_count:
-            layout->num_driver_params = MAX2(layout->num_driver_params,
-                                             IR3_DP_FS_FRAG_INVOCATION_COUNT + 1);
-            break;
          default:
            break;
         }
+
+         struct driver_param_info param_info;
+         if (ir3_get_driver_param_info(shader, intr, &param_info)) {
+            layout->num_driver_params =
+               MAX2(layout->num_driver_params,
+                    param_info.offset + nir_intrinsic_dest_components(intr));
+         }
      }
   }
 }
@@ -999,7 +1056,7 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
       const_state->preamble_size;
    unsigned ptrsz = ir3_pointer_size(compiler);

-   if (const_state->num_ubos > 0) {
+   if (const_state->num_ubos > 0 && compiler->gen < 6) {
       const_state->offsets.ubo = constoff;
       constoff += align(const_state->num_ubos * ptrsz, 4) / 4;
    }
@@ -1043,6 +1100,26 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
       constoff += align(IR3_MAX_SO_BUFFERS * ptrsz, 4) / 4;
    }

+   if (!compiler->load_shader_consts_via_preamble) {
+      switch (v->type) {
+      case MESA_SHADER_TESS_CTRL:
+      case MESA_SHADER_TESS_EVAL:
+         const_state->offsets.primitive_param = constoff;
+         constoff += 2;
+
+         const_state->offsets.primitive_map = constoff;
+         break;
+      case MESA_SHADER_GEOMETRY:
+         const_state->offsets.primitive_param = constoff;
+         constoff += 1;
+
+         const_state->offsets.primitive_map = constoff;
+         break;
+      default:
+         break;
+      }
+   }
+
    switch (v->type) {
    case MESA_SHADER_VERTEX:
       const_state->offsets.primitive_param = constoff;
@@ -1050,17 +1127,9 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
       break;
    case MESA_SHADER_TESS_CTRL:
    case MESA_SHADER_TESS_EVAL:
-      const_state->offsets.primitive_param = constoff;
-      constoff += 2;
-      const_state->offsets.primitive_map = constoff;
       constoff += DIV_ROUND_UP(v->input_size, 4);
       break;
    case MESA_SHADER_GEOMETRY:
-      const_state->offsets.primitive_param = constoff;
-      constoff += 1;
-      const_state->offsets.primitive_map = constoff;
       constoff += DIV_ROUND_UP(v->input_size, 4);
       break;
    default:


@@ -42,6 +42,8 @@ bool ir3_nir_lower_load_barycentric_at_sample(nir_shader *shader);
 bool ir3_nir_lower_load_barycentric_at_offset(nir_shader *shader);
 bool ir3_nir_lower_push_consts_to_preamble(nir_shader *nir,
                                            struct ir3_shader_variant *v);
+bool ir3_nir_lower_driver_params_to_ubo(nir_shader *nir,
+                                        struct ir3_shader_variant *v);
 bool ir3_nir_move_varying_inputs(nir_shader *shader);
 int ir3_nir_coord_offset(nir_def *ssa);
 bool ir3_nir_lower_tex_prefetch(nir_shader *shader);
@@ -88,6 +90,23 @@ nir_def *ir3_nir_try_propagate_bit_shift(nir_builder *b,
 bool ir3_nir_opt_subgroups(nir_shader *nir, struct ir3_shader_variant *v);

+nir_def *ir3_get_driver_ubo(nir_builder *b, struct ir3_driver_ubo *ubo);
+nir_def *ir3_load_driver_ubo(nir_builder *b, unsigned components,
+                             struct ir3_driver_ubo *ubo,
+                             unsigned offset);
+nir_def *ir3_load_driver_ubo_indirect(nir_builder *b, unsigned components,
+                                      struct ir3_driver_ubo *ubo,
+                                      unsigned base, nir_def *offset,
+                                      unsigned range);
+
+struct driver_param_info {
+   uint32_t offset;
+};
+
+bool ir3_get_driver_param_info(const nir_shader *shader,
+                               nir_intrinsic_instr *intr,
+                               struct driver_param_info *param_info);
+
 static inline nir_intrinsic_instr *
 ir3_bindless_resource(nir_src src)
 {


@@ -337,13 +337,18 @@ lower_ubo_load_to_uniform(nir_intrinsic_instr *instr, nir_builder *b,
 }

 static bool
-copy_ubo_to_uniform(nir_shader *nir, const struct ir3_const_state *const_state)
+copy_ubo_to_uniform(nir_shader *nir, const struct ir3_const_state *const_state,
+                    bool const_data_via_cp)
 {
    const struct ir3_ubo_analysis_state *state = &const_state->ubo_state;

-   if (state->num_enabled == 0 ||
-       (state->num_enabled == 1 && !state->range[0].ubo.bindless &&
-        state->range[0].ubo.block == const_state->constant_data_ubo))
+   if (state->num_enabled == 0)
+      return false;
+
+   if (state->num_enabled == 1 &&
+       !state->range[0].ubo.bindless &&
+       state->range[0].ubo.block == const_state->consts_ubo.idx &&
+       const_data_via_cp)
       return false;

    nir_function_impl *preamble = nir_shader_get_preamble(nir);
@@ -358,7 +363,8 @@ copy_ubo_to_uniform(nir_shader *nir, const struct ir3_const_state *const_state)
        * the CP do it for us.
        */
       if (!range->ubo.bindless &&
-          range->ubo.block == const_state->constant_data_ubo)
+          range->ubo.block == const_state->consts_ubo.idx &&
+          const_data_via_cp)
          continue;

       nir_def *ubo = nir_imm_int(b, range->ubo.block);
@@ -502,7 +508,8 @@ ir3_nir_lower_ubo_loads(nir_shader *nir, struct ir3_shader_variant *v)
    nir->info.num_ubos = num_ubos;

    if (compiler->has_preamble && push_ubos)
-      progress |= copy_ubo_to_uniform(nir, const_state);
+      progress |= copy_ubo_to_uniform(
+         nir, const_state, !compiler->load_shader_consts_via_preamble);

    return progress;
 }
@@ -584,15 +591,6 @@ ir3_nir_lower_load_const_instr(nir_builder *b, nir_instr *in_instr, void *data)
    struct ir3_const_state *const_state = data;
    nir_intrinsic_instr *instr = nir_instr_as_intrinsic(in_instr);

-   /* Pick a UBO index to use as our constant data. Skip UBO 0 since that's
-    * reserved for gallium's cb0.
-    */
-   if (const_state->constant_data_ubo == -1) {
-      if (b->shader->info.num_ubos == 0)
-         b->shader->info.num_ubos++;
-      const_state->constant_data_ubo = b->shader->info.num_ubos++;
-   }
-
    unsigned num_components = instr->num_components;
    unsigned bit_size = instr->def.bit_size;
    if (instr->def.bit_size == 16) {
@@ -606,7 +604,7 @@ ir3_nir_lower_load_const_instr(nir_builder *b, nir_instr *in_instr, void *data)
       bit_size = 32;
    }
    unsigned base = nir_intrinsic_base(instr);
-   nir_def *index = nir_imm_int(b, const_state->constant_data_ubo);
+   nir_def *index = ir3_get_driver_ubo(b, &const_state->consts_ubo);
    nir_def *offset =
       nir_iadd_imm(b, instr->src[0].ssa, base);
@@ -640,8 +638,6 @@ ir3_nir_lower_load_constant(nir_shader *nir, struct ir3_shader_variant *v)
 {
    struct ir3_const_state *const_state = ir3_const_state(v);

-   const_state->constant_data_ubo = -1;
-
    bool progress = nir_shader_lower_instructions(
       nir, ir3_lower_load_const_filter, ir3_nir_lower_load_const_instr,
       const_state);


@@ -0,0 +1,81 @@
+/*
+ * Copyright © 2023 Igalia S.L.
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "compiler/nir/nir.h"
+#include "compiler/nir/nir_builder.h"
+#include "util/u_math.h"
+
+#include "ir3_compiler.h"
+#include "ir3_nir.h"
+
+static bool
+lower_driver_param_to_ubo(nir_builder *b, nir_intrinsic_instr *intr, void *in)
+{
+   struct ir3_const_state *const_state = in;
+
+   if (b->shader->info.stage == MESA_SHADER_VERTEX)
+      return false;
+
+   unsigned components = nir_intrinsic_dest_components(intr);
+   b->cursor = nir_before_instr(&intr->instr);
+
+   nir_def *result;
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_primitive_location_ir3:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_map_ubo,
+                                   nir_intrinsic_driver_location(intr));
+      break;
+   case nir_intrinsic_load_vs_primitive_stride_ir3:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_param_ubo, 0);
+      break;
+   case nir_intrinsic_load_vs_vertex_stride_ir3:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_param_ubo, 1);
+      break;
+   case nir_intrinsic_load_hs_patch_stride_ir3:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_param_ubo, 2);
+      break;
+   case nir_intrinsic_load_patch_vertices_in:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_param_ubo, 3);
+      break;
+   case nir_intrinsic_load_tess_param_base_ir3:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_param_ubo, 4);
+      break;
+   case nir_intrinsic_load_tess_factor_base_ir3:
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->primitive_param_ubo, 6);
+      break;
+   default: {
+      struct driver_param_info param_info;
+      if (!ir3_get_driver_param_info(b->shader, intr, &param_info))
+         return false;
+
+      result = ir3_load_driver_ubo(b, components,
+                                   &const_state->driver_params_ubo,
+                                   param_info.offset);
+   }
+   }
+
+   nir_instr_remove(&intr->instr);
+   nir_def_rewrite_uses(&intr->def, result);
+   return true;
+}
+
+bool
+ir3_nir_lower_driver_params_to_ubo(nir_shader *nir,
+                                   struct ir3_shader_variant *v)
+{
+   bool result = nir_shader_intrinsics_pass(
+      nir, lower_driver_param_to_ubo,
+      nir_metadata_block_index | nir_metadata_dominance, ir3_const_state(v));
+
+   return result;
+}


@@ -303,6 +303,10 @@ alloc_variant(struct ir3_shader *shader, const struct ir3_shader_key *key,
    if (!v->binning_pass) {
       v->const_state = rzalloc_size(v, sizeof(*v->const_state));
       v->const_state->push_consts_type = shader->options.push_consts_type;
+      v->const_state->consts_ubo.idx = -1;
+      v->const_state->driver_params_ubo.idx = -1;
+      v->const_state->primitive_map_ubo.idx = -1;
+      v->const_state->primitive_param_ubo.idx = -1;
    }

    return v;


@@ -154,6 +154,16 @@ enum ir3_push_consts_type {
    IR3_PUSH_CONSTS_SHARED_PREAMBLE,
 };

+/* This represents an internal UBO filled out by the driver. There are a few
+ * common UBOs that must be filled out identically by all drivers, for example
+ * for shader linkage, but drivers can also add their own that they manage
+ * themselves.
+ */
+struct ir3_driver_ubo {
+   int32_t idx;
+   uint32_t size;
+};
+
 /**
  * Describes the layout of shader consts in the const register file.
  *
@@ -186,8 +196,11 @@ struct ir3_const_state {
    unsigned num_ubos;
    unsigned num_driver_params; /* scalar */

-   /* UBO that should be mapped to the NIR shader's constant_data (or -1). */
-   int32_t constant_data_ubo;
+   struct ir3_driver_ubo consts_ubo;
+   struct ir3_driver_ubo driver_params_ubo;
+   struct ir3_driver_ubo primitive_map_ubo, primitive_param_ubo;
+   int32_t constant_data_dynamic_offsets;

    struct {
       /* user const start at zero */


@@ -93,6 +93,7 @@ libfreedreno_ir3_files = files(
   'ir3_nir.h',
   'ir3_nir_analyze_ubo_ranges.c',
   'ir3_nir_lower_64b.c',
+  'ir3_nir_lower_driver_params_to_ubo.c',
   'ir3_nir_lower_load_barycentric_at_sample.c',
   'ir3_nir_lower_load_barycentric_at_offset.c',
   'ir3_nir_lower_push_consts_to_preamble.c',


@@ -4326,6 +4326,8 @@ tu6_emit_inline_ubo(struct tu_cs *cs,
                     gl_shader_stage type,
                     struct tu_descriptor_state *descriptors)
 {
+   assert(const_state->num_inline_ubos == 0 || !cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble);
+
    /* Emit loads of inline uniforms. These load directly from the uniform's
     * storage space inside the descriptor set.
     */
@@ -4578,7 +4580,7 @@ fs_params_offset(struct tu_cmd_buffer *cmd)
       &cmd->state.program.link[MESA_SHADER_FRAGMENT];
    const struct ir3_const_state *const_state = &link->const_state;

-   if (const_state->num_driver_params <= IR3_DP_FS_DYNAMIC)
+   if (const_state->num_driver_params < IR3_DP_FS_DYNAMIC)
      return 0;

    if (const_state->offsets.driver_param + IR3_DP_FS_DYNAMIC / 4 >= link->constlen)
@@ -5412,102 +5414,193 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
    const struct tu_shader *shader = cmd->state.shaders[MESA_SHADER_COMPUTE];
    const struct ir3_shader_variant *variant = shader->variant;
    const struct ir3_const_state *const_state = variant->const_state;
    unsigned subgroup_size = variant->info.subgroup_size;
    unsigned subgroup_shift = util_logbase2(subgroup_size);

+   if (cmd->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
+      uint32_t num_consts = const_state->driver_params_ubo.size;
+      if (num_consts == 0)
+         return;
+
+      bool direct_indirect_load =
+         !(info->indirect_offset & 0xf) &&
+         !(info->indirect && num_consts > IR3_DP_BASE_GROUP_X);
+
+      uint64_t iova = 0;
+
+      if (!info->indirect) {
+         uint32_t driver_params[12] = {
+            [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
+            [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
+            [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
+            [IR3_DP_WORK_DIM] = 0,
+            [IR3_DP_BASE_GROUP_X] = info->offsets[0],
+            [IR3_DP_BASE_GROUP_Y] = info->offsets[1],
+            [IR3_DP_BASE_GROUP_Z] = info->offsets[2],
+            [IR3_DP_CS_SUBGROUP_SIZE] = subgroup_size,
+            [IR3_DP_LOCAL_GROUP_SIZE_X] = 0,
+            [IR3_DP_LOCAL_GROUP_SIZE_Y] = 0,
+            [IR3_DP_LOCAL_GROUP_SIZE_Z] = 0,
+            [IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
+         };
+
+         assert(num_consts <= ARRAY_SIZE(driver_params));
+
+         struct tu_cs_memory consts;
+         uint32_t consts_vec4 = DIV_ROUND_UP(num_consts, 4);
+         VkResult result = tu_cs_alloc(&cmd->sub_cs, consts_vec4, 4, &consts);
+         if (result != VK_SUCCESS) {
+            vk_command_buffer_set_error(&cmd->vk, result);
+            return;
+         }
+
+         memcpy(consts.map, driver_params, num_consts * sizeof(uint32_t));
+         iova = consts.iova;
+      } else if (direct_indirect_load) {
+         iova = info->indirect->iova + info->indirect_offset;
+      } else {
+         /* Vulkan guarantees only 4 byte alignment for indirect_offset.
+          * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.
+          */
+
+         uint64_t indirect_iova = info->indirect->iova + info->indirect_offset;
+
+         for (uint32_t i = 0; i < 3; i++) {
+            tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
+            tu_cs_emit(cs, 0);
+            tu_cs_emit_qw(cs, global_iova_arr(cmd, cs_indirect_xyz, i));
+            tu_cs_emit_qw(cs, indirect_iova + i * sizeof(uint32_t));
+         }
+
+         /* Fill out IR3_DP_CS_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for
+          * indirect dispatch.
+          */
+         if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
+            uint32_t indirect_driver_params[8] = {
+               0, 0, 0, subgroup_size,
+               0, 0, 0, subgroup_shift,
+            };
+            bool emit_local = num_consts > IR3_DP_LOCAL_GROUP_SIZE_X;
+            uint32_t emit_size = emit_local ? 8 : 4;
+
+            tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + emit_size);
+            tu_cs_emit_qw(cs, global_iova_arr(cmd, cs_indirect_xyz, 0) + 4 * sizeof(uint32_t));
+            for (uint32_t i = 0; i < emit_size; i++) {
+               tu_cs_emit(cs, indirect_driver_params[i]);
+            }
+         }
+
+         tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
+         tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);
+
+         iova = global_iova(cmd, cs_indirect_xyz[0]);
+      }
+
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 5);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(const_state->driver_params_ubo.idx) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      int size_vec4s = DIV_ROUND_UP(num_consts, 4);
+      tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
+   } else {
      uint32_t offset = const_state->offsets.driver_param;
      if (variant->constlen <= offset)
         return;

      uint32_t num_consts = MIN2(const_state->num_driver_params,
                                 (variant->constlen - offset) * 4);

      if (!info->indirect) {
         uint32_t driver_params[12] = {
            [IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
            [IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
            [IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
            [IR3_DP_WORK_DIM] = 0,
            [IR3_DP_BASE_GROUP_X] = info->offsets[0],
            [IR3_DP_BASE_GROUP_Y] = info->offsets[1],
            [IR3_DP_BASE_GROUP_Z] = info->offsets[2],
            [IR3_DP_CS_SUBGROUP_SIZE] = subgroup_size,
            [IR3_DP_LOCAL_GROUP_SIZE_X] = 0,
            [IR3_DP_LOCAL_GROUP_SIZE_Y] = 0,
            [IR3_DP_LOCAL_GROUP_SIZE_Z] = 0,
            [IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
         };

         assert(num_consts <= ARRAY_SIZE(driver_params));

         /* push constants */
         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                    CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                    CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
         tu_cs_emit(cs, 0);
         tu_cs_emit(cs, 0);
         uint32_t i;
         for (i = 0; i < num_consts; i++)
            tu_cs_emit(cs, driver_params[i]);
      } else if (!(info->indirect_offset & 0xf)) {
         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                    CP_LOAD_STATE6_0_NUM_UNIT(1));
         tu_cs_emit_qw(cs, info->indirect->iova + info->indirect_offset);
      } else {
         /* Vulkan guarantees only 4 byte alignment for indirect_offset.
          * However, CP_LOAD_STATE.EXT_SRC_ADDR needs 16 byte alignment.
          */

         uint64_t indirect_iova = info->indirect->iova + info->indirect_offset;

         for (uint32_t i = 0; i < 3; i++) {
            tu_cs_emit_pkt7(cs, CP_MEM_TO_MEM, 5);
            tu_cs_emit(cs, 0);
            tu_cs_emit_qw(cs, global_iova_arr(cmd, cs_indirect_xyz, i));
            tu_cs_emit_qw(cs, indirect_iova + i * 4);
         }

         tu_cs_emit_pkt7(cs, CP_WAIT_MEM_WRITES, 0);
         tu_emit_event_write<CHIP>(cmd, cs, FD_CACHE_INVALIDATE);

         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                    CP_LOAD_STATE6_0_NUM_UNIT(1));
         tu_cs_emit_qw(cs, global_iova(cmd, cs_indirect_xyz[0]));
      }

      /* Fill out IR3_DP_CS_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for
       * indirect dispatch.
       */
      if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
         bool emit_local = num_consts > IR3_DP_LOCAL_GROUP_SIZE_X;
         tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7 + (emit_local ? 4 : 0));
         tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) |
                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
                    CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
                    CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4));
         tu_cs_emit_qw(cs, 0);
         tu_cs_emit(cs, 0); /* BASE_GROUP_X */
         tu_cs_emit(cs, 0); /* BASE_GROUP_Y */
         tu_cs_emit(cs, 0); /* BASE_GROUP_Z */
         tu_cs_emit(cs, subgroup_size);
         if (emit_local) {
            assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4));
            tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */
            tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */
            tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */
            tu_cs_emit(cs, subgroup_shift);
         }
      }
+   }
 }


@@ -507,6 +507,26 @@ tu_cs_reset(struct tu_cs *cs)
    cs->entry_count = 0;
 }

+uint64_t
+tu_cs_emit_data_nop(struct tu_cs *cs,
+                    const uint32_t *data,
+                    uint32_t size,
+                    uint32_t align_dwords)
+{
+   uint32_t total_size = size + (align_dwords - 1);
+   tu_cs_emit_pkt7(cs, CP_NOP, total_size);
+   uint64_t iova = tu_cs_get_cur_iova(cs);
+   uint64_t iova_aligned = align64(iova, align_dwords * sizeof(uint32_t));
+
+   size_t offset = (iova_aligned - iova) / sizeof(uint32_t);
+   cs->cur += offset;
+
+   memcpy(cs->cur, data, size * sizeof(uint32_t));
+   cs->cur += total_size - offset;
+
+   return iova + offset * sizeof(uint32_t);
+}
+
 void
 tu_cs_emit_debug_string(struct tu_cs *cs, const char *string, int len)
 {


@@ -480,6 +480,12 @@ tu_cond_exec_end(struct tu_cs *cs)
    }
 }

+uint64_t
+tu_cs_emit_data_nop(struct tu_cs *cs,
+                    const uint32_t *data,
+                    uint32_t size,
+                    uint32_t align);
+
 /* Temporary struct for tracking a register state to be written, used by
  * a6xx-pack.h and tu_cs_emit_regs()
  */


@@ -231,7 +231,7 @@ get_device_extensions(const struct tu_physical_device *device,
       .EXT_extended_dynamic_state3 = true,
       .EXT_external_memory_dma_buf = true,
       .EXT_filter_cubic = device->info->a6xx.has_tex_filter_cubic,
-      .EXT_fragment_density_map = true,
+      .EXT_fragment_density_map = !device->info->a7xx.load_shader_consts_via_preamble,
       .EXT_global_priority = true,
       .EXT_global_priority_query = true,
       .EXT_graphics_pipeline_library = true,


@@ -196,7 +196,7 @@ struct tu6_global
       uint32_t pad[7];
    } flush_base[4];

-   alignas(16) uint32_t cs_indirect_xyz[3];
+   alignas(16) uint32_t cs_indirect_xyz[12];

    volatile uint32_t vtx_stats_query_not_running;


@@ -408,22 +408,53 @@ tu6_emit_dynamic_offset(struct tu_cs *cs,
                         const struct tu_program_state *program)
 {
    const struct tu_physical_device *phys_dev = cs->device->physical_device;
-   if (!xs || shader->const_state.dynamic_offset_loc == UINT32_MAX)
+
+   if (!xs)
       return;

-   tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets);
-   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) |
-              CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-              CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-              CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
-              CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4)));
-   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
-   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
-
-   for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
-      unsigned dynamic_offset_start =
-         program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
-      tu_cs_emit(cs, dynamic_offset_start);
+   if (cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
+      if (shader->const_state.dynamic_offsets_ubo.size == 0)
+         return;
+
+      uint32_t offsets[MAX_SETS];
+      for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
+         unsigned dynamic_offset_start =
+            program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
+         offsets[i] = dynamic_offset_start;
+      }
+
+      /* A7XX TODO: Emit data via sub_cs instead of NOP */
+      uint64_t iova = tu_cs_emit_data_nop(cs, offsets, phys_dev->usable_sets, 4);
+
+      uint32_t offset = shader->const_state.dynamic_offsets_ubo.idx;
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 5);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      int size_vec4s = DIV_ROUND_UP(phys_dev->usable_sets, 4);
+      tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
+   } else {
+      if (shader->const_state.dynamic_offset_loc == UINT32_MAX)
+         return;
+
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(xs->type), 3 + phys_dev->usable_sets);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(shader->const_state.dynamic_offset_loc / 4) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(xs->type)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(DIV_ROUND_UP(phys_dev->usable_sets, 4)));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+
+      for (unsigned i = 0; i < phys_dev->usable_sets; i++) {
+         unsigned dynamic_offset_start =
+            program->dynamic_descriptor_offsets[i] / (A6XX_TEX_CONST_DWORDS * 4);
+         tu_cs_emit(cs, dynamic_offset_start);
+      }
    }
 }
@@ -569,24 +600,77 @@ tu6_setup_streamout(struct tu_cs *cs,
    }
 }

+enum tu_geom_consts_type
+{
+   TU_CONSTS_PRIMITIVE_MAP,
+   TU_CONSTS_PRIMITIVE_PARAM,
+};
+
 static void
-tu6_emit_const(struct tu_cs *cs, uint32_t opcode, uint32_t base,
-               enum a6xx_state_block block, uint32_t offset,
-               uint32_t size, const uint32_t *dwords) {
+tu6_emit_const(struct tu_cs *cs, uint32_t opcode, enum tu_geom_consts_type type,
+               const struct ir3_const_state *const_state,
+               unsigned constlen, enum a6xx_state_block block,
+               uint32_t offset, uint32_t size, const uint32_t *dwords) {
    assert(size % 4 == 0);

-   tu_cs_emit_pkt7(cs, opcode, 3 + size);
-   tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
-              CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-              CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
-              CP_LOAD_STATE6_0_STATE_BLOCK(block) |
-              CP_LOAD_STATE6_0_NUM_UNIT(size / 4));
-   tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
-   tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
-
    dwords = (uint32_t *)&((uint8_t *)dwords)[offset];

-   tu_cs_emit_array(cs, dwords, size);
+   if (block == SB6_VS_SHADER || !cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
+      uint32_t base;
+      switch (type) {
+      case TU_CONSTS_PRIMITIVE_MAP:
+         base = const_state->offsets.primitive_map;
+         break;
+      case TU_CONSTS_PRIMITIVE_PARAM:
+         base = const_state->offsets.primitive_param;
+         break;
+      default:
+         unreachable("bad consts type");
+      }
+
+      int32_t adjusted_size = MIN2(base * 4 + size, constlen * 4) - base * 4;
+      if (adjusted_size <= 0)
+         return;
+
+      tu_cs_emit_pkt7(cs, opcode, 3 + adjusted_size);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(block) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(adjusted_size / 4));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+
+      tu_cs_emit_array(cs, dwords, adjusted_size);
+   } else {
+      uint32_t base;
+      switch (type) {
+      case TU_CONSTS_PRIMITIVE_MAP:
+         base = const_state->primitive_map_ubo.idx;
+         break;
+      case TU_CONSTS_PRIMITIVE_PARAM:
+         base = const_state->primitive_param_ubo.idx;
+         break;
+      default:
+         unreachable("bad consts type");
+      }
+
+      if (base == -1)
+         return;
+
+      /* A7XX TODO: Emit data via sub_cs instead of NOP */
+      uint64_t iova = tu_cs_emit_data_nop(cs, dwords, size, 4);
+
+      tu_cs_emit_pkt7(cs, opcode, 5);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(block) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      int size_vec4s = DIV_ROUND_UP(size, 4);
+      tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
+   }
 }

 static void
@@ -596,15 +680,13 @@ tu6_emit_link_map(struct tu_cs *cs,
                   enum a6xx_state_block sb)
 {
    const struct ir3_const_state *const_state = ir3_const_state(consumer);
-   uint32_t base = const_state->offsets.primitive_map;
-   int size = DIV_ROUND_UP(consumer->input_size, 4);
+   uint32_t size = ALIGN(consumer->input_size, 4);

-   size = (MIN2(size + base, consumer->constlen) - base) * 4;
-   if (size <= 0)
+   if (size == 0)
       return;

-   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, base, sb, 0, size,
-                  producer->output_loc);
+   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_MAP,
+                  const_state, consumer->constlen, sb, 0, size, producer->output_loc);
 }

 static int
@@ -992,8 +1074,8 @@ tu6_emit_vs_params(struct tu_cs *cs,
       0,
       0,
    };
-   uint32_t vs_base = const_state->offsets.primitive_param;
-   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, vs_base, SB6_VS_SHADER, 0,
+   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
+                  const_state, constlen, SB6_VS_SHADER, 0,
                   ARRAY_SIZE(vs_params), vs_params);
 }
@@ -1018,6 +1100,8 @@ static const enum mesa_vk_dynamic_graphics_state tu_patch_control_points_state[
    MESA_VK_DYNAMIC_TS_PATCH_CONTROL_POINTS,
 };

+#define HS_PARAMS_SIZE 8
+
 template <chip CHIP>
 static unsigned
 tu6_patch_control_points_size(struct tu_device *dev,
@@ -1027,10 +1111,17 @@ tu6_patch_control_points_size(struct tu_device *dev,
                               const struct tu_program_state *program,
                               uint32_t patch_control_points)
 {
-#define EMIT_CONST_DWORDS(const_dwords) (4 + const_dwords)
-   return EMIT_CONST_DWORDS(4) +
-      EMIT_CONST_DWORDS(program->hs_param_dwords) + 2 + 2 + 2;
+   if (dev->physical_device->info->a7xx.load_shader_consts_via_preamble) {
+#define EMIT_CONST_DWORDS(const_dwords) (5 + const_dwords + 4)
+      return EMIT_CONST_DWORDS(4) +
+         EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
 #undef EMIT_CONST_DWORDS
+   } else {
+#define EMIT_CONST_DWORDS(const_dwords) (4 + const_dwords)
+      return EMIT_CONST_DWORDS(4) +
+         EMIT_CONST_DWORDS(HS_PARAMS_SIZE) + 2 + 2 + 2;
+#undef EMIT_CONST_DWORDS
+   }
 }

 template <chip CHIP>
@@ -1056,7 +1147,7 @@ tu6_emit_patch_control_points(struct tu_cs *cs,
    uint64_t tess_factor_iova, tess_param_iova;
    tu_get_tess_iova(dev, &tess_factor_iova, &tess_param_iova);

-   uint32_t hs_params[8] = {
+   uint32_t hs_params[HS_PARAMS_SIZE] = {
      vs->variant->output_size * patch_control_points * 4, /* hs primitive stride */
      vs->variant->output_size * 4, /* hs vertex stride */
      tcs->variant->output_size,
@@ -1069,9 +1160,10 @@ tu6_emit_patch_control_points(struct tu_cs *cs,
    const struct ir3_const_state *hs_const =
       &program->link[MESA_SHADER_TESS_CTRL].const_state;
-   uint32_t hs_base = hs_const->offsets.primitive_param;
-   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, hs_base, SB6_HS_SHADER, 0,
-                  program->hs_param_dwords, hs_params);
+   unsigned hs_constlen = program->link[MESA_SHADER_TESS_CTRL].constlen;
+   tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
+                  hs_const, hs_constlen, SB6_HS_SHADER, 0,
+                  ARRAY_SIZE(hs_params), hs_params);

    uint32_t patch_local_mem_size_16b =
       patch_control_points * vs->variant->output_size / 4;
@@ -1146,10 +1238,9 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs,
         tess_factor_iova >> 32,
      };

-      uint32_t ds_base = ds->const_state->offsets.primitive_param;
-      uint32_t ds_param_dwords = MIN2((ds->constlen - ds_base) * 4, ARRAY_SIZE(ds_params));
-      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, ds_base, SB6_DS_SHADER, 0,
-                     ds_param_dwords, ds_params);
+      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
+                     ds->const_state, ds->constlen, SB6_DS_SHADER, 0,
+                     ARRAY_SIZE(ds_params), ds_params);
    }

    if (gs) {
@@ -1160,8 +1251,8 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs,
         0,
         0,
      };
-      uint32_t gs_base = gs->const_state->offsets.primitive_param;
-      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, gs_base, SB6_GS_SHADER, 0,
+      tu6_emit_const(cs, CP_LOAD_STATE6_GEOM, TU_CONSTS_PRIMITIVE_PARAM,
+                     gs->const_state, gs->constlen, SB6_GS_SHADER, 0,
                      ARRAY_SIZE(gs_params), gs_params);
    }
 }
@@ -2145,15 +2236,6 @@ tu_emit_program_state(struct tu_cs *sub_cs,
    tu6_emit_vpc<CHIP>(&prog_cs, vs, hs, ds, gs, fs);
    prog->vpc_state = tu_cs_end_draw_state(sub_cs, &prog_cs);

-   if (hs) {
-      const struct ir3_const_state *hs_const =
-         &prog->link[MESA_SHADER_TESS_CTRL].const_state;
-      unsigned hs_constlen =
-         prog->link[MESA_SHADER_TESS_CTRL].constlen;
-      uint32_t hs_base = hs_const->offsets.primitive_param;
-      prog->hs_param_dwords = MIN2((hs_constlen - hs_base) * 4, 8);
-   }
-
    const struct ir3_shader_variant *last_shader;
    if (gs)
       last_shader = gs;


@@ -93,8 +93,6 @@ struct tu_program_state
    struct tu_draw_state vpc_state;
    struct tu_draw_state fs_state;

-   uint32_t hs_param_dwords;
-
    struct tu_push_constant_range shared_consts;

    struct tu_program_descriptor_linkage link[MESA_SHADER_STAGES];


@@ -97,6 +97,13 @@ tu_spirv_to_nir(struct tu_device *dev,
    if (result != VK_SUCCESS)
       return NULL;

+   /* ir3 uses num_ubos and num_ssbos to track the number of *bindful*
+    * UBOs/SSBOs, but spirv_to_nir sets them to the total number of objects
+    * which is useless for us, so reset them here.
+    */
+   nir->info.num_ubos = 0;
+   nir->info.num_ssbos = 0;
+
    if (TU_DEBUG(NIR)) {
       fprintf(stderr, "translated nir:\n");
       nir_print_shader(nir, stderr);
@@ -175,6 +182,7 @@ lower_vulkan_resource_index(struct tu_device *dev, nir_builder *b,
                             struct tu_shader *shader,
                             const struct tu_pipeline_layout *layout)
 {
+   struct ir3_compiler *compiler = dev->compiler;
    nir_def *vulkan_idx = instr->src[0].ssa;

    unsigned set = nir_intrinsic_desc_set(instr);
@@ -209,9 +217,15 @@ lower_vulkan_resource_index(struct tu_device *dev, nir_builder *b,
        * get it from the const file instead.
        */
       base = nir_imm_int(b, binding_layout->dynamic_offset_offset / (4 * A6XX_TEX_CONST_DWORDS));
-      nir_def *dynamic_offset_start =
-         nir_load_uniform(b, 1, 32, nir_imm_int(b, 0),
-                          .base = shader->const_state.dynamic_offset_loc + set);
+      nir_def *dynamic_offset_start;
+      if (compiler->load_shader_consts_via_preamble) {
+         dynamic_offset_start =
+            ir3_load_driver_ubo(b, 1, &shader->const_state.dynamic_offsets_ubo, set);
+      } else {
+         dynamic_offset_start =
+            nir_load_uniform(b, 1, 32, nir_imm_int(b, 0),
+                             .base = shader->const_state.dynamic_offset_loc + set);
+      }
       base = nir_iadd(b, base, dynamic_offset_start);
    } else {
       base = nir_imm_int(b, (offset +
@@ -271,7 +285,7 @@ lower_load_vulkan_descriptor(nir_builder *b, nir_intrinsic_instr *intrin)
    nir_instr_remove(&intrin->instr);
 }

-static void
+static bool
 lower_ssbo_ubo_intrinsic(struct tu_device *dev,
                          nir_builder *b, nir_intrinsic_instr *intrin)
 {
@@ -291,6 +305,10 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
       buffer_src = 0;
    }

+   /* Don't lower non-bindless UBO loads of driver params */
+   if (intrin->src[buffer_src].ssa->num_components == 1)
+      return false;
+
    nir_scalar scalar_idx = nir_scalar_resolved(intrin->src[buffer_src].ssa, 0);
    nir_def *descriptor_idx = nir_channel(b, intrin->src[buffer_src].ssa, 1);
@@ -310,7 +328,7 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
       nir_def *bindless =
          nir_bindless_resource_ir3(b, 32, descriptor_idx, .desc_set = nir_scalar_as_uint(scalar_idx));
       nir_src_rewrite(&intrin->src[buffer_src], bindless);
-      return;
+      return true;
    }

    nir_def *base_idx = nir_channel(b, scalar_idx.def, scalar_idx.comp);
@@ -361,6 +379,7 @@ lower_ssbo_ubo_intrinsic(struct tu_device *dev,
    if (info->has_dest)
       nir_def_rewrite_uses(&intrin->def, result);
    nir_instr_remove(&intrin->instr);
+   return true;
 }

 static nir_def *
@@ -461,8 +480,7 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
    case nir_intrinsic_ssbo_atomic:
    case nir_intrinsic_ssbo_atomic_swap:
    case nir_intrinsic_get_ssbo_size:
-      lower_ssbo_ubo_intrinsic(dev, b, instr);
-      return true;
+      return lower_ssbo_ubo_intrinsic(dev, b, instr);

    case nir_intrinsic_image_deref_load:
    case nir_intrinsic_image_deref_store:
@@ -473,6 +491,37 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
       lower_image_deref(dev, b, instr, shader, layout);
       return true;

+   case nir_intrinsic_load_frag_size_ir3:
+   case nir_intrinsic_load_frag_offset_ir3: {
+      if (!dev->compiler->load_shader_consts_via_preamble)
+         return false;
+
+      enum ir3_driver_param param =
+         instr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
+         IR3_DP_FS_FRAG_SIZE : IR3_DP_FS_FRAG_OFFSET;
+
+      nir_def *view = instr->src[0].ssa;
+      nir_def *result =
+         ir3_load_driver_ubo_indirect(b, 2, &shader->const_state.fdm_ubo,
+                                      param, view, nir_intrinsic_range(instr));
+
+      nir_def_rewrite_uses(&instr->def, result);
+      nir_instr_remove(&instr->instr);
+      return true;
+   }
+   case nir_intrinsic_load_frag_invocation_count: {
+      if (!dev->compiler->load_shader_consts_via_preamble)
+         return false;
+
+      nir_def *result =
+         ir3_load_driver_ubo(b, 1, &shader->const_state.fdm_ubo,
+                             IR3_DP_FS_FRAG_INVOCATION_COUNT);
+
+      nir_def_rewrite_uses(&instr->def, result);
+      nir_instr_remove(&instr->instr);
+      return true;
+   }
+
    default:
       return false;
    }
@@ -1219,6 +1268,7 @@ tu6_emit_xs(struct tu_cs *cs,
    unsigned immediate_size = tu_xs_get_immediates_packet_size_dwords(xs);
    if (immediate_size > 0) {
+      assert(!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble);
       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3 + immediate_size);
       tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
@@ -1231,13 +1281,14 @@ tu6_emit_xs(struct tu_cs *cs,
       tu_cs_emit_array(cs, const_state->immediates, immediate_size);
    }

-   if (const_state->constant_data_ubo != -1) {
+   if (const_state->consts_ubo.idx != -1) {
       uint64_t iova = binary_iova + xs->info.constant_data_offset;
+      uint32_t offset = const_state->consts_ubo.idx;

       /* Upload UBO state for the constant data. */
       tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
       tu_cs_emit(cs,
-                 CP_LOAD_STATE6_0_DST_OFF(const_state->constant_data_ubo) |
+                 CP_LOAD_STATE6_0_DST_OFF(offset) |
                  CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO)|
                  CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
                  CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
@@ -1252,30 +1303,50 @@ tu6_emit_xs(struct tu_cs *cs,
       /* Upload the constant data to the const file if needed. */
       const struct ir3_ubo_analysis_state *ubo_state = &const_state->ubo_state;

-      for (int i = 0; i < ubo_state->num_enabled; i++) {
-         if (ubo_state->range[i].ubo.block != const_state->constant_data_ubo ||
-             ubo_state->range[i].ubo.bindless) {
-            continue;
-         }
+      if (!cs->device->physical_device->info->a7xx.load_shader_consts_via_preamble) {
+         for (int i = 0; i < ubo_state->num_enabled; i++) {
+            if (ubo_state->range[i].ubo.block != offset ||
+                ubo_state->range[i].ubo.bindless) {
+               continue;
+            }

-         uint32_t start = ubo_state->range[i].start;
-         uint32_t end = ubo_state->range[i].end;
-         uint32_t size = MIN2(end - start,
-                              (16 * xs->constlen) - ubo_state->range[i].offset);
+            uint32_t start = ubo_state->range[i].start;
+            uint32_t end = ubo_state->range[i].end;
+            uint32_t size = MIN2(end - start,
+                                 (16 * xs->constlen) - ubo_state->range[i].offset);

-         tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
-         tu_cs_emit(cs,
-                    CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
-                    CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
-                    CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
-                    CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
-                    CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
-         tu_cs_emit_qw(cs, iova + start);
+            tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 3);
+            tu_cs_emit(cs,
+                       CP_LOAD_STATE6_0_DST_OFF(ubo_state->range[i].offset / 16) |
+                       CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
+                       CP_LOAD_STATE6_0_STATE_SRC(SS6_INDIRECT) |
+                       CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                       CP_LOAD_STATE6_0_NUM_UNIT(size / 16));
+            tu_cs_emit_qw(cs, iova + start);
+         }
       }
    }

    /* emit statically-known FS driver param */
-   if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) {
+   if (stage == MESA_SHADER_FRAGMENT && const_state->driver_params_ubo.size > 0) {
+      uint32_t data[4] = {xs->info.double_threadsize ? 128 : 64, 0, 0, 0};
+      uint32_t size = ARRAY_SIZE(data);
+
+      /* A7XX TODO: Emit data via sub_cs instead of NOP */
+      uint64_t iova = tu_cs_emit_data_nop(cs, data, size, 4);
+      uint32_t base = const_state->driver_params_ubo.idx;
+
+      tu_cs_emit_pkt7(cs, tu6_stage2opcode(stage), 5);
+      tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(base) |
+                 CP_LOAD_STATE6_0_STATE_TYPE(ST6_UBO) |
+                 CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
+                 CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(stage)) |
+                 CP_LOAD_STATE6_0_NUM_UNIT(1));
+      tu_cs_emit(cs, CP_LOAD_STATE6_1_EXT_SRC_ADDR(0));
+      tu_cs_emit(cs, CP_LOAD_STATE6_2_EXT_SRC_ADDR_HI(0));
+      int size_vec4s = DIV_ROUND_UP(size, 4);
+      tu_cs_emit_qw(cs, iova | ((uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32));
+   } else if (stage == MESA_SHADER_FRAGMENT && const_state->num_driver_params > 0) {
       uint32_t base = const_state->offsets.driver_param;
       int32_t size = DIV_ROUND_UP(MAX2(const_state->num_driver_params, 4), 4);
       size = MAX2(MIN2(size + base, xs->constlen) - base, 0);
@@ -2176,6 +2247,9 @@ tu_shader_init(struct tu_device *dev, const void *key_data, size_t key_size)
    vk_pipeline_cache_object_init(&dev->vk, &shader->base,
                                  &tu_shader_ops, obj_key_data, key_size);

+   shader->const_state.fdm_ubo.idx = -1;
+   shader->const_state.dynamic_offsets_ubo.idx = -1;
+
    return shader;
 }


@@ -49,6 +49,9 @@ struct tu_const_state
    uint32_t dynamic_offset_loc;
    unsigned num_inline_ubos;
    struct tu_inline_ubo ubos[MAX_INLINE_UBOS];
+
+   struct ir3_driver_ubo fdm_ubo;
+   struct ir3_driver_ubo dynamic_offsets_ubo;
 };

 struct tu_shader


@@ -208,7 +208,7 @@ fd6_emit_ubos(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring,
    for (int i = 0; i < num_ubos; i++) {
      /* NIR constant data is packed into the end of the shader. */
-     if (i == const_state->constant_data_ubo) {
+     if (i == const_state->consts_ubo.idx) {
        int size_vec4s = DIV_ROUND_UP(v->constant_data_size, 16);
        OUT_RELOC(ring, v->bo, v->info.constant_data_offset,
                  (uint64_t)A6XX_UBO_1_SIZE(size_vec4s) << 32, 0);


@@ -119,7 +119,7 @@ ir3_emit_constant_data(const struct ir3_shader_variant *v,
    for (unsigned i = 0; i < state->num_enabled; i++) {
      unsigned ubo = state->range[i].ubo.block;
-     if (ubo != const_state->constant_data_ubo)
+     if (ubo != const_state->consts_ubo.idx)
        continue;

      uint32_t size = state->range[i].end - state->range[i].start;
@@ -161,7 +161,7 @@ ir3_emit_user_consts(const struct ir3_shader_variant *v,
      assert(!state->range[i].ubo.bindless);
      unsigned ubo = state->range[i].ubo.block;
      if (!(constbuf->enabled_mask & (1 << ubo)) ||
-         ubo == const_state->constant_data_ubo) {
+         ubo == const_state->consts_ubo.idx) {
        continue;
      }

      struct pipe_constant_buffer *cb = &constbuf->cb[ubo];
@@ -218,7 +218,7 @@ ir3_emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v,
    struct fd_bo *bos[params];

    for (uint32_t i = 0; i < params; i++) {
-     if (i == const_state->constant_data_ubo) {
+     if (i == const_state->consts_ubo.idx) {
        bos[i] = v->bo;
        offsets[i] = v->info.constant_data_offset;
        continue;