freedreno/ir3+tu: Convert driver-params to structs

This at least lets us de-dup the dp setup between the push-const path
and preamble-loads-from-ubo path.

Signed-off-by: Rob Clark <robdclark@chromium.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/31534>
This commit is contained in:
Rob Clark 2024-10-01 11:10:26 -07:00 committed by Marge Bot
parent 81d8387dbc
commit 7e9b948430
8 changed files with 271 additions and 203 deletions

View file

@ -171,7 +171,7 @@ create_frag_input(struct ir3_context *ctx, struct ir3_instruction *coord,
}
static struct ir3_instruction *
create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
create_driver_param(struct ir3_context *ctx, uint32_t dp)
{
/* first four vec4 sysval's reserved for UBOs: */
/* NOTE: dp is in scalar, but there can be >4 dp components: */
@ -182,7 +182,7 @@ create_driver_param(struct ir3_context *ctx, enum ir3_driver_param dp)
}
static struct ir3_instruction *
create_driver_param_indirect(struct ir3_context *ctx, enum ir3_driver_param dp,
create_driver_param_indirect(struct ir3_context *ctx, uint32_t dp,
struct ir3_instruction *address)
{
/* first four vec4 sysval's reserved for UBOs: */
@ -2798,25 +2798,25 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
case nir_intrinsic_load_base_vertex:
case nir_intrinsic_load_first_vertex:
if (!ctx->basevertex) {
ctx->basevertex = create_driver_param(ctx, IR3_DP_VTXID_BASE);
ctx->basevertex = create_driver_param(ctx, IR3_DP_VS(vtxid_base));
}
dst[0] = ctx->basevertex;
break;
case nir_intrinsic_load_is_indexed_draw:
if (!ctx->is_indexed_draw) {
ctx->is_indexed_draw = create_driver_param(ctx, IR3_DP_IS_INDEXED_DRAW);
ctx->is_indexed_draw = create_driver_param(ctx, IR3_DP_VS(is_indexed_draw));
}
dst[0] = ctx->is_indexed_draw;
break;
case nir_intrinsic_load_draw_id:
if (!ctx->draw_id) {
ctx->draw_id = create_driver_param(ctx, IR3_DP_DRAWID);
ctx->draw_id = create_driver_param(ctx, IR3_DP_VS(draw_id));
}
dst[0] = ctx->draw_id;
break;
case nir_intrinsic_load_base_instance:
if (!ctx->base_instance) {
ctx->base_instance = create_driver_param(ctx, IR3_DP_INSTID_BASE);
ctx->base_instance = create_driver_param(ctx, IR3_DP_VS(instid_base));
}
dst[0] = ctx->base_instance;
break;
@ -2863,7 +2863,7 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
idx = nir_intrinsic_ucp_id(intr);
for (int i = 0; i < dest_components; i++) {
unsigned n = idx * 4 + i;
dst[i] = create_driver_param(ctx, IR3_DP_UCP0_X + n);
dst[i] = create_driver_param(ctx, IR3_DP_VS(ucp[0].x) + n);
}
create_rpt = true;
break;
@ -2899,41 +2899,41 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
} else {
/* For a3xx/a4xx, this comes in via const injection by the hw */
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_WORKGROUP_ID_X + i);
dst[i] = create_driver_param(ctx, IR3_DP_CS(workgroup_id_x) + i);
}
}
break;
case nir_intrinsic_load_base_workgroup_id:
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_BASE_GROUP_X + i);
dst[i] = create_driver_param(ctx, IR3_DP_CS(base_group_x) + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_num_workgroups:
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_NUM_WORK_GROUPS_X + i);
dst[i] = create_driver_param(ctx, IR3_DP_CS(num_work_groups_x) + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_workgroup_size:
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_LOCAL_GROUP_SIZE_X + i);
dst[i] = create_driver_param(ctx, IR3_DP_CS(local_group_size_x) + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_subgroup_size: {
assert(ctx->so->type == MESA_SHADER_COMPUTE ||
ctx->so->type == MESA_SHADER_FRAGMENT);
enum ir3_driver_param size = ctx->so->type == MESA_SHADER_COMPUTE ?
IR3_DP_CS_SUBGROUP_SIZE : IR3_DP_FS_SUBGROUP_SIZE;
unsigned size = ctx->so->type == MESA_SHADER_COMPUTE ?
IR3_DP_CS(subgroup_size) : IR3_DP_FS(subgroup_size);
dst[0] = create_driver_param(ctx, size);
break;
}
case nir_intrinsic_load_subgroup_id_shift_ir3:
dst[0] = create_driver_param(ctx, IR3_DP_SUBGROUP_ID_SHIFT);
dst[0] = create_driver_param(ctx, IR3_DP_CS(subgroup_id_shift));
break;
case nir_intrinsic_load_work_dim:
dst[0] = create_driver_param(ctx, IR3_DP_WORK_DIM);
dst[0] = create_driver_param(ctx, IR3_DP_CS(work_dim));
break;
case nir_intrinsic_load_subgroup_invocation:
assert(ctx->compiler->has_getfiberid);
@ -2943,24 +2943,24 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
break;
case nir_intrinsic_load_tess_level_outer_default:
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_HS_DEFAULT_OUTER_LEVEL_X + i);
dst[i] = create_driver_param(ctx, IR3_DP_TCS(default_outer_level_x) + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_tess_level_inner_default:
for (int i = 0; i < dest_components; i++) {
dst[i] = create_driver_param(ctx, IR3_DP_HS_DEFAULT_INNER_LEVEL_X + i);
dst[i] = create_driver_param(ctx, IR3_DP_TCS(default_inner_level_x) + i);
}
create_rpt = true;
break;
case nir_intrinsic_load_frag_invocation_count:
dst[0] = create_driver_param(ctx, IR3_DP_FS_FRAG_INVOCATION_COUNT);
dst[0] = create_driver_param(ctx, IR3_DP_FS(frag_invocation_count));
break;
case nir_intrinsic_load_frag_size_ir3:
case nir_intrinsic_load_frag_offset_ir3: {
enum ir3_driver_param param =
unsigned param =
intr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
IR3_DP_FS_FRAG_SIZE : IR3_DP_FS_FRAG_OFFSET;
IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset);
if (nir_src_is_const(intr->src[0])) {
uint32_t view = nir_src_as_uint(intr->src[0]);
for (int i = 0; i < dest_components; i++) {
@ -4559,7 +4559,7 @@ emit_stream_out(struct ir3_context *ctx)
* of the shader:
*/
vtxcnt = create_sysval_input(ctx, SYSTEM_VALUE_VERTEX_CNT, 0x1);
maxvtxcnt = create_driver_param(ctx, IR3_DP_VTXCNT_MAX);
maxvtxcnt = create_driver_param(ctx, IR3_DP_VS(vtxcnt_max));
/* at this point, we are at the original 'end' block,
* re-purpose this block to stream-out condition, then

View file

@ -1188,61 +1188,61 @@ ir3_get_driver_param_info(const nir_shader *shader, nir_intrinsic_instr *intr,
{
switch (intr->intrinsic) {
case nir_intrinsic_load_base_workgroup_id:
param_info->offset = IR3_DP_BASE_GROUP_X;
param_info->offset = IR3_DP_CS(base_group_x);
break;
case nir_intrinsic_load_num_workgroups:
param_info->offset = IR3_DP_NUM_WORK_GROUPS_X;
param_info->offset = IR3_DP_CS(num_work_groups_x);
break;
case nir_intrinsic_load_workgroup_size:
param_info->offset = IR3_DP_LOCAL_GROUP_SIZE_X;
param_info->offset = IR3_DP_CS(local_group_size_x);
break;
case nir_intrinsic_load_subgroup_size:
assert(shader->info.stage == MESA_SHADER_COMPUTE ||
shader->info.stage == MESA_SHADER_FRAGMENT);
if (shader->info.stage == MESA_SHADER_COMPUTE) {
param_info->offset = IR3_DP_CS_SUBGROUP_SIZE;
param_info->offset = IR3_DP_CS(subgroup_size);
} else {
param_info->offset = IR3_DP_FS_SUBGROUP_SIZE;
param_info->offset = IR3_DP_FS(subgroup_size);
}
break;
case nir_intrinsic_load_subgroup_id_shift_ir3:
param_info->offset = IR3_DP_SUBGROUP_ID_SHIFT;
param_info->offset = IR3_DP_CS(subgroup_id_shift);
break;
case nir_intrinsic_load_work_dim:
param_info->offset = IR3_DP_WORK_DIM;
param_info->offset = IR3_DP_CS(work_dim);
break;
case nir_intrinsic_load_base_vertex:
case nir_intrinsic_load_first_vertex:
param_info->offset = IR3_DP_VTXID_BASE;
param_info->offset = IR3_DP_VS(vtxid_base);
break;
case nir_intrinsic_load_is_indexed_draw:
param_info->offset = IR3_DP_IS_INDEXED_DRAW;
param_info->offset = IR3_DP_VS(is_indexed_draw);
break;
case nir_intrinsic_load_draw_id:
param_info->offset = IR3_DP_DRAWID;
param_info->offset = IR3_DP_VS(draw_id);
break;
case nir_intrinsic_load_base_instance:
param_info->offset = IR3_DP_INSTID_BASE;
param_info->offset = IR3_DP_VS(instid_base);
break;
case nir_intrinsic_load_user_clip_plane: {
uint32_t idx = nir_intrinsic_ucp_id(intr);
param_info->offset = IR3_DP_UCP0_X + 4 * idx;
param_info->offset = IR3_DP_VS(ucp[0].x) + 4 * idx;
break;
}
case nir_intrinsic_load_tess_level_outer_default:
param_info->offset = IR3_DP_HS_DEFAULT_OUTER_LEVEL_X;
param_info->offset = IR3_DP_TCS(default_outer_level_x);
break;
case nir_intrinsic_load_tess_level_inner_default:
param_info->offset = IR3_DP_HS_DEFAULT_INNER_LEVEL_X;
param_info->offset = IR3_DP_TCS(default_inner_level_x);
break;
case nir_intrinsic_load_frag_size_ir3:
param_info->offset = IR3_DP_FS_FRAG_SIZE;
param_info->offset = IR3_DP_FS(frag_size);
break;
case nir_intrinsic_load_frag_offset_ir3:
param_info->offset = IR3_DP_FS_FRAG_OFFSET;
param_info->offset = IR3_DP_FS(frag_offset);
break;
case nir_intrinsic_load_frag_invocation_count:
param_info->offset = IR3_DP_FS_FRAG_INVOCATION_COUNT;
param_info->offset = IR3_DP_FS(frag_invocation_count);
break;
default:
return false;
@ -1306,7 +1306,7 @@ ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader, st
if (!compiler->has_shared_regfile &&
shader->info.stage == MESA_SHADER_COMPUTE) {
layout->num_driver_params =
MAX2(layout->num_driver_params, IR3_DP_WORKGROUP_ID_Z + 1);
MAX2(layout->num_driver_params, IR3_DP_CS(workgroup_id_z) + 1);
}
}
@ -1338,7 +1338,7 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
if ((compiler->gen < 5) && (v->stream_output.num_outputs > 0)) {
const_state->num_driver_params =
MAX2(const_state->num_driver_params, IR3_DP_VTXCNT_MAX + 1);
MAX2(const_state->num_driver_params, IR3_DP_VS(vtxcnt_max) + 1);
}
const_state->num_ubos = nir->info.num_ubos;
@ -1375,7 +1375,7 @@ ir3_setup_const_state(nir_shader *nir, struct ir3_shader_variant *v,
const_state->num_driver_params = align(const_state->num_driver_params, 4);
unsigned upload_unit = 1;
if (v->type == MESA_SHADER_COMPUTE ||
(const_state->num_driver_params >= IR3_DP_VTXID_BASE)) {
(const_state->num_driver_params >= IR3_DP_VS(vtxid_base))) {
upload_unit = compiler->const_upload_unit;
}

View file

@ -21,60 +21,95 @@
BEGINC;
/* driver param indices: */
enum ir3_driver_param {
/* compute shader driver params: */
IR3_DP_NUM_WORK_GROUPS_X = 0,
IR3_DP_NUM_WORK_GROUPS_Y = 1,
IR3_DP_NUM_WORK_GROUPS_Z = 2,
IR3_DP_WORK_DIM = 3,
IR3_DP_BASE_GROUP_X = 4,
IR3_DP_BASE_GROUP_Y = 5,
IR3_DP_BASE_GROUP_Z = 6,
IR3_DP_CS_SUBGROUP_SIZE = 7,
IR3_DP_LOCAL_GROUP_SIZE_X = 8,
IR3_DP_LOCAL_GROUP_SIZE_Y = 9,
IR3_DP_LOCAL_GROUP_SIZE_Z = 10,
IR3_DP_SUBGROUP_ID_SHIFT = 11,
IR3_DP_WORKGROUP_ID_X = 12,
IR3_DP_WORKGROUP_ID_Y = 13,
IR3_DP_WORKGROUP_ID_Z = 14,
#define dword_offsetof(type, name) DIV_ROUND_UP(offsetof(type, name), 4)
#define dword_sizeof(type) DIV_ROUND_UP(sizeof(type), 4)
/**
* Driver params for compute shaders.
*
* Note, driver param structs should be size aligned to vec4
*/
struct ir3_driver_params_cs {
/* NOTE: gl_NumWorkGroups should be vec4 aligned because
* glDispatchComputeIndirect() needs to load these from
* the info->indirect buffer. Keep that in mind when/if
* adding any additional CS driver params.
*/
IR3_DP_CS_COUNT = 16, /* must be aligned to vec4 */
/* vertex shader driver params: */
IR3_DP_DRAWID = 0,
IR3_DP_VTXID_BASE = 1,
IR3_DP_INSTID_BASE = 2,
IR3_DP_VTXCNT_MAX = 3,
IR3_DP_IS_INDEXED_DRAW = 4, /* Note: boolean, ie. 0 or ~0 */
/* user-clip-plane components, up to 8x vec4's: */
IR3_DP_UCP0_X = 5,
/* .... */
IR3_DP_UCP7_W = 36,
IR3_DP_VS_COUNT = 40, /* must be aligned to vec4 */
/* TCS driver params: */
IR3_DP_HS_DEFAULT_OUTER_LEVEL_X = 0,
IR3_DP_HS_DEFAULT_OUTER_LEVEL_Y = 1,
IR3_DP_HS_DEFAULT_OUTER_LEVEL_Z = 2,
IR3_DP_HS_DEFAULT_OUTER_LEVEL_W = 3,
IR3_DP_HS_DEFAULT_INNER_LEVEL_X = 4,
IR3_DP_HS_DEFAULT_INNER_LEVEL_Y = 5,
IR3_DP_HS_COUNT = 8, /* must be aligned to vec4 */
/* fragment shader driver params: */
IR3_DP_FS_SUBGROUP_SIZE = 0,
/* Dynamic params (that aren't known when compiling the shader) */
IR3_DP_FS_DYNAMIC = 4,
IR3_DP_FS_FRAG_INVOCATION_COUNT = IR3_DP_FS_DYNAMIC,
IR3_DP_FS_FRAG_SIZE = IR3_DP_FS_DYNAMIC + 4,
IR3_DP_FS_FRAG_OFFSET = IR3_DP_FS_DYNAMIC + 6,
uint32_t num_work_groups_x;
uint32_t num_work_groups_y;
uint32_t num_work_groups_z;
uint32_t work_dim;
uint32_t base_group_x;
uint32_t base_group_y;
uint32_t base_group_z;
uint32_t subgroup_size;
uint32_t local_group_size_x;
uint32_t local_group_size_y;
uint32_t local_group_size_z;
uint32_t subgroup_id_shift;
uint32_t workgroup_id_x;
uint32_t workgroup_id_y;
uint32_t workgroup_id_z;
uint32_t __pad;
};
#define IR3_DP_CS(name) dword_offsetof(struct ir3_driver_params_cs, name)
/**
* Driver params for vertex shaders.
*
* Note, driver param structs should be size aligned to vec4
*/
struct ir3_driver_params_vs {
uint32_t draw_id;
uint32_t vtxid_base;
uint32_t instid_base;
uint32_t vtxcnt_max;
uint32_t is_indexed_draw; /* Note: boolean, ie. 0 or ~0 */
/* user-clip-plane components, up to 8x vec4's: */
struct {
uint32_t x;
uint32_t y;
uint32_t z;
uint32_t w;
} ucp[8];
uint32_t __pad_37_39[3];
};
#define IR3_DP_VS(name) dword_offsetof(struct ir3_driver_params_vs, name)
/**
* Driver params for TCS shaders.
*
* Note, driver param structs should be size aligned to vec4
*/
struct ir3_driver_params_tcs {
uint32_t default_outer_level_x;
uint32_t default_outer_level_y;
uint32_t default_outer_level_z;
uint32_t default_outer_level_w;
uint32_t default_inner_level_x;
uint32_t default_inner_level_y;
uint32_t __pad_06_07[2];
};
#define IR3_DP_TCS(name) dword_offsetof(struct ir3_driver_params_tcs, name)
/**
* Driver params for fragment shaders.
*
* Note, driver param structs should be size aligned to vec4
*/
struct ir3_driver_params_fs {
uint32_t subgroup_size;
uint32_t __pad_01_03[3];
/* Dynamic params (that aren't known when compiling the shader) */
#define IR3_DP_FS_DYNAMIC dword_offsetof(struct ir3_driver_params_fs, frag_invocation_count)
uint32_t frag_invocation_count;
uint32_t __pad_05_07[3];
uint32_t frag_size;
uint32_t __pad_09;
uint32_t frag_offset;
uint32_t __pad_11_12[2];
};
#define IR3_DP_FS(name) dword_offsetof(struct ir3_driver_params_fs, name)
#define IR3_MAX_SHADER_BUFFERS 32
#define IR3_MAX_SHADER_IMAGES 32

View file

@ -5423,15 +5423,15 @@ tu_emit_fdm_params(struct tu_cmd_buffer *cmd,
struct tu_cs *cs, struct tu_shader *fs,
unsigned num_units)
{
STATIC_ASSERT(IR3_DP_FS_FRAG_INVOCATION_COUNT == IR3_DP_FS_DYNAMIC);
STATIC_ASSERT(IR3_DP_FS(frag_invocation_count) == IR3_DP_FS_DYNAMIC);
tu_cs_emit(cs, fs->fs.per_samp ?
cmd->vk.dynamic_graphics_state.ms.rasterization_samples : 1);
tu_cs_emit(cs, 0);
tu_cs_emit(cs, 0);
tu_cs_emit(cs, 0);
STATIC_ASSERT(IR3_DP_FS_FRAG_SIZE == IR3_DP_FS_DYNAMIC + 4);
STATIC_ASSERT(IR3_DP_FS_FRAG_OFFSET == IR3_DP_FS_DYNAMIC + 6);
STATIC_ASSERT(IR3_DP_FS(frag_size) == IR3_DP_FS_DYNAMIC + 4);
STATIC_ASSERT(IR3_DP_FS(frag_offset) == IR3_DP_FS_DYNAMIC + 6);
if (num_units > 1) {
if (fs->fs.has_fdm) {
struct apply_fs_params_state state = {
@ -5885,9 +5885,9 @@ vs_params_offset(struct tu_cmd_buffer *cmd)
return 0;
/* this layout is required by CP_DRAW_INDIRECT_MULTI */
STATIC_ASSERT(IR3_DP_DRAWID == 0);
STATIC_ASSERT(IR3_DP_VTXID_BASE == 1);
STATIC_ASSERT(IR3_DP_INSTID_BASE == 2);
STATIC_ASSERT(IR3_DP_VS(draw_id) == 0);
STATIC_ASSERT(IR3_DP_VS(vtxid_base) == 1);
STATIC_ASSERT(IR3_DP_VS(instid_base) == 2);
/* 0 means disabled for CP_DRAW_INDIRECT_MULTI */
assert(const_state->offsets.driver_param != 0);
@ -6333,6 +6333,29 @@ struct tu_dispatch_info
uint64_t indirect_offset;
};
/* Build the compute-shader driver params for a direct (non-indirect)
 * dispatch, from the dispatch info recorded in the command buffer.
 *
 * Returns the filled-in ir3_driver_params_cs by value; the caller copies
 * it (or a prefix of it) into the const state / indirect buffer.
 */
static inline struct ir3_driver_params_cs
build_driver_params_cs(const struct ir3_shader_variant *variant,
const struct tu_dispatch_info *info)
{
unsigned subgroup_size = variant->info.subgroup_size;
unsigned subgroup_shift = util_logbase2(subgroup_size);
return (struct ir3_driver_params_cs) {
/* gl_NumWorkGroups, straight from the dispatch dimensions: */
.num_work_groups_x = info->blocks[0],
.num_work_groups_y = info->blocks[1],
.num_work_groups_z = info->blocks[2],
/* NOTE(review): work_dim left zero here — presumably unused by this
 * (Vulkan) path; confirm against the GL path which passes
 * info->work_dim.
 */
.work_dim = 0,
.base_group_x = info->offsets[0],
.base_group_y = info->offsets[1],
.base_group_z = info->offsets[2],
.subgroup_size = subgroup_size,
/* NOTE(review): local group size left zero — presumably known at
 * compile time for this path; confirm.
 */
.local_group_size_x = 0,
.local_group_size_y = 0,
.local_group_size_z = 0,
/* Shift used to convert invocation id to subgroup id: */
.subgroup_id_shift = subgroup_shift,
};
}
template <chip CHIP>
static void
tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
@ -6353,27 +6376,15 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
bool direct_indirect_load =
!(info->indirect_offset & 0xf) &&
!(info->indirect && num_consts > IR3_DP_BASE_GROUP_X);
!(info->indirect && num_consts > IR3_DP_CS(base_group_x));
uint64_t iova = 0;
if (!info->indirect) {
uint32_t driver_params[12] = {
[IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
[IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
[IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
[IR3_DP_WORK_DIM] = 0,
[IR3_DP_BASE_GROUP_X] = info->offsets[0],
[IR3_DP_BASE_GROUP_Y] = info->offsets[1],
[IR3_DP_BASE_GROUP_Z] = info->offsets[2],
[IR3_DP_CS_SUBGROUP_SIZE] = subgroup_size,
[IR3_DP_LOCAL_GROUP_SIZE_X] = 0,
[IR3_DP_LOCAL_GROUP_SIZE_Y] = 0,
[IR3_DP_LOCAL_GROUP_SIZE_Z] = 0,
[IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
};
struct ir3_driver_params_cs driver_params =
build_driver_params_cs(variant, info);
assert(num_consts <= ARRAY_SIZE(driver_params));
assert(num_consts <= dword_sizeof(driver_params));
struct tu_cs_memory consts;
uint32_t consts_vec4 = DIV_ROUND_UP(num_consts, 4);
@ -6382,7 +6393,7 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
vk_command_buffer_set_error(&cmd->vk, result);
return;
}
memcpy(consts.map, driver_params, num_consts * sizeof(uint32_t));
memcpy(consts.map, &driver_params, num_consts * sizeof(uint32_t));
iova = consts.iova;
} else if (direct_indirect_load) {
iova = info->indirect->iova + info->indirect_offset;
@ -6406,12 +6417,12 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
/* Fill out IR3_DP_CS_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for
* indirect dispatch.
*/
if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
if (info->indirect && num_consts > IR3_DP_CS(base_group_x)) {
uint32_t indirect_driver_params[8] = {
0, 0, 0, subgroup_size,
0, 0, 0, subgroup_shift,
};
bool emit_local = num_consts > IR3_DP_LOCAL_GROUP_SIZE_X;
bool emit_local = num_consts > IR3_DP_CS(local_group_size_x);
uint32_t emit_size = emit_local ? 8 : 4;
tu_cs_emit_pkt7(cs, CP_MEM_WRITE, 2 + emit_size);
@ -6448,22 +6459,10 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
(variant->constlen - offset) * 4);
if (!info->indirect) {
uint32_t driver_params[12] = {
[IR3_DP_NUM_WORK_GROUPS_X] = info->blocks[0],
[IR3_DP_NUM_WORK_GROUPS_Y] = info->blocks[1],
[IR3_DP_NUM_WORK_GROUPS_Z] = info->blocks[2],
[IR3_DP_WORK_DIM] = 0,
[IR3_DP_BASE_GROUP_X] = info->offsets[0],
[IR3_DP_BASE_GROUP_Y] = info->offsets[1],
[IR3_DP_BASE_GROUP_Z] = info->offsets[2],
[IR3_DP_CS_SUBGROUP_SIZE] = subgroup_size,
[IR3_DP_LOCAL_GROUP_SIZE_X] = 0,
[IR3_DP_LOCAL_GROUP_SIZE_Y] = 0,
[IR3_DP_LOCAL_GROUP_SIZE_Z] = 0,
[IR3_DP_SUBGROUP_ID_SHIFT] = subgroup_shift,
};
struct ir3_driver_params_cs driver_params =
build_driver_params_cs(variant, info);
assert(num_consts <= ARRAY_SIZE(driver_params));
assert(num_consts <= dword_sizeof(driver_params));
/* push constants */
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3 + num_consts);
@ -6474,9 +6473,7 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
CP_LOAD_STATE6_0_NUM_UNIT(num_consts / 4));
tu_cs_emit(cs, 0);
tu_cs_emit(cs, 0);
uint32_t i;
for (i = 0; i < num_consts; i++)
tu_cs_emit(cs, driver_params[i]);
tu_cs_emit_array(cs, (uint32_t *)&driver_params, num_consts);
} else if (!(info->indirect_offset & 0xf)) {
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 3);
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset) |
@ -6518,21 +6515,21 @@ tu_emit_compute_driver_params(struct tu_cmd_buffer *cmd,
/* Fill out IR3_DP_CS_SUBGROUP_SIZE and IR3_DP_SUBGROUP_ID_SHIFT for
* indirect dispatch.
*/
if (info->indirect && num_consts > IR3_DP_BASE_GROUP_X) {
bool emit_local = num_consts > IR3_DP_LOCAL_GROUP_SIZE_X;
if (info->indirect && num_consts > IR3_DP_CS(base_group_x)) {
bool emit_local = num_consts > IR3_DP_CS(local_group_size_x);
tu_cs_emit_pkt7(cs, tu6_stage2opcode(type), 7 + (emit_local ? 4 : 0));
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_BASE_GROUP_X / 4)) |
tu_cs_emit(cs, CP_LOAD_STATE6_0_DST_OFF(offset + (IR3_DP_CS(base_group_x) / 4)) |
CP_LOAD_STATE6_0_STATE_TYPE(ST6_CONSTANTS) |
CP_LOAD_STATE6_0_STATE_SRC(SS6_DIRECT) |
CP_LOAD_STATE6_0_STATE_BLOCK(tu6_stage2shadersb(type)) |
CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_BASE_GROUP_X) / 4));
CP_LOAD_STATE6_0_NUM_UNIT((num_consts - IR3_DP_CS(base_group_x)) / 4));
tu_cs_emit_qw(cs, 0);
tu_cs_emit(cs, 0); /* BASE_GROUP_X */
tu_cs_emit(cs, 0); /* BASE_GROUP_Y */
tu_cs_emit(cs, 0); /* BASE_GROUP_Z */
tu_cs_emit(cs, subgroup_size);
if (emit_local) {
assert(num_consts == align(IR3_DP_SUBGROUP_ID_SHIFT, 4));
assert(num_consts == align(IR3_DP_CS(subgroup_id_shift), 4));
tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_X */
tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Y */
tu_cs_emit(cs, 0); /* LOCAL_GROUP_SIZE_Z */

View file

@ -511,9 +511,9 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
if (!dev->compiler->load_shader_consts_via_preamble)
return false;
enum ir3_driver_param param =
unsigned param =
instr->intrinsic == nir_intrinsic_load_frag_size_ir3 ?
IR3_DP_FS_FRAG_SIZE : IR3_DP_FS_FRAG_OFFSET;
IR3_DP_FS(frag_size) : IR3_DP_FS(frag_offset);
unsigned offset = param - IR3_DP_FS_DYNAMIC;
@ -531,7 +531,7 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr,
nir_def *result =
ir3_load_driver_ubo(b, 1, &shader->const_state.fdm_ubo,
IR3_DP_FS_FRAG_INVOCATION_COUNT -
IR3_DP_FS(frag_invocation_count) -
IR3_DP_FS_DYNAMIC);
nir_def_replace(&instr->def, result);

View file

@ -59,12 +59,12 @@ cs_program_emit(struct fd_ringbuffer *ring, struct ir3_shader_variant *v)
num_wg_id, work_dim_id, unused_id;
local_invocation_id =
ir3_find_sysval_regid(v, SYSTEM_VALUE_LOCAL_INVOCATION_ID);
work_group_id = driver_param_base + IR3_DP_WORKGROUP_ID_X;
num_wg_id = driver_param_base + IR3_DP_NUM_WORK_GROUPS_X;
local_group_size_id = driver_param_base + IR3_DP_LOCAL_GROUP_SIZE_X;
work_dim_id = driver_param_base + IR3_DP_WORK_DIM;
work_group_id = driver_param_base + IR3_DP_CS(workgroup_id_x);
num_wg_id = driver_param_base + IR3_DP_CS(num_work_groups_x);
local_group_size_id = driver_param_base + IR3_DP_CS(local_group_size_x);
work_dim_id = driver_param_base + IR3_DP_CS(work_dim);
/* NOTE: At some point we'll want to use this, it's probably WGOFFSETCONSTID */
unused_id = driver_param_base + IR3_DP_BASE_GROUP_X;
unused_id = driver_param_base + IR3_DP_CS(base_group_x);
OUT_PKT0(ring, REG_A4XX_HLSQ_CL_CONTROL_0, 2);
OUT_RING(ring, A4XX_HLSQ_CL_CONTROL_0_WGIDCONSTID(work_group_id) |

View file

@ -281,19 +281,29 @@ fd6_build_driver_params(struct fd6_emit *emit)
return NULL;
}
unsigned size_dwords = num_dp * (4 + IR3_DP_VS_COUNT); /* 4dw PKT7 header */
bool needs_ucp = !!emit->vs->key.ucp_enables;
if (PIPELINE == HAS_TESS_GS) {
needs_ucp |= emit->gs && emit->gs->key.ucp_enables;
needs_ucp |= emit->hs && emit->hs->key.ucp_enables;
needs_ucp |= emit->ds && emit->ds->key.ucp_enables;
}
struct ir3_driver_params_vs p =
ir3_build_driver_params_vs(ctx, emit->info, emit->draw, emit->draw_id, needs_ucp);
unsigned size_dwords =
num_dp * (4 + dword_sizeof(p)); /* 4dw PKT7 header */
struct fd_ringbuffer *dpconstobj = fd_submit_new_ringbuffer(
ctx->batch->submit, size_dwords * 4, FD_RINGBUFFER_STREAMING);
if (emit->vs->need_driver_params) {
ir3_emit_driver_params(emit->vs, dpconstobj, ctx, emit->info,
emit->indirect, emit->draw, emit->draw_id);
ir3_emit_driver_params(emit->vs, dpconstobj, ctx, emit->info, emit->indirect, &p);
}
if (PIPELINE == HAS_TESS_GS) {
if (emit->gs && emit->gs->need_driver_params) {
ir3_emit_driver_params(emit->gs, dpconstobj, ctx, emit->info,
emit->indirect, emit->draw, 0);
ir3_emit_driver_params(emit->gs, dpconstobj, ctx, emit->info, emit->indirect, &p);
}
if (emit->hs && emit->hs->need_driver_params) {
@ -301,8 +311,7 @@ fd6_build_driver_params(struct fd6_emit *emit)
}
if (emit->ds && emit->ds->need_driver_params) {
ir3_emit_driver_params(emit->ds, dpconstobj, ctx, emit->info,
emit->indirect, emit->draw, 0);
ir3_emit_driver_params(emit->ds, dpconstobj, ctx, emit->info, emit->indirect, &p);
}
}

View file

@ -430,35 +430,44 @@ emit_kernel_params(struct fd_context *ctx, const struct ir3_shader_variant *v,
}
}
/* Build the vertex-shader driver params for a draw.
 *
 * ctx:       current context (source of streamout + user-clip state)
 * info:      draw info (index size, start instance, ...)
 * draw:      per-draw start/count/bias
 * draw_id:   value for gl_DrawID; for indirect draws this slot is
 *            overwritten by the hw (CP_DRAW_INDIRECT_MULTI)
 * needs_ucp: whether to also fill in the user-clip-plane vec4's
 *
 * Returns the params struct by value for the caller to upload.
 */
static inline struct ir3_driver_params_vs
ir3_build_driver_params_vs(struct fd_context *ctx,
const struct pipe_draw_info *info,
const struct pipe_draw_start_count_bias *draw,
uint32_t draw_id, bool needs_ucp)
assert_dt
{
struct ir3_driver_params_vs vertex_params = {
.draw_id = draw_id, /* filled by hw (CP_DRAW_INDIRECT_MULTI) */
/* base vertex: index_bias for indexed draws, else the start vertex: */
.vtxid_base = info->index_size ? draw->index_bias : draw->start,
.instid_base = info->start_instance,
.vtxcnt_max = ctx->streamout.max_tf_vtx,
.is_indexed_draw = info->index_size != 0 ? ~0 : 0, /* boolean: 0 or ~0 */
};
if (needs_ucp) {
/* Copy all 8 user-clip-plane vec4's as raw float bits: */
struct pipe_clip_state *ucp = &ctx->ucp;
for (unsigned i = 0; i < ARRAY_SIZE(vertex_params.ucp); i++) {
vertex_params.ucp[i].x = fui(ucp->ucp[i][0]);
vertex_params.ucp[i].y = fui(ucp->ucp[i][1]);
vertex_params.ucp[i].z = fui(ucp->ucp[i][2]);
vertex_params.ucp[i].w = fui(ucp->ucp[i][3]);
}
}
return vertex_params;
}
static inline void
ir3_emit_driver_params(const struct ir3_shader_variant *v,
struct fd_ringbuffer *ring, struct fd_context *ctx,
const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count_bias *draw,
const uint32_t draw_id) assert_dt
const struct ir3_driver_params_vs *vertex_params)
assert_dt
{
assert(v->need_driver_params);
const struct ir3_const_state *const_state = ir3_const_state(v);
uint32_t offset = const_state->offsets.driver_param;
uint32_t vertex_params[IR3_DP_VS_COUNT] = {
[IR3_DP_DRAWID] = draw_id, /* filled by hw (CP_DRAW_INDIRECT_MULTI) */
[IR3_DP_VTXID_BASE] = info->index_size ? draw->index_bias : draw->start,
[IR3_DP_INSTID_BASE] = info->start_instance,
[IR3_DP_VTXCNT_MAX] = ctx->streamout.max_tf_vtx,
[IR3_DP_IS_INDEXED_DRAW] = info->index_size != 0 ? ~0 : 0,
};
if (v->key.ucp_enables) {
struct pipe_clip_state *ucp = &ctx->ucp;
unsigned pos = IR3_DP_UCP0_X;
for (unsigned i = 0; pos <= IR3_DP_UCP7_W; i++) {
for (unsigned j = 0; j < 4; j++) {
vertex_params[pos] = fui(ucp->ucp[i][j]);
pos++;
}
}
}
/* Only emit as many params as needed, i.e. up to the highest enabled UCP
* plane. However a binning pass may drop even some of these, so limit to
@ -466,7 +475,7 @@ ir3_emit_driver_params(const struct ir3_shader_variant *v,
*/
const uint32_t vertex_params_size =
MIN2(const_state->num_driver_params, (v->constlen - offset) * 4);
assert(vertex_params_size <= IR3_DP_VS_COUNT);
assert(vertex_params_size <= dword_sizeof(*vertex_params));
/* for indirect draw, we need to copy VTXID_BASE from
* indirect-draw parameters buffer.. which is annoying
@ -501,15 +510,28 @@ ir3_emit_driver_params(const struct ir3_shader_variant *v,
pipe_resource_reference(&vertex_params_rsc, NULL);
} else {
emit_const_user(ring, v, offset * 4, vertex_params_size, vertex_params);
emit_const_user(ring, v, offset * 4, vertex_params_size, (uint32_t *)vertex_params);
}
/* if needed, emit stream-out buffer addresses: */
if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) {
if (vertex_params->vtxcnt_max > 0) {
emit_tfbos(ctx, v, ring);
}
}
/* Build the TCS driver params: the default outer/inner tess levels
 * (used when no TCS writes them), taken from context state and packed
 * as raw float bits.
 */
static inline struct ir3_driver_params_tcs
ir3_build_driver_params_tcs(struct fd_context *ctx)
assert_dt
{
return (struct ir3_driver_params_tcs) {
.default_outer_level_x = fui(ctx->default_outer_level[0]),
.default_outer_level_y = fui(ctx->default_outer_level[1]),
.default_outer_level_z = fui(ctx->default_outer_level[2]),
.default_outer_level_w = fui(ctx->default_outer_level[3]),
.default_inner_level_x = fui(ctx->default_inner_level[0]),
.default_inner_level_y = fui(ctx->default_inner_level[1]),
};
}
static inline void
ir3_emit_hs_driver_params(const struct ir3_shader_variant *v,
@ -521,20 +543,13 @@ ir3_emit_hs_driver_params(const struct ir3_shader_variant *v,
const struct ir3_const_state *const_state = ir3_const_state(v);
uint32_t offset = const_state->offsets.driver_param;
uint32_t hs_params[IR3_DP_HS_COUNT] = {
[IR3_DP_HS_DEFAULT_OUTER_LEVEL_X] = fui(ctx->default_outer_level[0]),
[IR3_DP_HS_DEFAULT_OUTER_LEVEL_Y] = fui(ctx->default_outer_level[1]),
[IR3_DP_HS_DEFAULT_OUTER_LEVEL_Z] = fui(ctx->default_outer_level[2]),
[IR3_DP_HS_DEFAULT_OUTER_LEVEL_W] = fui(ctx->default_outer_level[3]),
[IR3_DP_HS_DEFAULT_INNER_LEVEL_X] = fui(ctx->default_inner_level[0]),
[IR3_DP_HS_DEFAULT_INNER_LEVEL_Y] = fui(ctx->default_inner_level[1]),
};
struct ir3_driver_params_tcs hs_params = ir3_build_driver_params_tcs(ctx);
const uint32_t hs_params_size =
MIN2(const_state->num_driver_params, (v->constlen - offset) * 4);
assert(hs_params_size <= IR3_DP_HS_COUNT);
assert(hs_params_size <= dword_sizeof(hs_params));
emit_const_user(ring, v, offset * 4, hs_params_size, hs_params);
emit_const_user(ring, v, offset * 4, hs_params_size, (uint32_t *)&hs_params);
}
@ -552,7 +567,11 @@ ir3_emit_vs_consts(const struct ir3_shader_variant *v,
/* emit driver params every time: */
if (info && v->need_driver_params) {
ring_wfi(ctx->batch, ring);
ir3_emit_driver_params(v, ring, ctx, info, indirect, draw, 0);
struct ir3_driver_params_vs p =
ir3_build_driver_params_vs(ctx, info, draw, 0, v->key.ucp_enables);
ir3_emit_driver_params(v, ring, ctx, info, indirect, &p);
}
}
@ -565,6 +584,29 @@ ir3_emit_fs_consts(const struct ir3_shader_variant *v,
emit_common_consts(v, ring, ctx, PIPE_SHADER_FRAGMENT);
}
/* Build the compute-shader driver params for a grid launch (gallium
 * path), from pipe_grid_info plus the variant's subgroup size.
 *
 * Returns the params struct by value for the caller to upload.
 */
static inline struct ir3_driver_params_cs
ir3_build_driver_params_cs(const struct ir3_shader_variant *v,
const struct pipe_grid_info *info)
{
return (struct ir3_driver_params_cs) {
/* gl_NumWorkGroups: */
.num_work_groups_x = info->grid[0],
.num_work_groups_y = info->grid[1],
.num_work_groups_z = info->grid[2],
.work_dim = info->work_dim,
/* base workgroup id, from ARB_compute_variable_group_size style
 * grid_base offsets:
 */
.base_group_x = info->grid_base[0],
.base_group_y = info->grid_base[1],
.base_group_z = info->grid_base[2],
.subgroup_size = v->info.subgroup_size,
.local_group_size_x = info->block[0],
.local_group_size_y = info->block[1],
.local_group_size_z = info->block[2],
/* Shift used to convert invocation id to subgroup id: */
.subgroup_id_shift = util_logbase2(v->info.subgroup_size),
.workgroup_id_x = 0, // TODO
.workgroup_id_y = 0, // TODO
.workgroup_id_z = 0, // TODO
};
}
static inline void
ir3_emit_cs_driver_params(const struct ir3_shader_variant *v,
struct fd_ringbuffer *ring, struct fd_context *ctx,
@ -614,27 +656,12 @@ ir3_emit_cs_driver_params(const struct ir3_shader_variant *v,
} else {
// TODO some of these are not part of the indirect state.. so we
// need to emit some of this directly in both cases.
uint32_t compute_params[IR3_DP_CS_COUNT] = {
[IR3_DP_NUM_WORK_GROUPS_X] = info->grid[0],
[IR3_DP_NUM_WORK_GROUPS_Y] = info->grid[1],
[IR3_DP_NUM_WORK_GROUPS_Z] = info->grid[2],
[IR3_DP_WORK_DIM] = info->work_dim,
[IR3_DP_BASE_GROUP_X] = info->grid_base[0],
[IR3_DP_BASE_GROUP_Y] = info->grid_base[1],
[IR3_DP_BASE_GROUP_Z] = info->grid_base[2],
[IR3_DP_CS_SUBGROUP_SIZE] = v->info.subgroup_size,
[IR3_DP_LOCAL_GROUP_SIZE_X] = info->block[0],
[IR3_DP_LOCAL_GROUP_SIZE_Y] = info->block[1],
[IR3_DP_LOCAL_GROUP_SIZE_Z] = info->block[2],
[IR3_DP_SUBGROUP_ID_SHIFT] = util_logbase2(v->info.subgroup_size),
[IR3_DP_WORKGROUP_ID_X] = 0, // TODO
[IR3_DP_WORKGROUP_ID_Y] = 0, // TODO
[IR3_DP_WORKGROUP_ID_Z] = 0, // TODO
};
struct ir3_driver_params_cs compute_params =
ir3_build_driver_params_cs(v, info);
uint32_t size =
MIN2(const_state->num_driver_params, v->constlen * 4 - offset * 4);
emit_const_user(ring, v, offset * 4, size, compute_params);
emit_const_user(ring, v, offset * 4, size, (uint32_t *)&compute_params);
}
}
}