freedreno, tu, ir3: Enable tiled workgroup item dispatch on a7xx

There is a 1.6% improvement in the Sacha Willems computeshader demo.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30758>
Authored by Connor Abbott on 2024-08-20 13:01:09 -04:00; committed by Marge Bot.
parent 58ed1854c4
commit 70934f3015
6 changed files with 45 additions and 13 deletions

View file

@ -225,9 +225,17 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
OUT_REG(ring,
SP_CS_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0),
.threadsize = thrsz));
if (CHIP == A7XX) {
/* TODO allow the shader to control the tiling */
OUT_REG(ring,
SP_CS_CNTL_1(A7XX, .linearlocalidregid = regid(63, 0),
.threadsize = thrsz,
.workitemrastorder = WORKITEMRASTORDER_LINEAR));
} else {
OUT_REG(ring,
SP_CS_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0),
.threadsize = thrsz));
}
}
OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2);

View file

@ -607,9 +607,23 @@ lower_subgroup_id_filter(const nir_instr *instr, const void *unused)
}
static nir_def *
lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused)
lower_subgroup_id(nir_builder *b, nir_instr *instr, void *_shader)
{
(void)unused;
struct ir3_shader *shader = _shader;
/* Vulkan allows implementations to tile workgroup invocations even when
* subgroup operations are involved, which is implied by this Note:
*
* "There is no direct relationship between SubgroupLocalInvocationId and
* LocalInvocationId or LocalInvocationIndex."
*
* However there is no way to get SubgroupId directly, so we have to use
* LocalInvocationIndex here. This means that whenever we do this lowering we
* have to force linear dispatch to make sure that the relation between
* SubgroupId/SubgroupLocalInvocationId and LocalInvocationIndex is what we
* expect.
*/
shader->cs.force_linear_dispatch = true;
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
@ -638,10 +652,10 @@ lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused)
}
static bool
ir3_nir_lower_subgroup_id_cs(nir_shader *shader)
ir3_nir_lower_subgroup_id_cs(nir_shader *nir, struct ir3_shader *shader)
{
return nir_shader_lower_instructions(shader, lower_subgroup_id_filter,
lower_subgroup_id, NULL);
return nir_shader_lower_instructions(nir, lower_subgroup_id_filter,
lower_subgroup_id, shader);
}
/**
@ -764,7 +778,7 @@ ir3_nir_post_finalize(struct ir3_shader *shader)
if ((s->info.stage == MESA_SHADER_COMPUTE) ||
(s->info.stage == MESA_SHADER_KERNEL)) {
bool progress = false;
NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs);
NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs, shader);
/* ir3_nir_lower_subgroup_id_cs creates extra compute intrinsics which
* we need to lower again.

View file

@ -410,6 +410,11 @@ create_variant(struct ir3_shader *shader, const struct ir3_shader_key *key,
shader->nir_finalized = true;
}
if (v->type == MESA_SHADER_COMPUTE ||
v->type == MESA_SHADER_KERNEL) {
v->cs.force_linear_dispatch = shader->cs.force_linear_dispatch;
}
if (!compile_variant(shader, v))
goto fail;

View file

@ -837,6 +837,7 @@ struct ir3_shader_variant {
struct {
unsigned req_input_mem;
unsigned req_local_mem;
bool force_linear_dispatch;
} cs;
};
@ -909,6 +910,7 @@ struct ir3_shader {
struct {
unsigned req_input_mem; /* in dwords */
unsigned req_local_mem;
bool force_linear_dispatch;
} cs;
/* For vertex shaders: */
struct {

View file

@ -1452,8 +1452,10 @@ tu6_emit_cs_config(struct tu_cs *cs,
SP_CS_CNTL_1(CHIP,
.linearlocalidregid = regid(63, 0),
.threadsize = thrsz_cs,
/* A7XX TODO: enable WORKITEMRASTORDER_TILED when we don't use subgroup ops. */
.workitemrastorder = WORKITEMRASTORDER_LINEAR, ));
.workitemrastorder =
v->cs.force_linear_dispatch ?
WORKITEMRASTORDER_LINEAR :
WORKITEMRASTORDER_TILED, ));
tu_cs_emit_regs(
cs, A7XX_HLSQ_CS_LOCAL_SIZE(.localsizex = v->local_size[0] - 1,

View file

@ -136,8 +136,9 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
CHIP,
.linearlocalidregid = INVALID_REG,
.threadsize = thrsz_cs,
/* A7XX TODO: enable WORKITEMRASTORDER_TILED when we don't use subgroup ops. */
.workitemrastorder = WORKITEMRASTORDER_LINEAR,
.workitemrastorder =
v->cs.force_linear_dispatch ? WORKITEMRASTORDER_LINEAR
: WORKITEMRASTORDER_TILED,
)
);
OUT_REG(ring,