freedreno, tu, ir3: Enable tiled workgroup item dispatch on a7xx
There is a 1.6% improvement in the Sascha Willems computeshader demo.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30758>

parent 58ed1854c4
commit 70934f3015

6 changed files with 45 additions and 13 deletions
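A note on terminology before the hunks: the WORKITEMRASTORDER field controls the order in which the hardware numbers a workgroup's invocations. Linear order walks LocalInvocationIndex with x varying fastest; tiled order walks the local grid in small rectangular tiles, which tends to improve cache locality for 2D access patterns and is where the speedup above comes from. The standalone C sketch below only contrasts the two orders; the 4x4 tile size is an assumption for illustration, not documented a7xx behavior.

/* Illustrative sketch: enumerate an 8x8 workgroup in linear vs. tiled order.
 * The 4x4 tile is an assumed parameter, not the hardware's real geometry. */
#include <stdio.h>

#define SIZE_X 8
#define SIZE_Y 8
#define TILE   4 /* assumption for illustration */

int main(void)
{
   printf("linear order:\n");
   for (int i = 0; i < SIZE_X * SIZE_Y; i++)
      printf("  item %2d -> local_id (%d, %d)\n", i, i % SIZE_X, i / SIZE_X);

   printf("tiled order (assumed %dx%d tiles):\n", TILE, TILE);
   int i = 0;
   for (int ty = 0; ty < SIZE_Y; ty += TILE)
      for (int tx = 0; tx < SIZE_X; tx += TILE)
         for (int y = ty; y < ty + TILE; y++)
            for (int x = tx; x < tx + TILE; x++)
               printf("  item %2d -> local_id (%d, %d)\n", i++, x, y);
   return 0;
}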
@@ -225,9 +225,17 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel)
             A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) |
             A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) |
             A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id));
-   OUT_REG(ring,
-           SP_CS_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0),
-                        .threadsize = thrsz));
+   if (CHIP == A7XX) {
+      /* TODO allow the shader to control the tiling */
+      OUT_REG(ring,
+              SP_CS_CNTL_1(A7XX, .linearlocalidregid = regid(63, 0),
+                           .threadsize = thrsz,
+                           .workitemrastorder = WORKITEMRASTORDER_LINEAR));
+   } else {
+      OUT_REG(ring,
+              SP_CS_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0),
+                           .threadsize = thrsz));
+   }

    OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2);
@@ -607,9 +607,23 @@ lower_subgroup_id_filter(const nir_instr *instr, const void *unused)
 }

 static nir_def *
-lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused)
+lower_subgroup_id(nir_builder *b, nir_instr *instr, void *_shader)
 {
-   (void)unused;
+   struct ir3_shader *shader = _shader;
+
+   /* Vulkan allows implementations to tile workgroup invocations even when
+    * subgroup operations are involved, which is implied by this Note:
+    *
+    *    "There is no direct relationship between SubgroupLocalInvocationId and
+    *    LocalInvocationId or LocalInvocationIndex."
+    *
+    * However there is no way to get SubgroupId directly, so we have to use
+    * LocalInvocationIndex here. This means that whenever we do this lowering we
+    * have to force linear dispatch to make sure that the relation between
+    * SubgroupId/SubgroupLocalInvocationId and LocalInvocationIndex is what we
+    * expect.
+    */
+   shader->cs.force_linear_dispatch = true;

    nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
    if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) {
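The comment in this hunk carries the key reasoning: SubgroupId has to be synthesized from LocalInvocationIndex, and that synthesis is only valid when invocations are numbered linearly. A minimal C sketch of the invariant (the real pass emits equivalent NIR, not this C; the helper and struct names are made up):

#include <assert.h>

struct subgroup_pos {
   unsigned subgroup_id;
   unsigned subgroup_invocation;
};

/* Valid only under linear dispatch, where invocations are numbered in
 * LocalInvocationIndex order -- exactly what force_linear_dispatch ensures. */
static struct subgroup_pos
split_local_index(unsigned local_invocation_index, unsigned subgroup_size)
{
   assert(subgroup_size > 0);
   return (struct subgroup_pos){
      .subgroup_id = local_invocation_index / subgroup_size,
      .subgroup_invocation = local_invocation_index % subgroup_size,
   };
}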
@@ -638,10 +652,10 @@ lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused)
 }

 static bool
-ir3_nir_lower_subgroup_id_cs(nir_shader *shader)
+ir3_nir_lower_subgroup_id_cs(nir_shader *nir, struct ir3_shader *shader)
 {
-   return nir_shader_lower_instructions(shader, lower_subgroup_id_filter,
-                                        lower_subgroup_id, NULL);
+   return nir_shader_lower_instructions(nir, lower_subgroup_id_filter,
+                                        lower_subgroup_id, shader);
 }

 /**
@@ -764,7 +778,7 @@ ir3_nir_post_finalize(struct ir3_shader *shader)
    if ((s->info.stage == MESA_SHADER_COMPUTE) ||
        (s->info.stage == MESA_SHADER_KERNEL)) {
       bool progress = false;
-      NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs);
+      NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs, shader);

       /* ir3_nir_lower_subgroup_id_cs creates extra compute intrinsics which
        * we need to lower again.
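For readers new to NIR, the hunks above use the standard filter/lower pattern of nir_shader_lower_instructions: a filter callback selects instructions, the lower callback returns a replacement SSA def, and the cb_data pointer (here the ir3_shader) threads state into the callback. A minimal sketch of that plumbing, assuming a hypothetical pass that pins load_subgroup_size to a fixed value:

#include "nir.h"
#include "nir_builder.h"

static bool
filter_subgroup_size(const nir_instr *instr, const void *data)
{
   return instr->type == nir_instr_type_intrinsic &&
          nir_instr_as_intrinsic(instr)->intrinsic ==
             nir_intrinsic_load_subgroup_size;
}

static nir_def *
lower_subgroup_size(nir_builder *b, nir_instr *instr, void *data)
{
   const unsigned *size = data;
   /* The returned def replaces all uses of the matched intrinsic. */
   return nir_imm_int(b, *size);
}

static bool
lower_fixed_subgroup_size(nir_shader *nir, unsigned size)
{
   return nir_shader_lower_instructions(nir, filter_subgroup_size,
                                        lower_subgroup_size, &size);
}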
@@ -410,6 +410,11 @@ create_variant(struct ir3_shader *shader, const struct ir3_shader_key *key,
       shader->nir_finalized = true;
    }

+   if (v->type == MESA_SHADER_COMPUTE ||
+       v->type == MESA_SHADER_KERNEL) {
+      v->cs.force_linear_dispatch = shader->cs.force_linear_dispatch;
+   }
+
    if (!compile_variant(shader, v))
       goto fail;
@@ -837,6 +837,7 @@ struct ir3_shader_variant {
    struct {
       unsigned req_input_mem;
       unsigned req_local_mem;
+      bool force_linear_dispatch;
    } cs;
 };
@@ -909,6 +910,7 @@ struct ir3_shader {
    struct {
       unsigned req_input_mem; /* in dwords */
       unsigned req_local_mem;
+      bool force_linear_dispatch;
    } cs;
    /* For vertex shaders: */
    struct {
@@ -1452,8 +1452,10 @@ tu6_emit_cs_config(struct tu_cs *cs,
          SP_CS_CNTL_1(CHIP,
                .linearlocalidregid = regid(63, 0),
                .threadsize = thrsz_cs,
-               /* A7XX TODO: enable WORKITEMRASTORDER_TILED when we don't use subgroup ops. */
-               .workitemrastorder = WORKITEMRASTORDER_LINEAR, ));
+               .workitemrastorder =
+                  v->cs.force_linear_dispatch ?
+                     WORKITEMRASTORDER_LINEAR :
+                     WORKITEMRASTORDER_TILED, ));

    tu_cs_emit_regs(
       cs, A7XX_HLSQ_CS_LOCAL_SIZE(.localsizex = v->local_size[0] - 1,
@@ -136,8 +136,9 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring,
          CHIP,
          .linearlocalidregid = INVALID_REG,
          .threadsize = thrsz_cs,
-         /* A7XX TODO: enable WORKITEMRASTORDER_TILED when we don't use subgroup ops. */
-         .workitemrastorder = WORKITEMRASTORDER_LINEAR,
+         .workitemrastorder =
+            v->cs.force_linear_dispatch ? WORKITEMRASTORDER_LINEAR
+                                        : WORKITEMRASTORDER_TILED,
       )
    );
    OUT_REG(ring,