From 70934f3015ef19ebd692673b3c2a4c8bfefbe765 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Tue, 20 Aug 2024 13:01:09 -0400 Subject: [PATCH] freedreno, tu, ir3: Enable tiled workgroup item dispatch on a7xx There is a 1.6% improvement in the Sascha Willems computeshader demo. Part-of: --- src/freedreno/computerator/a6xx.cc | 14 +++++++--- src/freedreno/ir3/ir3_nir.c | 26 ++++++++++++++----- src/freedreno/ir3/ir3_shader.c | 5 ++++ src/freedreno/ir3/ir3_shader.h | 2 ++ src/freedreno/vulkan/tu_shader.cc | 6 +++-- .../drivers/freedreno/a6xx/fd6_compute.cc | 5 ++-- 6 files changed, 45 insertions(+), 13 deletions(-) diff --git a/src/freedreno/computerator/a6xx.cc b/src/freedreno/computerator/a6xx.cc index 2a504ee2608..65ac3a42f36 100644 --- a/src/freedreno/computerator/a6xx.cc +++ b/src/freedreno/computerator/a6xx.cc @@ -225,9 +225,17 @@ cs_program_emit(struct fd_ringbuffer *ring, struct kernel *kernel) A6XX_SP_CS_CNTL_0_WGSIZECONSTID(regid(63, 0)) | A6XX_SP_CS_CNTL_0_WGOFFSETCONSTID(regid(63, 0)) | A6XX_SP_CS_CNTL_0_LOCALIDREGID(local_invocation_id)); - OUT_REG(ring, - SP_CS_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0), - .threadsize = thrsz)); + if (CHIP == A7XX) { + /* TODO allow the shader to control the tiling */ + OUT_REG(ring, + SP_CS_CNTL_1(A7XX, .linearlocalidregid = regid(63, 0), + .threadsize = thrsz, + .workitemrastorder = WORKITEMRASTORDER_LINEAR)); + } else { + OUT_REG(ring, + SP_CS_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0), + .threadsize = thrsz)); + } } OUT_PKT4(ring, REG_A6XX_SP_CS_OBJ_START, 2); diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c index 374d87f7c7a..211af74417d 100644 --- a/src/freedreno/ir3/ir3_nir.c +++ b/src/freedreno/ir3/ir3_nir.c @@ -607,9 +607,23 @@ lower_subgroup_id_filter(const nir_instr *instr, const void *unused) } static nir_def * -lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused) +lower_subgroup_id(nir_builder *b, nir_instr *instr, void *_shader) { - (void)unused; + struct
ir3_shader *shader = _shader; + + /* Vulkan allows implementations to tile workgroup invocations even when + * subgroup operations are involved, which is implied by this Note: + * + * "There is no direct relationship between SubgroupLocalInvocationId and + * LocalInvocationId or LocalInvocationIndex." + * + * However there is no way to get SubgroupId directly, so we have to use + * LocalInvocationIndex here. This means that whenever we do this lowering we + * have to force linear dispatch to make sure that the relation between + * SubgroupId/SubgroupLocalInvocationId and LocalInvocationIndex is what we + * expect. + */ + shader->cs.force_linear_dispatch = true; nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); if (intr->intrinsic == nir_intrinsic_load_subgroup_invocation) { @@ -638,10 +652,10 @@ lower_subgroup_id(nir_builder *b, nir_instr *instr, void *unused) } static bool -ir3_nir_lower_subgroup_id_cs(nir_shader *shader) +ir3_nir_lower_subgroup_id_cs(nir_shader *nir, struct ir3_shader *shader) { - return nir_shader_lower_instructions(shader, lower_subgroup_id_filter, - lower_subgroup_id, NULL); + return nir_shader_lower_instructions(nir, lower_subgroup_id_filter, + lower_subgroup_id, shader); } /** @@ -764,7 +778,7 @@ ir3_nir_post_finalize(struct ir3_shader *shader) if ((s->info.stage == MESA_SHADER_COMPUTE) || (s->info.stage == MESA_SHADER_KERNEL)) { bool progress = false; - NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs); + NIR_PASS(progress, s, ir3_nir_lower_subgroup_id_cs, shader); /* ir3_nir_lower_subgroup_id_cs creates extra compute intrinsics which * we need to lower again. 
diff --git a/src/freedreno/ir3/ir3_shader.c b/src/freedreno/ir3/ir3_shader.c index 8863a95336a..7dd6be96c48 100644 --- a/src/freedreno/ir3/ir3_shader.c +++ b/src/freedreno/ir3/ir3_shader.c @@ -410,6 +410,11 @@ create_variant(struct ir3_shader *shader, const struct ir3_shader_key *key, shader->nir_finalized = true; } + if (v->type == MESA_SHADER_COMPUTE || + v->type == MESA_SHADER_KERNEL) { + v->cs.force_linear_dispatch = shader->cs.force_linear_dispatch; + } + if (!compile_variant(shader, v)) goto fail; diff --git a/src/freedreno/ir3/ir3_shader.h b/src/freedreno/ir3/ir3_shader.h index 54ed2d77132..0f7f9653281 100644 --- a/src/freedreno/ir3/ir3_shader.h +++ b/src/freedreno/ir3/ir3_shader.h @@ -837,6 +837,7 @@ struct ir3_shader_variant { struct { unsigned req_input_mem; unsigned req_local_mem; + bool force_linear_dispatch; } cs; }; @@ -909,6 +910,7 @@ struct ir3_shader { struct { unsigned req_input_mem; /* in dwords */ unsigned req_local_mem; + bool force_linear_dispatch; } cs; /* For vertex shaders: */ struct { diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index d19bb08ba14..44f4e5f62ef 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -1452,8 +1452,10 @@ tu6_emit_cs_config(struct tu_cs *cs, SP_CS_CNTL_1(CHIP, .linearlocalidregid = regid(63, 0), .threadsize = thrsz_cs, - /* A7XX TODO: enable WORKITEMRASTORDER_TILED when we don't use subgroup ops. */ - .workitemrastorder = WORKITEMRASTORDER_LINEAR, )); + .workitemrastorder = + v->cs.force_linear_dispatch ? 
+ WORKITEMRASTORDER_LINEAR : + WORKITEMRASTORDER_TILED, )); tu_cs_emit_regs( cs, A7XX_HLSQ_CS_LOCAL_SIZE(.localsizex = v->local_size[0] - 1, diff --git a/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc b/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc index fef92fb96a1..f96f8a16ae2 100644 --- a/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc +++ b/src/gallium/drivers/freedreno/a6xx/fd6_compute.cc @@ -136,8 +136,9 @@ cs_program_emit(struct fd_context *ctx, struct fd_ringbuffer *ring, CHIP, .linearlocalidregid = INVALID_REG, .threadsize = thrsz_cs, - /* A7XX TODO: enable WORKITEMRASTORDER_TILED when we don't use subgroup ops. */ - .workitemrastorder = WORKITEMRASTORDER_LINEAR, + .workitemrastorder = + v->cs.force_linear_dispatch ? WORKITEMRASTORDER_LINEAR + : WORKITEMRASTORDER_TILED, ) ); OUT_REG(ring,