From bf203fbf208f220325007f875a98ebe4ef974dde Mon Sep 17 00:00:00 2001
From: Job Noorman
Date: Thu, 26 Jun 2025 11:35:55 +0200
Subject: [PATCH] ir3: add codegen for movs

movs is just nir_intrinsic_read_invocation so this is a matter of
disabling the current lowering to
nir_intrinsic_read_invocation_cond_ir3 and adding lowering to movs.

Signed-off-by: Job Noorman
Part-of:
---
 src/freedreno/common/freedreno_dev_info.h |  1 +
 src/freedreno/common/freedreno_devices.py |  2 ++
 src/freedreno/ir3/ir3.h                   | 23 +++++++++++++++
 src/freedreno/ir3/ir3_compiler.c          |  1 +
 src/freedreno/ir3/ir3_compiler.h          |  3 ++
 src/freedreno/ir3/ir3_compiler_nir.c      | 35 ++++++++++++++++++++++-
 src/freedreno/ir3/ir3_lower_subgroups.c   |  2 ++
 7 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/src/freedreno/common/freedreno_dev_info.h b/src/freedreno/common/freedreno_dev_info.h
index b6fef7781cf..f74baa908b1 100644
--- a/src/freedreno/common/freedreno_dev_info.h
+++ b/src/freedreno/common/freedreno_dev_info.h
@@ -135,6 +135,7 @@ struct fd_dev_info {
 
       bool has_getfiberid;
       bool mov_half_shared_quirk;
+      bool has_movs;
 
       bool has_dp2acc;
       bool has_dp4acc;
diff --git a/src/freedreno/common/freedreno_devices.py b/src/freedreno/common/freedreno_devices.py
index cf1165299e3..0f606001310 100644
--- a/src/freedreno/common/freedreno_devices.py
+++ b/src/freedreno/common/freedreno_devices.py
@@ -425,6 +425,7 @@ a6xx_gen4 = A6XXProps(
         has_lpac = True,
         has_legacy_pipeline_shading_rate = True,
         has_getfiberid = True,
+        has_movs = True,
         has_dp2acc = True,
         has_dp4acc = True,
         enable_lrz_fast_clear = True,
@@ -900,6 +901,7 @@ a7xx_base = A6XXProps(
         has_sample_locations = True,
         has_lpac = True,
         has_getfiberid = True,
+        has_movs = True,
         has_dp2acc = True,
         has_dp4acc = True,
         enable_lrz_fast_clear = True,
diff --git a/src/freedreno/ir3/ir3.h b/src/freedreno/ir3/ir3.h
index 32b97f55712..211bbb62d41 100644
--- a/src/freedreno/ir3/ir3.h
+++ b/src/freedreno/ir3/ir3.h
@@ -2657,6 +2657,29 @@ ir3_COV_rpt(struct ir3_builder *build, unsigned nrpt,
    return dst;
 }
 
+static inline struct ir3_instruction *
+ir3_MOVS(struct ir3_builder *build, struct ir3_instruction *src,
+         struct ir3_instruction *invocation, type_t type)
+{
+   bool use_a0 = writes_addr0(invocation);
+   struct ir3_instruction *instr =
+      ir3_build_instr(build, OPC_MOVS, 1, use_a0 ? 1 : 2);
+   ir3_register_flags flags = type_flags(type);
+
+   __ssa_dst(instr)->flags |= flags | IR3_REG_SHARED;
+   __ssa_src(instr, src, 0);
+
+   if (use_a0) {
+      ir3_instr_set_address(instr, invocation);
+   } else {
+      __ssa_src(instr, invocation, 0);
+   }
+
+   instr->cat1.src_type = type;
+   instr->cat1.dst_type = type;
+   return instr;
+}
+
 static inline struct ir3_instruction *
 ir3_MOVMSK(struct ir3_builder *build, unsigned components)
 {
diff --git a/src/freedreno/ir3/ir3_compiler.c b/src/freedreno/ir3/ir3_compiler.c
index 753368b4da0..8b09f199ce9 100644
--- a/src/freedreno/ir3/ir3_compiler.c
+++ b/src/freedreno/ir3/ir3_compiler.c
@@ -217,6 +217,7 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
 
       compiler->has_getfiberid = dev_info->a6xx.has_getfiberid;
       compiler->mov_half_shared_quirk = dev_info->a6xx.mov_half_shared_quirk;
+      compiler->has_movs = dev_info->a6xx.has_movs;
 
       compiler->has_dp2acc = dev_info->a6xx.has_dp2acc;
       compiler->has_dp4acc = dev_info->a6xx.has_dp4acc;
diff --git a/src/freedreno/ir3/ir3_compiler.h b/src/freedreno/ir3/ir3_compiler.h
index 00006ffb3bb..acbac4417c4 100644
--- a/src/freedreno/ir3/ir3_compiler.h
+++ b/src/freedreno/ir3/ir3_compiler.h
@@ -210,6 +210,9 @@ struct ir3_compiler {
    /* Whether half register shared->non-shared moves are broken. */
    bool mov_half_shared_quirk;
 
+   /* Whether movs is supported for subgroupBroadcast. */
+   bool has_movs;
+
    /* True if the shfl instruction is supported. Needed for subgroup rotate and
     * (more efficient) shuffle.
     */
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 4d3e65e3687..5d859fb0366 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -2604,7 +2604,21 @@ apply_mov_half_shared_quirk(struct ir3_context *ctx,
     * adding an extra mov here so that the original destination stays full.
     */
    if (src->dsts[0]->flags & IR3_REG_HALF) {
-      dst = ir3_MOV(&ctx->build, dst, TYPE_U32);
+      if (dst->opc == OPC_MOVS) {
+         /* For movs, we have to fix up its dst_type and then convert back to
+          * its original dst_type. Note that this might generate movs.u8u32
+          * which doesn't work correctly, but since we convert back using
+          * cov.u32u8, the end result will be correct.
+          */
+         type_t dst_type = dst->cat1.dst_type;
+         assert(type_uint(dst_type));
+
+         dst->cat1.dst_type = TYPE_U32;
+         dst->dsts[0]->flags &= ~IR3_REG_HALF;
+         dst = ir3_COV(&ctx->build, dst, dst->cat1.dst_type, dst_type);
+      } else {
+         dst = ir3_MOV(&ctx->build, dst, TYPE_U32);
+      }
       if (!ctx->compiler->has_scalar_alu)
          dst->dsts[0]->flags &= ~IR3_REG_SHARED;
    }
@@ -3166,6 +3180,25 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
       break;
    }
 
+   case nir_intrinsic_read_invocation: {
+      struct ir3_instruction *const *srcs = ir3_get_src(ctx, &intr->src[0]);
+      nir_src *nir_invocation = &intr->src[1];
+      struct ir3_instruction *invocation = ir3_get_src(ctx, nir_invocation)[0];
+
+      if (!nir_src_is_const(*nir_invocation)) {
+         invocation = ir3_get_addr0(ctx, invocation, 1);
+      }
+
+      for (unsigned i = 0; i < intr->def.num_components; i++) {
+         dst[i] = ir3_MOVS(b, srcs[i], invocation,
+                           type_uint_size(intr->def.bit_size));
+         dst[i] = apply_mov_half_shared_quirk(ctx, srcs[i], dst[i]);
+      }
+
+      create_rpt = true;
+      break;
+   }
+
    case nir_intrinsic_read_first_invocation: {
       struct ir3_instruction *src = ir3_get_src(ctx, &intr->src[0])[0];
       dst[0] = ir3_READ_FIRST_MACRO(b, src, 0);
diff --git a/src/freedreno/ir3/ir3_lower_subgroups.c b/src/freedreno/ir3/ir3_lower_subgroups.c
index 0aa7561ac1b..73bd9b9e47e 100644
--- a/src/freedreno/ir3/ir3_lower_subgroups.c
+++ b/src/freedreno/ir3/ir3_lower_subgroups.c
@@ -745,6 +745,8 @@ ir3_nir_lower_subgroups_filter(const nir_instr *instr, const void *data)
       default:
          return intrin->def.num_components > 1;
       }
+   case nir_intrinsic_read_invocation:
+      return !compiler->has_movs;
    default:
       return true;
    }