diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py
index 02f8718544e..396d165558a 100644
--- a/src/compiler/nir/nir_intrinsics.py
+++ b/src/compiler/nir/nir_intrinsics.py
@@ -1286,9 +1286,9 @@ intrinsic("cmat_copy", src_comp=[-1, -1])
 # The float versions are not handled because those are not supported
 # by the backend.
 store("ssbo_ir3", [1, 1, 1],
-      indices=[WRITE_MASK, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
+      indices=[BASE, WRITE_MASK, ACCESS, ALIGN_MUL, ALIGN_OFFSET])
 load("ssbo_ir3", [1, 1, 1],
-     indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
+     indices=[BASE, ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE])
 intrinsic("ssbo_atomic_ir3", src_comp=[1, 1, 1, 1], dest_comp=1,
           indices=[ACCESS, ATOMIC_OP])
 intrinsic("ssbo_atomic_swap_ir3", src_comp=[1, 1, 1, 1, 1], dest_comp=1,
diff --git a/src/freedreno/ir3/ir3_a6xx.c b/src/freedreno/ir3/ir3_a6xx.c
index ed4c5c51970..36394a3072e 100644
--- a/src/freedreno/ir3/ir3_a6xx.c
+++ b/src/freedreno/ir3/ir3_a6xx.c
@@ -37,6 +37,26 @@
  * encoding compared to a4xx/a5xx.
  */
 
+/* Split the offset of an SSBO load/store into a register and an immediate.
+ *
+ * With has_ssbo_imm_offsets the split is delegated to ir3_lower_imm_offset()
+ * using a 7-bit immediate. Otherwise the intrinsic must have a zero BASE:
+ * the offset source is used as-is and the immediate is 0.
+ */
+static void
+lower_ssbo_offset(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+                  nir_src *offset_src,
+                  struct ir3_instruction **offset, unsigned *imm_offset)
+{
+   if (ctx->compiler->has_ssbo_imm_offsets) {
+      ir3_lower_imm_offset(ctx, intr, offset_src, 7, offset, imm_offset);
+   } else {
+      assert(nir_intrinsic_base(intr) == 0);
+      *offset = ir3_get_src(ctx, offset_src)[0];
+      *imm_offset = 0;
+   }
+}
+
 /* src[] = { buffer_index, offset }. No const_index */
 static void
 emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
@@ -45,9 +65,9 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr,
    struct ir3_block *b = ctx->block;
    struct ir3_instruction *offset;
    struct ir3_instruction *ldib;
-   unsigned imm_offset_val = 0;
+   unsigned imm_offset_val;
 
-   offset = ir3_get_src(ctx, &intr->src[2])[0];
+   lower_ssbo_offset(ctx, intr, &intr->src[2], &offset, &imm_offset_val);
    struct ir3_instruction *imm_offset = create_immed(b, imm_offset_val);
 
    ldib = ir3_LDIB(b, ir3_ssbo_to_ibo(ctx, intr->src[0]), 0, offset, 0,
@@ -78,15 +98,15 @@ emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr)
    struct ir3_instruction *stib, *val, *offset;
    unsigned wrmask = nir_intrinsic_write_mask(intr);
    unsigned ncomp = ffs(~wrmask) - 1;
-   unsigned imm_offset_val = 0;
+   unsigned imm_offset_val;
 
    assert(wrmask == BITFIELD_MASK(intr->num_components));
 
    /* src0 is offset, src1 is immediate offset, src2 is value:
     */
    val = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[0]), ncomp);
 
-   offset = ir3_get_src(ctx, &intr->src[3])[0];
+   lower_ssbo_offset(ctx, intr, &intr->src[3], &offset, &imm_offset_val);
    struct ir3_instruction *imm_offset = create_immed(b, imm_offset_val);
 
    stib = ir3_STIB(b, ir3_ssbo_to_ibo(ctx, intr->src[1]), 0, offset, 0,
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index 75c0716d0c3..b4d4c8b5ef5 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -1603,14 +1603,16 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
    }
 
    struct ir3_block *b = ctx->block;
-   struct ir3_instruction *offset = ir3_get_src(ctx, &intr->src[2])[0];
+   nir_src *offset_src = &intr->src[2];
    struct ir3_instruction *coords = NULL;
    unsigned imm_offset = 0;
 
    if (ctx->compiler->has_isam_v) {
-      coords = offset;
+      ir3_lower_imm_offset(ctx, intr, offset_src, 8, &coords, &imm_offset);
    } else {
-      coords = ir3_collect(b, offset, create_immed(b, 0));
+      assert(nir_intrinsic_base(intr) == 0);
+      coords =
+         ir3_collect(b, ir3_get_src(ctx, offset_src)[0], create_immed(b, 0));
    }
 
    struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], false);
@@ -1624,6 +1626,10 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx,
 
    if (ctx->compiler->has_isam_v) {
       sam->flags |= (IR3_INSTR_V | IR3_INSTR_INV_1D);
+
+      if (imm_offset) {
+         sam->flags |= IR3_INSTR_IMM_OFFSET;
+      }
    }
 
    ir3_handle_nonuniform(sam, intr);
diff --git a/src/freedreno/ir3/ir3_context.c b/src/freedreno/ir3/ir3_context.c
index 8a228723577..be94323f016 100644
--- a/src/freedreno/ir3/ir3_context.c
+++ b/src/freedreno/ir3/ir3_context.c
@@ -31,6 +31,7 @@
 #include "ir3_shader.h"
 #include "nir.h"
 #include "nir_intrinsics_indices.h"
+#include "util/u_math.h"
 
 struct ir3_context *
 ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader,
@@ -673,3 +674,39 @@ ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr, int n,
     */
    array_insert(block, block->keeps, mov);
 }
+
+/* Split an intrinsic's offset into a register part and an immediate part.
+ *
+ * BASE of intr must be non-negative and fit in imm_offset_bits bits. When
+ * offset_src is constant, BASE is folded into it and the sum is split into
+ * an immediate-valued *offset (the sum rounded down to a multiple of
+ * 1 << imm_offset_bits) and the remainder in *imm_offset. Otherwise *offset
+ * is the value of offset_src and *imm_offset is BASE.
+ */
+void
+ir3_lower_imm_offset(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+                     nir_src *offset_src, unsigned imm_offset_bits,
+                     struct ir3_instruction **offset, unsigned *imm_offset)
+{
+   nir_const_value *nir_const_offset = nir_src_as_const_value(*offset_src);
+   int base = nir_intrinsic_base(intr);
+
+   assert(imm_offset_bits < 32);
+   unsigned imm_offset_bound = 1u << imm_offset_bits;
+   assert(base >= 0 && (unsigned)base < imm_offset_bound);
+
+   if (nir_const_offset) {
+      /* If both the offset and the base (immed offset) are constants, lower the
+       * offset to a multiple of the bound and the immed offset to the
+       * remainder. This ensures that the offset register can often be reused
+       * among multiple contiguous accesses.
+       */
+      uint32_t full_offset = base + nir_const_offset->u32;
+      *offset =
+         create_immed(ctx->block, ROUND_DOWN_TO(full_offset, imm_offset_bound));
+      *imm_offset = full_offset % imm_offset_bound;
+   } else {
+      *offset = ir3_get_src(ctx, offset_src)[0];
+      *imm_offset = base;
+   }
+}
diff --git a/src/freedreno/ir3/ir3_context.h b/src/freedreno/ir3/ir3_context.h
index fbecb2c95e5..8a803454ca9 100644
--- a/src/freedreno/ir3/ir3_context.h
+++ b/src/freedreno/ir3/ir3_context.h
@@ -255,6 +255,10 @@ struct ir3_instruction *ir3_create_array_load(struct ir3_context *ctx,
 void ir3_create_array_store(struct ir3_context *ctx, struct ir3_array *arr,
                             int n, struct ir3_instruction *src,
                             struct ir3_instruction *address);
+void ir3_lower_imm_offset(struct ir3_context *ctx, nir_intrinsic_instr *intr,
+                          nir_src *offset_src, unsigned imm_offset_bits,
+                          struct ir3_instruction **offset,
+                          unsigned *imm_offset);
 
 static inline type_t
 utype_for_size(unsigned bit_size)