From 91f19bcbe072e663c1c4e9cd81f12ed0824a30e9 Mon Sep 17 00:00:00 2001 From: Connor Abbott Date: Fri, 15 Mar 2024 09:54:54 -0400 Subject: [PATCH] ir3: Plumb through two-dimensional UAV loads There is native support for D3D-style untyped UAVs, which are an unsized array of "records." This will be needed for acceleration structures, because normal SSBO descriptors aren't large enough to cover all the 128-byte instance descriptors for the maximum number of instances (2**24). Part-of: --- src/compiler/nir/nir_divergence_analysis.c | 1 + src/compiler/nir/nir_intrinsics.py | 8 +++ src/freedreno/ir3/ir3_a6xx.c | 37 ++++++++-- src/freedreno/ir3/ir3_compiler_nir.c | 73 ++++++++++++++------ src/freedreno/ir3/ir3_context.h | 3 + src/freedreno/ir3/ir3_nir_lower_io_offsets.c | 39 +++++++++++ src/freedreno/vulkan/tu_shader.cc | 1 + 7 files changed, 135 insertions(+), 27 deletions(-) diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 7b09cfd09ee..59796b7cd41 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -550,6 +550,7 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_load_ssbo: case nir_intrinsic_load_ssbo_ir3: + case nir_intrinsic_load_uav_ir3: is_divergent = (src_divergent(instr->src[0], state) && (nir_intrinsic_access(instr) & ACCESS_NON_UNIFORM)) || src_divergent(instr->src[1], state) || diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index fa0fcc1288c..d0255752ba0 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1355,6 +1355,14 @@ intrinsic("ssbo_atomic_ir3", src_comp=[1, 1, 1, 1], dest_comp=1, intrinsic("ssbo_atomic_swap_ir3", src_comp=[1, 1, 1, 1, 1], dest_comp=1, indices=[ACCESS, ATOMIC_OP]) +# IR3-specific intrinsic for UAVs, which are like SSBOs but with a source +# for which "record" to access as well as the offset within the record, instead +# of just an offset. The record stride is part of the descriptor. +# Currently this is just used for the ray-tracing TLAS descriptor, where a +# normal SSBO wouldn't have enough range. +load("uav_ir3", [1, 2], + indices=[ACCESS, ALIGN_MUL, ALIGN_OFFSET], flags=[CAN_ELIMINATE]) + # System values for freedreno geometry shaders. system_value("vs_primitive_stride_ir3", 1) system_value("vs_vertex_stride_ir3", 1) diff --git a/src/freedreno/ir3/ir3_a6xx.c b/src/freedreno/ir3/ir3_a6xx.c index 2a57c2e48ff..7dc22468bcb 100644 --- a/src/freedreno/ir3/ir3_a6xx.c +++ b/src/freedreno/ir3/ir3_a6xx.c @@ -33,24 +33,22 @@ lower_ssbo_offset(struct ir3_context *ctx, nir_intrinsic_instr *intr, } } -/* src[] = { buffer_index, offset }. No const_index */ static void -emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, - struct ir3_instruction **dst) +emit_load_uav(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction *offset, + unsigned imm_offset_val, + struct ir3_instruction **dst) { struct ir3_builder *b = &ctx->build; - struct ir3_instruction *offset; struct ir3_instruction *ldib; - unsigned imm_offset_val; - lower_ssbo_offset(ctx, intr, &intr->src[2], &offset, &imm_offset_val); struct ir3_instruction *imm_offset = create_immed(b, imm_offset_val); ldib = ir3_LDIB(b, ir3_ssbo_to_ibo(ctx, intr->src[0]), 0, offset, 0, imm_offset, 0); ldib->dsts[0]->wrmask = MASK(intr->num_components); ldib->cat6.iim_val = intr->num_components; - ldib->cat6.d = 1; + ldib->cat6.d = reg_elems(offset->dsts[0]); switch (intr->def.bit_size) { case 8: /* This encodes the 8-bit SSBO load and matches blob's encoding of @@ -83,6 +81,30 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, ir3_split_dest(b, dst, ldib, 0, intr->num_components); } +/* src[] = { buffer_index, offset }. No const_index */ +static void +emit_intrinsic_load_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_instruction *offset; + unsigned imm_offset_val; + + lower_ssbo_offset(ctx, intr, &intr->src[2], &offset, &imm_offset_val); + emit_load_uav(ctx, intr, offset, imm_offset_val, dst); +} + +static void +emit_intrinsic_load_uav(struct ir3_context *ctx, nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + struct ir3_builder *b = &ctx->build; + struct ir3_instruction *offset; + + offset = ir3_create_collect(b, ir3_get_src(ctx, &intr->src[1]), 2); + + emit_load_uav(ctx, intr, offset, 0, dst); +} + /* src[] = { value, block_index, offset }. const_index[] = { write_mask } */ static void emit_intrinsic_store_ssbo(struct ir3_context *ctx, nir_intrinsic_instr *intr) @@ -576,6 +598,7 @@ emit_intrinsic_atomic_global(struct ir3_context *ctx, nir_intrinsic_instr *intr) const struct ir3_context_funcs ir3_a6xx_funcs = { .emit_intrinsic_load_ssbo = emit_intrinsic_load_ssbo, + .emit_intrinsic_load_uav = emit_intrinsic_load_uav, .emit_intrinsic_store_ssbo = emit_intrinsic_store_ssbo, .emit_intrinsic_atomic_ssbo = emit_intrinsic_atomic_ssbo, .emit_intrinsic_load_image = emit_intrinsic_load_image, diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c index 4c4a7b7ba6f..6cf4dffde3b 100644 --- a/src/freedreno/ir3/ir3_compiler_nir.c +++ b/src/freedreno/ir3/ir3_compiler_nir.c @@ -1957,6 +1957,39 @@ get_bindless_samp_src(struct ir3_context *ctx, nir_src *tex, return info; } +static void +emit_readonly_load_uav(struct ir3_context *ctx, + nir_intrinsic_instr *intr, + nir_src *index, + struct ir3_instruction *coords, + unsigned imm_offset, + bool uav_load, + struct ir3_instruction **dst) +{ + struct ir3_builder *b = &ctx->build; + struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, index, false); + + unsigned num_components = intr->def.num_components; + struct ir3_instruction *sam = + emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->def.bit_size), + MASK(num_components), coords, create_immed(b, imm_offset)); + + ir3_handle_nonuniform(sam, intr); + + sam->barrier_class = IR3_BARRIER_BUFFER_R; + sam->barrier_conflict = IR3_BARRIER_BUFFER_W; + + ir3_split_dest(b, dst, sam, 0, num_components); + + if (ctx->compiler->has_isam_v && !uav_load) { + sam->flags |= (IR3_INSTR_V | IR3_INSTR_INV_1D); + + if (imm_offset) { + sam->flags |= IR3_INSTR_IMM_OFFSET; + } + } +} + /* src[] = { buffer_index, offset }. No const_index */ static void emit_intrinsic_load_ssbo(struct ir3_context *ctx, @@ -1987,29 +2020,26 @@ emit_intrinsic_load_ssbo(struct ir3_context *ctx, ir3_collect(b, ir3_get_src(ctx, offset_src)[0], create_immed(b, 0)); } - struct tex_src_info info = get_image_ssbo_samp_tex_src(ctx, &intr->src[0], false); + emit_readonly_load_uav(ctx, intr, &intr->src[0], coords, imm_offset, false, dst); +} - unsigned num_components = intr->def.num_components; - assert(num_components == 1 || ctx->compiler->has_isam_v); - - struct ir3_instruction *sam = - emit_sam(ctx, OPC_ISAM, info, utype_for_size(intr->def.bit_size), - MASK(num_components), coords, create_immed(b, imm_offset)); - - if (ctx->compiler->has_isam_v) { - sam->flags |= (IR3_INSTR_V | IR3_INSTR_INV_1D); - - if (imm_offset) { - sam->flags |= IR3_INSTR_IMM_OFFSET; - } +static void +emit_intrinsic_load_uav(struct ir3_context *ctx, + nir_intrinsic_instr *intr, + struct ir3_instruction **dst) +{ + /* Note: isam currently can't handle vectorized loads/stores */ + if (!(nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) || + intr->def.num_components > 1 || + !ctx->compiler->has_isam_ssbo) { + ctx->funcs->emit_intrinsic_load_uav(ctx, intr, dst); + return; } - ir3_handle_nonuniform(sam, intr); - - sam->barrier_class = IR3_BARRIER_BUFFER_R; - sam->barrier_conflict = IR3_BARRIER_BUFFER_W; - - ir3_split_dest(b, dst, sam, 0, num_components); + struct ir3_builder *b = &ctx->build; + struct ir3_instruction *coords = + ir3_create_collect(b, ir3_get_src(ctx, &intr->src[1]), 2); + emit_readonly_load_uav(ctx, intr, &intr->src[0], coords, 0, true, dst); } static void @@ -2809,6 +2839,9 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr) case nir_intrinsic_load_ssbo_ir3: emit_intrinsic_load_ssbo(ctx, intr, dst); break; + case nir_intrinsic_load_uav_ir3: + emit_intrinsic_load_uav(ctx, intr, dst); + break; case nir_intrinsic_store_ssbo_ir3: ctx->funcs->emit_intrinsic_store_ssbo(ctx, intr); break; diff --git a/src/freedreno/ir3/ir3_context.h b/src/freedreno/ir3/ir3_context.h index bcd1c258970..ecfacc830e5 100644 --- a/src/freedreno/ir3/ir3_context.h +++ b/src/freedreno/ir3/ir3_context.h @@ -153,6 +153,9 @@ struct ir3_context_funcs { void (*emit_intrinsic_load_ssbo)(struct ir3_context *ctx, nir_intrinsic_instr *intr, struct ir3_instruction **dst); + void (*emit_intrinsic_load_uav)(struct ir3_context *ctx, + nir_intrinsic_instr *intr, + struct ir3_instruction **dst); void (*emit_intrinsic_store_ssbo)(struct ir3_context *ctx, nir_intrinsic_instr *intr); struct ir3_instruction *(*emit_intrinsic_atomic_ssbo)( diff --git a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c index 2eb2608518b..72a9337f059 100644 --- a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c +++ b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c @@ -154,6 +154,37 @@ create_shift(nir_builder *b, nir_def *offset, int shift) return nir_ushr_imm(b, offset, shift); } +/* isam doesn't have an "untyped" field, so it can only load 1 component at a + * time because our storage buffer descriptors use a 1-component format. + * Therefore we need to scalarize any loads that would use isam. + */ +static void +scalarize_load(nir_intrinsic_instr *intrinsic, nir_builder *b) +{ + struct nir_def *results[NIR_MAX_VEC_COMPONENTS]; + + nir_def *descriptor = intrinsic->src[0].ssa; + nir_def *offset = intrinsic->src[1].ssa; + nir_def *record = nir_channel(b, offset, 0); + nir_def *record_offset = nir_channel(b, offset, 1); + + for (unsigned i = 0; i < intrinsic->def.num_components; i++) { + results[i] = + nir_load_uav_ir3(b, 1, intrinsic->def.bit_size, descriptor, + nir_vec2(b, record, + nir_iadd_imm(b, record_offset, i)), + .access = nir_intrinsic_access(intrinsic), + .align_mul = nir_intrinsic_align_mul(intrinsic), + .align_offset = nir_intrinsic_align_offset(intrinsic)); + } + + nir_def *result = nir_vec(b, results, intrinsic->def.num_components); + + nir_def_rewrite_uses(&intrinsic->def, result); + + nir_instr_remove(&intrinsic->instr); +} + static bool lower_offset_for_ssbo(nir_intrinsic_instr *intrinsic, nir_builder *b, unsigned ir3_ssbo_opcode, uint8_t offset_src_idx) @@ -271,6 +302,14 @@ lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx) progress |= lower_offset_for_ssbo(intr, b, (unsigned)ir3_intrinsic, offset_src_idx); } + + if (intr->intrinsic == nir_intrinsic_load_uav_ir3 && + (nir_intrinsic_access(intr) & ACCESS_CAN_REORDER) && + ir3_bindless_resource(intr->src[0]) && + intr->num_components > 1) { + b->cursor = nir_before_instr(instr); + scalarize_load(intr, b); + } } return progress; diff --git a/src/freedreno/vulkan/tu_shader.cc b/src/freedreno/vulkan/tu_shader.cc index 6cae345c44e..b274c2f14f2 100644 --- a/src/freedreno/vulkan/tu_shader.cc +++ b/src/freedreno/vulkan/tu_shader.cc @@ -495,6 +495,7 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *instr, case nir_intrinsic_load_ubo: case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_uav_ir3: case nir_intrinsic_store_ssbo: case nir_intrinsic_ssbo_atomic: case nir_intrinsic_ssbo_atomic_swap: