From 11b6cd2f2ced378bbca90dde8fcd635ef28532ae Mon Sep 17 00:00:00 2001 From: Faith Ekstrand Date: Thu, 15 Jan 2026 03:07:28 -0500 Subject: [PATCH] nir,pan: Rework the panfrost tile load intrinsic Instead of making it explicitly about outputs, this switches it to being a NIR version of LD_TILE. It means we have to do a bit of work in NIR and add a builder helper but the end result is something much more versatile. Reviewed-by: Christoph Pillmayer Part-of: --- src/compiler/nir/nir_divergence_analysis.c | 4 +- src/compiler/nir/nir_intrinsics.py | 26 +++++----- .../nir/nir_lower_non_uniform_access.c | 4 +- src/compiler/nir/nir_print.c | 4 +- .../compiler/bifrost/bifrost_compile.c | 47 ++++++------------- src/panfrost/compiler/pan_nir.h | 34 ++++++++++++++ src/panfrost/lib/pan_blend.c | 9 ++-- ...anvk_vX_nir_lower_input_attachment_loads.c | 25 ++++++---- 8 files changed, 91 insertions(+), 62 deletions(-) diff --git a/src/compiler/nir/nir_divergence_analysis.c b/src/compiler/nir/nir_divergence_analysis.c index 9d6a0dcd0ed..f584f595af4 100644 --- a/src/compiler/nir/nir_divergence_analysis.c +++ b/src/compiler/nir/nir_divergence_analysis.c @@ -1011,8 +1011,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state) case nir_intrinsic_bvh_stack_rtn_amd: case nir_intrinsic_cmat_load_shared_nv: case nir_intrinsic_cmat_mov_transpose_nv: - case nir_intrinsic_load_converted_output_pan: - case nir_intrinsic_load_readonly_output_pan: + case nir_intrinsic_load_tile_pan: + case nir_intrinsic_load_tile_res_pan: case nir_intrinsic_load_cumulative_coverage_pan: case nir_intrinsic_load_blend_input_pan: case nir_intrinsic_atest_pan: diff --git a/src/compiler/nir/nir_intrinsics.py b/src/compiler/nir/nir_intrinsics.py index dbe12c6a49a..133d892a2ae 100644 --- a/src/compiler/nir/nir_intrinsics.py +++ b/src/compiler/nir/nir_intrinsics.py @@ -1671,18 +1671,22 @@ intrinsic("load_frag_coord_zw_pan", [2], dest_comp=1, indices=[COMPONENT], flags # src[] = { sampler_index } 
load("sampler_lod_parameters", [1], flags=[CAN_ELIMINATE, CAN_REORDER]) -# Like load_output but using a specified render target and conversion descriptor -# src[] = { target, sample, conversion } -# target must be in the [0..7] range when io_semantics.location is FRAG_RESULT_DATA0 -# and is ignored otherwise -load("converted_output_pan", [1, 1, 1], indices=[ACCESS, DEST_TYPE, IO_SEMANTICS], flags=[CAN_ELIMINATE]) +# Maps to LD_TILE +# +# rt must be in the [0..7] range when io_semantics.location is not +# GL_FRAG_RESULT_DEPTH or GL_FRAG_RESULT_STENCIL +# +# src[] = { rt_sample_pixel, coverage_offset, conversion } +load("tile_pan", [1, 1, 1], indices=[ACCESS, DEST_TYPE, IO_SEMANTICS], + flags=[CAN_ELIMINATE]) -# Like converted_output_pan but for case where the output is never written by the shader -# This is used to relax waits on tile-buffer accesses and the target is read-only -# src[] = { target, sample, conversion } -# target must be in the [0..7] range when io_semantics.location is FRAG_RESULT_DATA0 -# and is ignored otherwise -load("readonly_output_pan", [1, 1, 1], indices=[ACCESS, DEST_TYPE, IO_SEMANTICS], flags=[CAN_ELIMINATE]) +# Like load_tile_pan except it relies on resource tracking through +# resource_read/write_mask for dependencies instead of ensuring absolute +# pixel ordering like load_tile_pan does. 
+# +# src[] = { rt_sample_pixel, coverage_offset, conversion } +load("tile_res_pan", [1, 1, 1], indices=[ACCESS, DEST_TYPE, IO_SEMANTICS], + flags=[CAN_ELIMINATE, CAN_REORDER]) # Load converted memory given an address and a conversion descriptor # src[] = { address, conversion } diff --git a/src/compiler/nir/nir_lower_non_uniform_access.c b/src/compiler/nir/nir_lower_non_uniform_access.c index 7185dcc8973..bdaffcb5508 100644 --- a/src/compiler/nir/nir_lower_non_uniform_access.c +++ b/src/compiler/nir/nir_lower_non_uniform_access.c @@ -412,8 +412,8 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl, progress = true; break; - case nir_intrinsic_load_readonly_output_pan: - case nir_intrinsic_load_converted_output_pan: + case nir_intrinsic_load_tile_pan: + case nir_intrinsic_load_tile_res_pan: /* render target can be nonuniform, but not conversion descriptor */ if ((options->types & nir_lower_non_uniform_image_access) && lower_non_uniform_access_intrin(&state, intrin, 2, nir_lower_non_uniform_image_access)) diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c index 393f3aeb6fd..1d5a00fad14 100644 --- a/src/compiler/nir/nir_print.c +++ b/src/compiler/nir/nir_print.c @@ -1475,8 +1475,8 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state) case nir_intrinsic_load_output: case nir_intrinsic_load_per_vertex_output: - case nir_intrinsic_load_converted_output_pan: - case nir_intrinsic_load_readonly_output_pan: + case nir_intrinsic_load_tile_pan: + case nir_intrinsic_load_tile_res_pan: case nir_intrinsic_load_per_primitive_output: case nir_intrinsic_store_output: case nir_intrinsic_store_per_primitive_output: diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index 77be04e6678..3fef87a824b 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -1928,40 +1928,19 @@ bi_emit_ld_tile(bi_builder *b, 
nir_intrinsic_instr *instr) bi_index dest = bi_def_index(&instr->def); nir_alu_type T = nir_intrinsic_dest_type(instr); nir_io_semantics sem = nir_intrinsic_io_semantics(instr); - bool is_zs = bi_is_zs(sem.location); enum bi_register_format regfmt = bi_reg_fmt_for_nir(T); unsigned size = instr->def.bit_size; unsigned nr = instr->num_components; - unsigned target = 0, sample = 0; - if (sem.location == FRAG_RESULT_DEPTH) { - target = 255; - } else if (sem.location == FRAG_RESULT_STENCIL) { - target = 254; - } else if (nir_src_is_const(instr->src[0])) { - target = nir_src_as_uint(instr->src[0]); - assert(target < 8); - } + bi_index pi = bi_src_index(&instr->src[0]); + bi_index coverage = bi_src_index(&instr->src[1]); + bi_index conversion = bi_src_index(&instr->src[2]); - if (nir_src_is_const(instr->src[1])) - sample = nir_src_as_uint(instr->src[1]); + bi_instr *I = bi_ld_tile_to(b, dest, pi, coverage, conversion, + regfmt, nr - 1); + I->z_stencil = bi_is_zs(sem.location); - bi_index pi = bi_pixel_indices(b, target, sample); - - if (!is_zs && !nir_src_is_const(instr->src[0])) - pi = bi_lshift_or(b, 32, bi_src_index(&instr->src[0]), pi, bi_imm_u8(8)); - - if (!nir_src_is_const(instr->src[1])) { - pi = bi_mux_i32(b, bi_src_index(&instr->src[1]), pi, - bi_imm_u32(0x1f), BI_MUX_BIT); - } - - bi_instr *I = bi_ld_tile_to(b, dest, pi, bi_coverage(b), - bi_src_index(&instr->src[2]), regfmt, nr - 1); - if (is_zs) - I->z_stencil = true; - - if (instr->intrinsic == nir_intrinsic_load_readonly_output_pan) + if (instr->intrinsic == nir_intrinsic_load_tile_res_pan) I->wait_resource = true; bi_emit_cached_split(b, dest, size * nr); @@ -2378,8 +2357,8 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) bi_emit_store_converted_mem(b, instr); break; - case nir_intrinsic_load_converted_output_pan: - case nir_intrinsic_load_readonly_output_pan: + case nir_intrinsic_load_tile_pan: + case nir_intrinsic_load_tile_res_pan: bi_emit_ld_tile(b, instr); break; @@ -6184,9 +6163,11 @@ 
bi_lower_load_output(nir_builder *b, nir_intrinsic_instr *intr, nir_def *conversion = nir_load_rt_conversion_pan( b, .base = rt, .src_type = nir_intrinsic_dest_type(intr)); - nir_def *lowered = nir_load_converted_output_pan( - b, intr->def.num_components, intr->def.bit_size, nir_imm_int(b, rt), - nir_imm_int(b, 0), conversion, .dest_type = nir_intrinsic_dest_type(intr), + nir_def *lowered = nir_load_tile_pan( + b, intr->def.num_components, intr->def.bit_size, + pan_nir_tile_location_sample(b, loc, nir_imm_int(b, 0)), + pan_nir_tile_default_coverage(b), + conversion, .dest_type = nir_intrinsic_dest_type(intr), .io_semantics = nir_intrinsic_io_semantics(intr)); nir_def_rewrite_uses(&intr->def, lowered); diff --git a/src/panfrost/compiler/pan_nir.h b/src/panfrost/compiler/pan_nir.h index 28c41d3808f..174447646d1 100644 --- a/src/panfrost/compiler/pan_nir.h +++ b/src/panfrost/compiler/pan_nir.h @@ -26,10 +26,44 @@ #define __PAN_NIR_H__ #include "nir.h" +#include "nir_builder.h" #include "pan_compiler.h" struct util_format_description; +static inline nir_def * +pan_nir_tile_rt_sample(nir_builder *b, nir_def *rt, nir_def *sample) +{ + /* y = 255 means "current pixel" */ + return nir_pack_32_4x8_split(b, nir_u2u8(b, sample), + nir_u2u8(b, rt), + nir_imm_intN_t(b, 0, 8), + nir_imm_intN_t(b, 255, 8)); +} + +static inline nir_def * +pan_nir_tile_location_sample(nir_builder *b, gl_frag_result location, + nir_def *sample) +{ + uint8_t rt; + if (location == FRAG_RESULT_DEPTH) { + rt = 255; + } else if (location == FRAG_RESULT_STENCIL) { + rt = 254; + } else { + assert(location >= FRAG_RESULT_DATA0); + rt = location - FRAG_RESULT_DATA0; + } + + return pan_nir_tile_rt_sample(b, nir_imm_int(b, rt), sample); +} + +static inline nir_def * +pan_nir_tile_default_coverage(nir_builder *b) +{ + return nir_iand_imm(b, nir_load_cumulative_coverage_pan(b), 0x1f); +} + bool pan_nir_lower_store_component(nir_shader *shader); bool pan_nir_lower_vertex_id(nir_shader *shader); diff --git 
a/src/panfrost/lib/pan_blend.c b/src/panfrost/lib/pan_blend.c index 8117065e7a9..97d9b8def83 100644 --- a/src/panfrost/lib/pan_blend.c +++ b/src/panfrost/lib/pan_blend.c @@ -812,10 +812,13 @@ lower_rt_intrin(nir_builder *b, nir_intrinsic_instr *intr, void *data) b->cursor = nir_after_instr(&intr->instr); - nir_def *lowered = nir_load_converted_output_pan( + nir_def *sample_id = + nr_samples > 1 ? nir_load_sample_id(b) : nir_imm_int(b, 0); + + nir_def *lowered = nir_load_tile_pan( b, intr->def.num_components, intr->def.bit_size, - nir_imm_int(b, rt), - nr_samples > 1 ? nir_load_sample_id(b) : nir_imm_int(b, 0), + pan_nir_tile_rt_sample(b, nir_imm_int(b, rt), sample_id), + pan_nir_tile_default_coverage(b), nir_imm_int(b, blend_desc >> 32), .dest_type = dest_type, .io_semantics = io); diff --git a/src/panfrost/vulkan/panvk_vX_nir_lower_input_attachment_loads.c b/src/panfrost/vulkan/panvk_vX_nir_lower_input_attachment_loads.c index 7f8846e2af5..3d02e2ec9f0 100644 --- a/src/panfrost/vulkan/panvk_vX_nir_lower_input_attachment_loads.c +++ b/src/panfrost/vulkan/panvk_vX_nir_lower_input_attachment_loads.c @@ -34,6 +34,7 @@ #include "nir.h" #include "nir_builder.h" +#include "pan_nir.h" struct panvk_lower_input_attachment_load_ctx { uint32_t ro_color_mask; @@ -161,16 +162,20 @@ lower_input_attachment_load(nir_builder *b, nir_intrinsic_instr *intr, iosem.location = FRAG_RESULT_DATA0; nir_push_if(b, is_read_only); { - load_ro_color = nir_load_readonly_output_pan( - b, intr->def.num_components, intr->def.bit_size, target, - intr->src[2].ssa, conversion, .dest_type = dest_type, + load_ro_color = nir_load_tile_res_pan( + b, intr->def.num_components, intr->def.bit_size, + pan_nir_tile_rt_sample(b, target, intr->src[2].ssa), + pan_nir_tile_default_coverage(b), + conversion, .dest_type = dest_type, .access = nir_intrinsic_access(intr), .io_semantics = iosem); } nir_push_else(b, NULL); { - load_rw_color = nir_load_converted_output_pan( - b, intr->def.num_components, 
intr->def.bit_size, target, - intr->src[2].ssa, conversion, .dest_type = dest_type, + load_rw_color = nir_load_tile_pan( + b, intr->def.num_components, intr->def.bit_size, + pan_nir_tile_rt_sample(b, target, intr->src[2].ssa), + pan_nir_tile_default_coverage(b), + conversion, .dest_type = dest_type, .access = nir_intrinsic_access(intr), .io_semantics = iosem); } nir_pop_if(b, NULL); @@ -201,9 +206,11 @@ lower_input_attachment_load(nir_builder *b, nir_intrinsic_instr *intr, iosem.location = dest_type == nir_type_float32 ? FRAG_RESULT_DEPTH : FRAG_RESULT_STENCIL; target = nir_imm_int(b, 0); - load_zs = nir_load_converted_output_pan( - b, intr->def.num_components, intr->def.bit_size, target, - intr->src[2].ssa, conversion, .dest_type = dest_type, + load_zs = nir_load_tile_pan( + b, intr->def.num_components, intr->def.bit_size, + pan_nir_tile_location_sample(b, iosem.location, intr->src[2].ssa), + pan_nir_tile_default_coverage(b), + conversion, .dest_type = dest_type, .access = nir_intrinsic_access(intr), .io_semantics = iosem); /* If we loaded the stencil value, the upper 24 bits might contain