nir,pan: Rework the panfrost tile load intrinsic

Instead of making it explicitly about outputs, this switches it to
being a NIR version of LD_TILE.  It means we have to do a bit of work in
NIR and add a builder helper but the end result is something much more
versatile.

Reviewed-by: Christoph Pillmayer <christoph.pillmayer@arm.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39367>
This commit is contained in:
Faith Ekstrand 2026-01-15 03:07:28 -05:00 committed by Marge Bot
parent 592963e941
commit 11b6cd2f2c
8 changed files with 91 additions and 62 deletions

View file

@ -1011,8 +1011,8 @@ visit_intrinsic(nir_intrinsic_instr *instr, struct divergence_state *state)
case nir_intrinsic_bvh_stack_rtn_amd:
case nir_intrinsic_cmat_load_shared_nv:
case nir_intrinsic_cmat_mov_transpose_nv:
case nir_intrinsic_load_converted_output_pan:
case nir_intrinsic_load_readonly_output_pan:
case nir_intrinsic_load_tile_pan:
case nir_intrinsic_load_tile_res_pan:
case nir_intrinsic_load_cumulative_coverage_pan:
case nir_intrinsic_load_blend_input_pan:
case nir_intrinsic_atest_pan:

View file

@ -1671,18 +1671,22 @@ intrinsic("load_frag_coord_zw_pan", [2], dest_comp=1, indices=[COMPONENT], flags
# src[] = { sampler_index }
load("sampler_lod_parameters", [1], flags=[CAN_ELIMINATE, CAN_REORDER])
# Like load_output but using a specified render target and conversion descriptor
# src[] = { target, sample, conversion }
# target must be in the [0..7] range when io_semantics.location is FRAG_RESULT_DATA0
# and is ignored otherwise
load("converted_output_pan", [1, 1, 1], indices=[ACCESS, DEST_TYPE, IO_SEMANTICS], flags=[CAN_ELIMINATE])
# Maps to LD_TILE
#
# rt must be in the [0..7] range when io_semantics.location is not
# FRAG_RESULT_DEPTH or FRAG_RESULT_STENCIL
#
# src[] = { rt_sample_pixel, coverage_offset, conversion }
load("tile_pan", [1, 1, 1], indices=[ACCESS, DEST_TYPE, IO_SEMANTICS],
flags=[CAN_ELIMINATE])
# Like converted_output_pan but for case where the output is never written by the shader
# This is used to relax waits on tile-buffer accesses and the target is read-only
# src[] = { target, sample, conversion }
# target must be in the [0..7] range when io_semantics.location is FRAG_RESULT_DATA0
# and is ignored otherwise
load("readonly_output_pan", [1, 1, 1], indices=[ACCESS, DEST_TYPE, IO_SEMANTICS], flags=[CAN_ELIMINATE])
# Like load_tile_pan except it relies on resource tracking through
# resource_read/write_mask for dependencies instead of ensuring absolute
# pixel ordering like load_tile_pan does.
#
# src[] = { rt_sample_pixel, coverage_offset, conversion }
load("tile_res_pan", [1, 1, 1], indices=[ACCESS, DEST_TYPE, IO_SEMANTICS],
flags=[CAN_ELIMINATE, CAN_REORDER])
# Load converted memory given an address and a conversion descriptor
# src[] = { address, conversion }

View file

@ -412,8 +412,8 @@ nir_lower_non_uniform_access_impl(nir_function_impl *impl,
progress = true;
break;
case nir_intrinsic_load_readonly_output_pan:
case nir_intrinsic_load_converted_output_pan:
case nir_intrinsic_load_tile_pan:
case nir_intrinsic_load_tile_res_pan:
/* render target can be nonuniform, but not conversion descriptor */
if ((options->types & nir_lower_non_uniform_image_access) &&
lower_non_uniform_access_intrin(&state, intrin, 2, nir_lower_non_uniform_image_access))

View file

@ -1475,8 +1475,8 @@ print_intrinsic_instr(nir_intrinsic_instr *instr, print_state *state)
case nir_intrinsic_load_output:
case nir_intrinsic_load_per_vertex_output:
case nir_intrinsic_load_converted_output_pan:
case nir_intrinsic_load_readonly_output_pan:
case nir_intrinsic_load_tile_pan:
case nir_intrinsic_load_tile_res_pan:
case nir_intrinsic_load_per_primitive_output:
case nir_intrinsic_store_output:
case nir_intrinsic_store_per_primitive_output:

View file

@ -1928,40 +1928,19 @@ bi_emit_ld_tile(bi_builder *b, nir_intrinsic_instr *instr)
bi_index dest = bi_def_index(&instr->def);
nir_alu_type T = nir_intrinsic_dest_type(instr);
nir_io_semantics sem = nir_intrinsic_io_semantics(instr);
bool is_zs = bi_is_zs(sem.location);
enum bi_register_format regfmt = bi_reg_fmt_for_nir(T);
unsigned size = instr->def.bit_size;
unsigned nr = instr->num_components;
unsigned target = 0, sample = 0;
if (sem.location == FRAG_RESULT_DEPTH) {
target = 255;
} else if (sem.location == FRAG_RESULT_STENCIL) {
target = 254;
} else if (nir_src_is_const(instr->src[0])) {
target = nir_src_as_uint(instr->src[0]);
assert(target < 8);
}
bi_index pi = bi_src_index(&instr->src[0]);
bi_index coverage = bi_src_index(&instr->src[1]);
bi_index conversion = bi_src_index(&instr->src[2]);
if (nir_src_is_const(instr->src[1]))
sample = nir_src_as_uint(instr->src[1]);
bi_instr *I = bi_ld_tile_to(b, dest, pi, coverage, conversion,
regfmt, nr - 1);
I->z_stencil = bi_is_zs(sem.location);
bi_index pi = bi_pixel_indices(b, target, sample);
if (!is_zs && !nir_src_is_const(instr->src[0]))
pi = bi_lshift_or(b, 32, bi_src_index(&instr->src[0]), pi, bi_imm_u8(8));
if (!nir_src_is_const(instr->src[1])) {
pi = bi_mux_i32(b, bi_src_index(&instr->src[1]), pi,
bi_imm_u32(0x1f), BI_MUX_BIT);
}
bi_instr *I = bi_ld_tile_to(b, dest, pi, bi_coverage(b),
bi_src_index(&instr->src[2]), regfmt, nr - 1);
if (is_zs)
I->z_stencil = true;
if (instr->intrinsic == nir_intrinsic_load_readonly_output_pan)
if (instr->intrinsic == nir_intrinsic_load_tile_res_pan)
I->wait_resource = true;
bi_emit_cached_split(b, dest, size * nr);
@ -2378,8 +2357,8 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr)
bi_emit_store_converted_mem(b, instr);
break;
case nir_intrinsic_load_converted_output_pan:
case nir_intrinsic_load_readonly_output_pan:
case nir_intrinsic_load_tile_pan:
case nir_intrinsic_load_tile_res_pan:
bi_emit_ld_tile(b, instr);
break;
@ -6184,9 +6163,11 @@ bi_lower_load_output(nir_builder *b, nir_intrinsic_instr *intr,
nir_def *conversion = nir_load_rt_conversion_pan(
b, .base = rt, .src_type = nir_intrinsic_dest_type(intr));
nir_def *lowered = nir_load_converted_output_pan(
b, intr->def.num_components, intr->def.bit_size, nir_imm_int(b, rt),
nir_imm_int(b, 0), conversion, .dest_type = nir_intrinsic_dest_type(intr),
nir_def *lowered = nir_load_tile_pan(
b, intr->def.num_components, intr->def.bit_size,
pan_nir_tile_location_sample(b, loc, nir_imm_int(b, 0)),
pan_nir_tile_default_coverage(b),
conversion, .dest_type = nir_intrinsic_dest_type(intr),
.io_semantics = nir_intrinsic_io_semantics(intr));
nir_def_rewrite_uses(&intr->def, lowered);

View file

@ -26,10 +26,44 @@
#define __PAN_NIR_H__
#include "nir.h"
#include "nir_builder.h"
#include "pan_compiler.h"
struct util_format_description;
/* Build the rt_sample_pixel source for load_tile_pan by packing four bytes
 * into a 32-bit value: {sample, rt, 0, 255}.  The top byte of 255 selects
 * "current pixel" (NOTE(review): per the original comment; confirm against
 * the LD_TILE pixel-index encoding).
 */
static inline nir_def *
pan_nir_tile_rt_sample(nir_builder *b, nir_def *rt, nir_def *sample)
{
   nir_def *sample_u8 = nir_u2u8(b, sample);
   nir_def *rt_u8 = nir_u2u8(b, rt);
   nir_def *zero_u8 = nir_imm_intN_t(b, 0, 8);
   /* y = 255 means "current pixel" */
   nir_def *pixel_u8 = nir_imm_intN_t(b, 255, 8);

   return nir_pack_32_4x8_split(b, sample_u8, rt_u8, zero_u8, pixel_u8);
}
/* Build the rt_sample_pixel source for load_tile_pan from a gl_frag_result.
 * Depth and stencil use the special render-target indices 255 and 254;
 * color outputs map FRAG_RESULT_DATA0..7 onto rt 0..7.
 */
static inline nir_def *
pan_nir_tile_location_sample(nir_builder *b, gl_frag_result location,
                             nir_def *sample)
{
   uint8_t rt;

   switch (location) {
   case FRAG_RESULT_DEPTH:
      rt = 255;
      break;
   case FRAG_RESULT_STENCIL:
      rt = 254;
      break;
   default:
      assert(location >= FRAG_RESULT_DATA0);
      rt = location - FRAG_RESULT_DATA0;
      break;
   }

   return pan_nir_tile_rt_sample(b, nir_imm_int(b, rt), sample);
}
/* Default coverage_offset source for load_tile_pan: the cumulative coverage
 * masked to its low 5 bits (0x1f, matching the coverage mask used by the
 * Bifrost backend lowering).
 */
static inline nir_def *
pan_nir_tile_default_coverage(nir_builder *b)
{
   nir_def *coverage = nir_load_cumulative_coverage_pan(b);

   return nir_iand_imm(b, coverage, 0x1f);
}
bool pan_nir_lower_store_component(nir_shader *shader);
bool pan_nir_lower_vertex_id(nir_shader *shader);

View file

@ -812,10 +812,13 @@ lower_rt_intrin(nir_builder *b, nir_intrinsic_instr *intr, void *data)
b->cursor = nir_after_instr(&intr->instr);
nir_def *lowered = nir_load_converted_output_pan(
nir_def *sample_id =
nr_samples > 1 ? nir_load_sample_id(b) : nir_imm_int(b, 0);
nir_def *lowered = nir_load_tile_pan(
b, intr->def.num_components, intr->def.bit_size,
nir_imm_int(b, rt),
nr_samples > 1 ? nir_load_sample_id(b) : nir_imm_int(b, 0),
pan_nir_tile_rt_sample(b, nir_imm_int(b, rt), sample_id),
pan_nir_tile_default_coverage(b),
nir_imm_int(b, blend_desc >> 32),
.dest_type = dest_type,
.io_semantics = io);

View file

@ -34,6 +34,7 @@
#include "nir.h"
#include "nir_builder.h"
#include "pan_nir.h"
struct panvk_lower_input_attachment_load_ctx {
uint32_t ro_color_mask;
@ -161,16 +162,20 @@ lower_input_attachment_load(nir_builder *b, nir_intrinsic_instr *intr,
iosem.location = FRAG_RESULT_DATA0;
nir_push_if(b, is_read_only);
{
load_ro_color = nir_load_readonly_output_pan(
b, intr->def.num_components, intr->def.bit_size, target,
intr->src[2].ssa, conversion, .dest_type = dest_type,
load_ro_color = nir_load_tile_res_pan(
b, intr->def.num_components, intr->def.bit_size,
pan_nir_tile_rt_sample(b, target, intr->src[2].ssa),
pan_nir_tile_default_coverage(b),
conversion, .dest_type = dest_type,
.access = nir_intrinsic_access(intr), .io_semantics = iosem);
}
nir_push_else(b, NULL);
{
load_rw_color = nir_load_converted_output_pan(
b, intr->def.num_components, intr->def.bit_size, target,
intr->src[2].ssa, conversion, .dest_type = dest_type,
load_rw_color = nir_load_tile_pan(
b, intr->def.num_components, intr->def.bit_size,
pan_nir_tile_rt_sample(b, target, intr->src[2].ssa),
pan_nir_tile_default_coverage(b),
conversion, .dest_type = dest_type,
.access = nir_intrinsic_access(intr), .io_semantics = iosem);
}
nir_pop_if(b, NULL);
@ -201,9 +206,11 @@ lower_input_attachment_load(nir_builder *b, nir_intrinsic_instr *intr,
iosem.location = dest_type == nir_type_float32 ? FRAG_RESULT_DEPTH
: FRAG_RESULT_STENCIL;
target = nir_imm_int(b, 0);
load_zs = nir_load_converted_output_pan(
b, intr->def.num_components, intr->def.bit_size, target,
intr->src[2].ssa, conversion, .dest_type = dest_type,
load_zs = nir_load_tile_pan(
b, intr->def.num_components, intr->def.bit_size,
pan_nir_tile_location_sample(b, iosem.location, intr->src[2].ssa),
pan_nir_tile_default_coverage(b),
conversion, .dest_type = dest_type,
.access = nir_intrinsic_access(intr), .io_semantics = iosem);
/* If we loaded the stencil value, the upper 24 bits might contain