brw: switch buffer/image size intrinsics lowering to NIR

Fossil-db DG2:

Totals from 127 (0.01% of 1799288) affected shaders:
Instrs: 60593 -> 60508 (-0.14%); split: -0.15%, +0.01%
Cycle count: 7099635 -> 7116148 (+0.23%); split: -0.12%, +0.35%
Spill count: 468 -> 466 (-0.43%)
Fill count: 224 -> 222 (-0.89%)
Max live registers: 6418 -> 6424 (+0.09%); split: -0.06%, +0.16%
Non SSA regs after NIR: 11228 -> 11220 (-0.07%); split: -0.20%, +0.12%

Fossil-db LNL:

Totals from 135 (0.01% of 1573226) affected shaders:
Instrs: 55173 -> 55143 (-0.05%); split: -0.07%, +0.01%
Cycle count: 9178338 -> 9157052 (-0.23%); split: -0.32%, +0.09%
Spill count: 454 -> 452 (-0.44%)
Fill count: 181 -> 179 (-1.10%)
Max live registers: 12915 -> 12919 (+0.03%); split: -0.06%, +0.09%
Non SSA regs after NIR: 10860 -> 10852 (-0.07%); split: -0.20%, +0.13%

shader-db LNL:

total instructions in shared programs: 16911578 -> 16911566 (<.01%)
instructions in affected programs: 1602 -> 1590 (-0.75%)
helped: 7
HURT: 0
helped stats (abs) min: 1.0 max: 2.0 x̄: 1.71 x̃: 2
helped stats (rel) min: 0.48% max: 1.10% x̄: 0.75% x̃: 0.74%
95% mean confidence interval for instructions value: -2.17 -1.26
95% mean confidence interval for instructions %-change: -0.96% -0.55%
Instructions are helped.

total loops in shared programs: 5168 -> 5168 (0.00%)
loops in affected programs: 0 -> 0
helped: 0
HURT: 0

total cycles in shared programs: 848964184 -> 848955094 (<.01%)
cycles in affected programs: 1528020 -> 1518930 (-0.59%)
helped: 9
HURT: 6
helped stats (abs) min: 2.0 max: 8484.0 x̄: 1212.89 x̃: 20
helped stats (rel) min: 0.02% max: 3.23% x̄: 0.57% x̃: 0.11%
HURT stats (abs)   min: 2.0 max: 1608.0 x̄: 304.33 x̃: 15
HURT stats (rel)   min: <.01% max: 0.59% x̄: 0.19% x̃: 0.07%
95% mean confidence interval for cycles value: -1875.18 663.18
95% mean confidence interval for cycles %-change: -0.75% 0.23%
Inconclusive result (value mean confidence interval includes 0).

total spills in shared programs: 3345 -> 3345 (0.00%)
spills in affected programs: 0 -> 0
helped: 0
HURT: 0

total fills in shared programs: 1777 -> 1777 (0.00%)
fills in affected programs: 0 -> 0
helped: 0
HURT: 0

total sends in shared programs: 869299 -> 869299 (0.00%)
sends in affected programs: 0 -> 0
helped: 0
HURT: 0

LOST:   0
GAINED: 0

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39258>
This commit is contained in:
Lionel Landwerlin 2026-01-12 10:49:24 +02:00 committed by Marge Bot
parent 6293137d77
commit fd744b0c8a
2 changed files with 81 additions and 109 deletions

View file

@ -5076,47 +5076,6 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
break;
}
case nir_intrinsic_image_size:
case nir_intrinsic_bindless_image_size: {
/* Cube image sizes should have previously been lowered to a 2D array */
assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
/* Unlike the [un]typed load and store opcodes, the TXS that this turns
* into will handle the binding table index for us in the generator.
* Incidentally, this means that we can handle bindless with exactly the
* same code.
*/
brw_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), BRW_TYPE_UD);
image = bld.emit_uniformize(image);
assert(nir_src_as_uint(instr->src[1]) == 0);
brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
srcs[TEX_LOGICAL_SRC_SURFACE] = image;
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
srcs[TEX_LOGICAL_SRC_PAYLOAD0] = brw_imm_d(0); /* LOD (required) */
/* Since the image size is always uniform, we can just emit a SIMD8
* query instruction and splat the result out.
*/
const brw_builder ubld = bld.scalar_group();
brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
tmp, srcs,
TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
inst->required_params = 0x1 /* LOD */;
inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
inst->surface_bindless = instr->intrinsic == nir_intrinsic_bindless_image_size;
inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
for (unsigned c = 0; c < instr->def.num_components; ++c) {
bld.MOV(offset(retype(dest, tmp.type), bld, c),
component(offset(tmp, ubld, c), 0));
}
break;
}
case nir_intrinsic_barrier:
case nir_intrinsic_begin_invocation_interlock:
case nir_intrinsic_end_invocation_interlock: {
@ -5570,67 +5529,6 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
break;
}
case nir_intrinsic_get_ssbo_size: {
assert(nir_src_num_components(instr->src[0]) == 1);
/* A resinfo's sampler message is used to get the buffer size. The
* SIMD8's writeback message consists of four registers and SIMD16's
* writeback message consists of 8 destination registers (two per each
* component). Because we are only interested in the first channel of
* the first returned component, where resinfo returns the buffer size
* for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
* the dispatch width.
*/
const brw_builder ubld = bld.scalar_group();
brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
srcs[TEX_LOGICAL_SRC_SURFACE] = get_nir_buffer_intrinsic_index(ntb, bld, instr);
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
srcs[TEX_LOGICAL_SRC_PAYLOAD0] = brw_imm_d(0); /* LOD (required) */
brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
tmp, srcs,
TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
inst->required_params = 0x1 /* LOD */;
inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
inst->surface_bindless = get_nir_src_bindless(ntb, instr->src[0]);
inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
inst->fused_eu_disable =
(nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL) != 0;
for (unsigned c = 0; c < instr->def.num_components; ++c) {
bld.MOV(offset(retype(dest, tmp.type), bld, c),
component(offset(tmp, ubld, c), 0));
}
/* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
*
* "Out-of-bounds checking is always performed at a DWord granularity. If
* any part of the DWord is out-of-bounds then the whole DWord is
* considered out-of-bounds."
*
* This implies that types with size smaller than 4-bytes need to be
* padded if they don't complete the last dword of the buffer. But as we
* need to maintain the original size we need to reverse the padding
* calculation to return the correct size to know the number of elements
* of an unsized array. As we stored in the last two bits of the surface
* size the needed padding for the buffer, we calculate here the
* original buffer_size reversing the surface_size calculation:
*
* surface_size = isl_align(buffer_size, 4) +
* (isl_align(buffer_size, 4) - buffer_size)
*
* buffer_size = (surface_size & ~3) - (surface_size & 3)
*/
brw_reg size_padding = ubld.AND(tmp, brw_imm_ud(3));
brw_reg size_aligned4 = ubld.AND(tmp, brw_imm_ud(~3));
brw_reg buffer_size = ubld.ADD(size_aligned4, negate(size_padding));
bld.MOV(retype(dest, tmp.type), component(buffer_size, 0));
break;
}
case nir_intrinsic_load_subgroup_size:
/* This should only happen for fragment shaders because every other case
* is lowered in NIR so we can optimize on it.

View file

@ -36,9 +36,7 @@
* to sample_po_c_l instead.
*/
static bool
pre_lower_texture_instr(nir_builder *b,
nir_tex_instr *tex,
void *data)
pre_lower_tex_instr(nir_builder *b, nir_tex_instr *tex)
{
switch (tex->op) {
case nir_texop_txb: {
@ -87,13 +85,89 @@ pre_lower_texture_instr(nir_builder *b,
}
}
/* Rewrite buffer/image size query intrinsics as a txs texture instruction.
 *
 * Handles get_ssbo_size, image_size and bindless_image_size; any other
 * intrinsic is left untouched.
 *
 * Returns true when the intrinsic was replaced, false otherwise.
 */
static bool
pre_lower_intrinsic_instr(nir_builder *b, nir_intrinsic_instr *intrin)
{
   enum glsl_sampler_dim dim;
   bool is_array;

   if (intrin->intrinsic == nir_intrinsic_get_ssbo_size) {
      /* SSBOs are queried as a non-array buffer surface. */
      dim = GLSL_SAMPLER_DIM_BUF;
      is_array = false;
   } else if (intrin->intrinsic == nir_intrinsic_image_size ||
              intrin->intrinsic == nir_intrinsic_bindless_image_size) {
      /* Images carry their own dimensionality on the intrinsic. */
      dim = nir_intrinsic_image_dim(intrin);
      is_array = nir_intrinsic_image_array(intrin);
   } else {
      return false;
   }

   b->cursor = nir_before_instr(&intrin->instr);

   nir_src *surface = nir_get_io_index_src(intrin);
   nir_intrinsic_instr *rsrc = nir_src_as_intrinsic(*surface);

   /* The surface is bindless only when its source is produced by an
    * intrinsic whose resource access flags include the bindless bit.
    */
   bool bindless = false;
   if (rsrc) {
      bindless = (nir_intrinsic_resource_access_intel(rsrc) &
                  nir_resource_intel_bindless) != 0;
   }

   /* Exactly one of offset/handle is set, depending on bindless-ness. */
   nir_def *tex_offset = bindless ? NULL : surface->ssa;
   nir_def *tex_handle = bindless ? surface->ssa : NULL;

   nir_def *size = nir_txs(b, .lod = nir_imm_int(b, 0),
                           .dim = dim, .is_array = is_array,
                           .texture_offset = tex_offset,
                           .texture_handle = tex_handle);

   if (intrin->intrinsic == nir_intrinsic_get_ssbo_size) {
      /* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
       *
       *    "Out-of-bounds checking is always performed at a DWord
       *     granularity. If any part of the DWord is out-of-bounds then the
       *     whole DWord is considered out-of-bounds."
       *
       * Buffers whose size is not a multiple of 4 bytes therefore get padded
       * up to the next dword, but the shader still needs the original size
       * to compute the length of an unsized array. The amount of padding is
       * stored in the low two bits of the surface size:
       *
       *    surface_size = isl_align(buffer_size, 4) +
       *                   (isl_align(buffer_size, 4) - buffer_size)
       *
       * which is undone with:
       *
       *    buffer_size = (surface_size & ~3) - (surface_size & 3)
       *
       * Since (surface_size & ~3) == surface_size - (surface_size & 3), this
       * is emitted as surface_size - 2 * (surface_size & 3).
       */
      nir_def *padding = nir_iand_imm(b, size, 3);
      nir_def *twice_padding = nir_imul_imm(b, padding, 2);
      size = nir_isub(b, size, twice_padding);
   }

   nir_def_replace(&intrin->def, size);

   return true;
}
/* Per-instruction callback: forwards texture instructions to
 * pre_lower_tex_instr() and intrinsics to pre_lower_intrinsic_instr().
 * Every other instruction type is reported as unchanged. The `data`
 * argument is not used.
 */
static bool
pre_lower_texture_instr(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type == nir_instr_type_tex)
      return pre_lower_tex_instr(b, nir_instr_as_tex(instr));

   if (instr->type == nir_instr_type_intrinsic)
      return pre_lower_intrinsic_instr(b, nir_instr_as_intrinsic(instr));

   return false;
}
/* Pass entry point: applies pre_lower_texture_instr to `shader`, passing
 * nir_metadata_control_flow as the metadata argument and no callback data,
 * and returns the result of the pass helper.
 *
 * NOTE(review): two return statements appear back to back below. This looks
 * like the removed (nir_shader_tex_pass) and added
 * (nir_shader_instructions_pass) lines of a diff shown without +/- markers;
 * the nir_shader_instructions_pass call is the one that matches a callback
 * taking a nir_instr pointer - confirm against the actual file.
 */
bool
brw_nir_pre_lower_texture(nir_shader *shader)
{
return nir_shader_tex_pass(shader,
pre_lower_texture_instr,
nir_metadata_control_flow,
NULL);
return nir_shader_instructions_pass(shader,
pre_lower_texture_instr,
nir_metadata_control_flow,
NULL);
}
/**