mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2026-01-27 01:10:25 +01:00
brw: switch buffer/image size intrinsics lowering to NIR
Fossil-db DG2: Totals from 127 (0.01% of 1799288) affected shaders: Instrs: 60593 -> 60508 (-0.14%); split: -0.15%, +0.01% Cycle count: 7099635 -> 7116148 (+0.23%); split: -0.12%, +0.35% Spill count: 468 -> 466 (-0.43%) Fill count: 224 -> 222 (-0.89%) Max live registers: 6418 -> 6424 (+0.09%); split: -0.06%, +0.16% Non SSA regs after NIR: 11228 -> 11220 (-0.07%); split: -0.20%, +0.12% Fossil-db LNL: Totals from 135 (0.01% of 1573226) affected shaders: Instrs: 55173 -> 55143 (-0.05%); split: -0.07%, +0.01% Cycle count: 9178338 -> 9157052 (-0.23%); split: -0.32%, +0.09% Spill count: 454 -> 452 (-0.44%) Fill count: 181 -> 179 (-1.10%) Max live registers: 12915 -> 12919 (+0.03%); split: -0.06%, +0.09% Non SSA regs after NIR: 10860 -> 10852 (-0.07%); split: -0.20%, +0.13% shader-db LNL: total instructions in shared programs: 16911578 -> 16911566 (<.01%) instructions in affected programs: 1602 -> 1590 (-0.75%) helped: 7 HURT: 0 helped stats (abs) min: 1.0 max: 2.0 x̄: 1.71 x̃: 2 helped stats (rel) min: 0.48% max: 1.10% x̄: 0.75% x̃: 0.74% 95% mean confidence interval for instructions value: -2.17 -1.26 95% mean confidence interval for instructions %-change: -0.96% -0.55% Instructions are helped. total loops in shared programs: 5168 -> 5168 (0.00%) loops in affected programs: 0 -> 0 helped: 0 HURT: 0 total cycles in shared programs: 848964184 -> 848955094 (<.01%) cycles in affected programs: 1528020 -> 1518930 (-0.59%) helped: 9 HURT: 6 helped stats (abs) min: 2.0 max: 8484.0 x̄: 1212.89 x̃: 20 helped stats (rel) min: 0.02% max: 3.23% x̄: 0.57% x̃: 0.11% HURT stats (abs) min: 2.0 max: 1608.0 x̄: 304.33 x̃: 15 HURT stats (rel) min: <.01% max: 0.59% x̄: 0.19% x̃: 0.07% 95% mean confidence interval for cycles value: -1875.18 663.18 95% mean confidence interval for cycles %-change: -0.75% 0.23% Inconclusive result (value mean confidence interval includes 0). 
total spills in shared programs: 3345 -> 3345 (0.00%) spills in affected programs: 0 -> 0 helped: 0 HURT: 0 total fills in shared programs: 1777 -> 1777 (0.00%) fills in affected programs: 0 -> 0 helped: 0 HURT: 0 total sends in shared programs: 869299 -> 869299 (0.00%) sends in affected programs: 0 -> 0 helped: 0 HURT: 0 LOST: 0 GAINED: 0 Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/39258>
This commit is contained in:
parent
6293137d77
commit
fd744b0c8a
2 changed files with 81 additions and 109 deletions
|
|
@ -5076,47 +5076,6 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
|
|||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_image_size:
|
||||
case nir_intrinsic_bindless_image_size: {
|
||||
/* Cube image sizes should have previously been lowered to a 2D array */
|
||||
assert(nir_intrinsic_image_dim(instr) != GLSL_SAMPLER_DIM_CUBE);
|
||||
|
||||
/* Unlike the [un]typed load and store opcodes, the TXS that this turns
|
||||
* into will handle the binding table index for us in the generator.
|
||||
* Incidentally, this means that we can handle bindless with exactly the
|
||||
* same code.
|
||||
*/
|
||||
brw_reg image = retype(get_nir_src_imm(ntb, instr->src[0]), BRW_TYPE_UD);
|
||||
image = bld.emit_uniformize(image);
|
||||
|
||||
assert(nir_src_as_uint(instr->src[1]) == 0);
|
||||
|
||||
brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
|
||||
srcs[TEX_LOGICAL_SRC_SURFACE] = image;
|
||||
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
|
||||
srcs[TEX_LOGICAL_SRC_PAYLOAD0] = brw_imm_d(0); /* LOD (required) */
|
||||
|
||||
/* Since the image size is always uniform, we can just emit a SIMD8
|
||||
* query instruction and splat the result out.
|
||||
*/
|
||||
const brw_builder ubld = bld.scalar_group();
|
||||
|
||||
brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
|
||||
brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
|
||||
tmp, srcs,
|
||||
TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
|
||||
inst->required_params = 0x1 /* LOD */;
|
||||
inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
|
||||
inst->surface_bindless = instr->intrinsic == nir_intrinsic_bindless_image_size;
|
||||
inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
|
||||
|
||||
for (unsigned c = 0; c < instr->def.num_components; ++c) {
|
||||
bld.MOV(offset(retype(dest, tmp.type), bld, c),
|
||||
component(offset(tmp, ubld, c), 0));
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_barrier:
|
||||
case nir_intrinsic_begin_invocation_interlock:
|
||||
case nir_intrinsic_end_invocation_interlock: {
|
||||
|
|
@ -5570,67 +5529,6 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
|
|||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_get_ssbo_size: {
|
||||
assert(nir_src_num_components(instr->src[0]) == 1);
|
||||
|
||||
/* A resinfo's sampler message is used to get the buffer size. The
|
||||
* SIMD8's writeback message consists of four registers and SIMD16's
|
||||
* writeback message consists of 8 destination registers (two per each
|
||||
* component). Because we are only interested in the first channel of
|
||||
* the first returned component, where resinfo returns the buffer size
|
||||
* for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
|
||||
* the dispatch width.
|
||||
*/
|
||||
const brw_builder ubld = bld.scalar_group();
|
||||
|
||||
brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
|
||||
srcs[TEX_LOGICAL_SRC_SURFACE] = get_nir_buffer_intrinsic_index(ntb, bld, instr);
|
||||
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
|
||||
srcs[TEX_LOGICAL_SRC_PAYLOAD0] = brw_imm_d(0); /* LOD (required) */
|
||||
|
||||
brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
|
||||
brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
|
||||
tmp, srcs,
|
||||
TEX_LOGICAL_SRC_PAYLOAD0 + 1)->as_tex();
|
||||
inst->required_params = 0x1 /* LOD */;
|
||||
inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
|
||||
inst->surface_bindless = get_nir_src_bindless(ntb, instr->src[0]);
|
||||
inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
|
||||
inst->fused_eu_disable =
|
||||
(nir_intrinsic_access(instr) & ACCESS_FUSED_EU_DISABLE_INTEL) != 0;
|
||||
|
||||
for (unsigned c = 0; c < instr->def.num_components; ++c) {
|
||||
bld.MOV(offset(retype(dest, tmp.type), bld, c),
|
||||
component(offset(tmp, ubld, c), 0));
|
||||
}
|
||||
|
||||
/* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
|
||||
*
|
||||
* "Out-of-bounds checking is always performed at a DWord granularity. If
|
||||
* any part of the DWord is out-of-bounds then the whole DWord is
|
||||
* considered out-of-bounds."
|
||||
*
|
||||
* This implies that types with size smaller than 4-bytes need to be
|
||||
* padded if they don't complete the last dword of the buffer. But as we
|
||||
* need to maintain the original size we need to reverse the padding
|
||||
* calculation to return the correct size to know the number of elements
|
||||
* of an unsized array. As we stored in the last two bits of the surface
|
||||
* size the needed padding for the buffer, we calculate here the
|
||||
* original buffer_size reversing the surface_size calculation:
|
||||
*
|
||||
* surface_size = isl_align(buffer_size, 4) +
|
||||
* (isl_align(buffer_size, 4) - buffer_size)
|
||||
*
|
||||
* buffer_size = (surface_size & ~3) - (surface_size & 3)
|
||||
*/
|
||||
brw_reg size_padding = ubld.AND(tmp, brw_imm_ud(3));
|
||||
brw_reg size_aligned4 = ubld.AND(tmp, brw_imm_ud(~3));
|
||||
brw_reg buffer_size = ubld.ADD(size_aligned4, negate(size_padding));
|
||||
|
||||
bld.MOV(retype(dest, tmp.type), component(buffer_size, 0));
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_subgroup_size:
|
||||
/* This should only happen for fragment shaders because every other case
|
||||
* is lowered in NIR so we can optimize on it.
|
||||
|
|
|
|||
|
|
@ -36,9 +36,7 @@
|
|||
* to sample_po_c_l instead.
|
||||
*/
|
||||
static bool
|
||||
pre_lower_texture_instr(nir_builder *b,
|
||||
nir_tex_instr *tex,
|
||||
void *data)
|
||||
pre_lower_tex_instr(nir_builder *b, nir_tex_instr *tex)
|
||||
{
|
||||
switch (tex->op) {
|
||||
case nir_texop_txb: {
|
||||
|
|
@ -87,13 +85,89 @@ pre_lower_texture_instr(nir_builder *b,
|
|||
}
|
||||
}
|
||||
|
||||
/* Lower size intrinsic to use the sampler. */
|
||||
static bool
|
||||
pre_lower_intrinsic_instr(nir_builder *b, nir_intrinsic_instr *intrin)
|
||||
{
|
||||
enum glsl_sampler_dim dim = GLSL_SAMPLER_DIM_BUF;
|
||||
bool is_array = false;
|
||||
|
||||
switch (intrin->intrinsic) {
|
||||
case nir_intrinsic_get_ssbo_size:
|
||||
break;
|
||||
|
||||
case nir_intrinsic_bindless_image_size:
|
||||
case nir_intrinsic_image_size:
|
||||
dim = nir_intrinsic_image_dim(intrin);
|
||||
is_array = nir_intrinsic_image_array(intrin);
|
||||
break;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
|
||||
b->cursor = nir_before_instr(&intrin->instr);
|
||||
|
||||
nir_src *surface = nir_get_io_index_src(intrin);
|
||||
nir_intrinsic_instr *rsrc = nir_src_as_intrinsic(*surface);
|
||||
|
||||
bool bindless = rsrc && (nir_intrinsic_resource_access_intel(rsrc) &
|
||||
nir_resource_intel_bindless);
|
||||
|
||||
nir_def *txs = nir_txs(b, .lod = nir_imm_int(b, 0),
|
||||
.dim = dim, .is_array = is_array,
|
||||
.texture_offset = bindless ? NULL : surface->ssa,
|
||||
.texture_handle = bindless ? surface->ssa : NULL);
|
||||
|
||||
/* SKL PRM, vol07, 3D Media GPGPU Engine, Bounds Checking and Faulting:
|
||||
*
|
||||
* "Out-of-bounds checking is always performed at a DWord granularity. If
|
||||
* any part of the DWord is out-of-bounds then the whole DWord is
|
||||
* considered out-of-bounds."
|
||||
*
|
||||
* This implies that types with size smaller than 4-bytes need to be
|
||||
* padded if they don't complete the last dword of the buffer. But as we
|
||||
* need to maintain the original size we need to reverse the padding
|
||||
* calculation to return the correct size to know the number of elements
|
||||
* of an unsized array. As we stored in the last two bits of the surface
|
||||
* size the needed padding for the buffer, we calculate here the
|
||||
* original buffer_size reversing the surface_size calculation:
|
||||
*
|
||||
* surface_size = isl_align(buffer_size, 4) +
|
||||
* (isl_align(buffer_size) - buffer_size)
|
||||
*
|
||||
* buffer_size = surface_size & ~3 - surface_size & 3
|
||||
*/
|
||||
if (intrin->intrinsic == nir_intrinsic_get_ssbo_size)
|
||||
txs = nir_isub(b, txs, nir_imul_imm(b, nir_iand_imm(b, txs, 3), 2));
|
||||
|
||||
nir_def_replace(&intrin->def, txs);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool
|
||||
pre_lower_texture_instr(nir_builder *b, nir_instr *instr, void *data)
|
||||
{
|
||||
switch (instr->type) {
|
||||
case nir_instr_type_tex:
|
||||
return pre_lower_tex_instr(b, nir_instr_as_tex(instr));
|
||||
|
||||
case nir_instr_type_intrinsic:
|
||||
return pre_lower_intrinsic_instr(b, nir_instr_as_intrinsic(instr));
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
brw_nir_pre_lower_texture(nir_shader *shader)
|
||||
{
|
||||
return nir_shader_tex_pass(shader,
|
||||
pre_lower_texture_instr,
|
||||
nir_metadata_control_flow,
|
||||
NULL);
|
||||
return nir_shader_instructions_pass(shader,
|
||||
pre_lower_texture_instr,
|
||||
nir_metadata_control_flow,
|
||||
NULL);
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue