brw: switch to new sampler payload description scheme

Instead of having abstracted opcodes, we target the HW format directly
at NIR translation time.

The payload description gives us the order of the payload sources
(which we can also use for pretty printing), so we no longer need a
complicated ordering scheme in the logical send lowering. All we have
to do is build the header if needed, as well as the descriptors.

PTL Fossil-db stats:
 Totals from 66759 (13.54% of 492917) affected shaders:
 Instrs: 44289221 -> 43957404 (-0.75%); split: -0.81%, +0.06%
 Send messages: 2050378 -> 2042607 (-0.38%)
 Cycle count: 3878874713 -> 3712848434 (-4.28%); split: -4.44%, +0.16%
 Max live registers: 8773179 -> 8770104 (-0.04%); split: -0.06%, +0.03%
 Max dispatch width: 1677408 -> 1707952 (+1.82%); split: +1.85%, -0.03%
 Non SSA regs after NIR: 11407821 -> 11421041 (+0.12%); split: -0.03%, +0.15%
 GRF registers: 5686983 -> 5838785 (+2.67%); split: -0.24%, +2.91%

LNL Fossil-db stats:

 Totals from 57911 (15.72% of 368381) affected shaders:
 Instrs: 39448036 -> 38923650 (-1.33%); split: -1.41%, +0.08%
 Subgroup size: 1241360 -> 1241392 (+0.00%)
 Send messages: 1846696 -> 1845137 (-0.08%)
 Cycle count: 3834818910 -> 3784003027 (-1.33%); split: -2.33%, +1.00%
 Spill count: 21866 -> 22168 (+1.38%); split: -0.07%, +1.45%
 Fill count: 59324 -> 60339 (+1.71%); split: -0.00%, +1.71%
 Scratch Memory Size: 1479680 -> 1483776 (+0.28%)
 Max live registers: 7521376 -> 7447841 (-0.98%); split: -1.04%, +0.06%
 Non SSA regs after NIR: 9744605 -> 10113728 (+3.79%); split: -0.01%, +3.80%

Only 2 titles negatively impacted (spilling):
  - Shadow of the Tomb Raider
  - Red Dead Redemption 2

All impacted shaders were already spilling.

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Acked-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/37171>
This commit is contained in:
Lionel Landwerlin 2025-09-01 22:48:51 +03:00 committed by Marge Bot
parent 232697a0a3
commit efcba73b49
13 changed files with 521 additions and 1052 deletions

View file

@ -552,35 +552,6 @@ enum ENUM_PACKED opcode {
SHADER_OPCODE_LOAD_REG,
};
enum sampler_opcode {
/**
* Texture sampling opcodes.
*
* LOGICAL opcodes are eventually translated to SHADER_OPCODE_SEND but
* take parameters as individual sources. See enum tex_logical_srcs.
*/
SAMPLER_OPCODE_TEX_LOGICAL,
SAMPLER_OPCODE_TXD_LOGICAL,
SAMPLER_OPCODE_TXF_LOGICAL,
SAMPLER_OPCODE_TXL_LOGICAL,
SAMPLER_OPCODE_TXS_LOGICAL,
SAMPLER_OPCODE_TXB_LOGICAL,
SAMPLER_OPCODE_TXF_CMS_W_LOGICAL,
SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL,
SAMPLER_OPCODE_TXF_MCS_LOGICAL,
SAMPLER_OPCODE_LOD_LOGICAL,
SAMPLER_OPCODE_TG4_LOGICAL,
SAMPLER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL,
SAMPLER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL,
SAMPLER_OPCODE_TG4_BIAS_LOGICAL,
SAMPLER_OPCODE_TG4_OFFSET_LOGICAL,
SAMPLER_OPCODE_TG4_OFFSET_LOD_LOGICAL,
SAMPLER_OPCODE_TG4_OFFSET_BIAS_LOGICAL,
SAMPLER_OPCODE_SAMPLEINFO_LOGICAL,
SAMPLER_OPCODE_IMAGE_SIZE_LOGICAL,
};
enum send_srcs {
/** The 32-bit message descriptor (can be a register) */
SEND_SRC_DESC,
@ -612,26 +583,24 @@ enum fb_write_logical_srcs {
};
enum tex_logical_srcs {
/** Texture coordinates */
TEX_LOGICAL_SRC_COORDINATE,
/** Shadow comparator */
TEX_LOGICAL_SRC_SHADOW_C,
/** dPdx if the operation takes explicit derivatives, otherwise LOD value */
TEX_LOGICAL_SRC_LOD,
/** dPdy if the operation takes explicit derivatives */
TEX_LOGICAL_SRC_LOD2,
/** Min LOD */
TEX_LOGICAL_SRC_MIN_LOD,
/** Sample index */
TEX_LOGICAL_SRC_SAMPLE_INDEX,
/** MCS data */
TEX_LOGICAL_SRC_MCS,
/** REQUIRED: Texture surface index */
TEX_LOGICAL_SRC_SURFACE,
/** Texture sampler index */
TEX_LOGICAL_SRC_SAMPLER,
/** Texel offset for gathers */
TEX_LOGICAL_SRC_TG4_OFFSET,
/** Sampler payloads */
TEX_LOGICAL_SRC_PAYLOAD0,
TEX_LOGICAL_SRC_PAYLOAD1,
TEX_LOGICAL_SRC_PAYLOAD2,
TEX_LOGICAL_SRC_PAYLOAD3,
TEX_LOGICAL_SRC_PAYLOAD4,
TEX_LOGICAL_SRC_PAYLOAD5,
TEX_LOGICAL_SRC_PAYLOAD6,
TEX_LOGICAL_SRC_PAYLOAD7,
TEX_LOGICAL_SRC_PAYLOAD8,
TEX_LOGICAL_SRC_PAYLOAD9,
TEX_LOGICAL_SRC_PAYLOAD10,
TEX_LOGICAL_SRC_PAYLOAD11,
TEX_LOGICAL_SRC_PAYLOAD12,
TEX_LOGICAL_NUM_SRCS,
};

View file

@ -88,38 +88,6 @@ static void brw_from_nir_emit_memory_access(nir_to_brw_state &ntb,
static void brw_combine_with_vec(const brw_builder &bld, const brw_reg &dst,
const brw_reg &src, unsigned n);
static bool
brw_texture_offset(const nir_tex_instr *tex, unsigned src,
uint32_t *offset_bits_out)
{
if (!nir_src_is_const(tex->src[src].src))
return false;
const unsigned num_components = nir_tex_instr_src_size(tex, src);
/* Combine all three offsets into a single unsigned dword:
*
* bits 11:8 - U Offset (X component)
* bits 7:4 - V Offset (Y component)
* bits 3:0 - R Offset (Z component)
*/
uint32_t offset_bits = 0;
for (unsigned i = 0; i < num_components; i++) {
int offset = nir_src_comp_as_int(tex->src[src].src, i);
/* offset out of bounds; caller will handle it. */
if (offset > 7 || offset < -8)
return false;
const unsigned shift = 4 * (2 - i);
offset_bits |= (offset & 0xF) << shift;
}
*offset_bits_out = offset_bits;
return true;
}
static brw_reg
setup_imm_b(const brw_builder &bld, int8_t v)
{
@ -5945,6 +5913,7 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
srcs[TEX_LOGICAL_SRC_SURFACE] = image;
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_d(0);
srcs[TEX_LOGICAL_SRC_PAYLOAD0] = brw_imm_d(0); /* LOD (required) */
/* Since the image size is always uniform, we can just emit a SIMD8
* query instruction and splat the result out.
@ -5953,8 +5922,9 @@ brw_from_nir_emit_intrinsic(nir_to_brw_state &ntb,
brw_reg tmp = ubld.vgrf(BRW_TYPE_UD, 4);
brw_tex_inst *inst = ubld.emit(SHADER_OPCODE_SAMPLER,
tmp, srcs, ARRAY_SIZE(srcs))->as_tex();
inst->sampler_opcode = SAMPLER_OPCODE_IMAGE_SIZE_LOGICAL;
tmp, srcs, 3)->as_tex();
inst->required_params = 0x1 /* LOD */;
inst->sampler_opcode = BRW_SAMPLER_OPCODE_RESINFO;
inst->surface_bindless = instr->intrinsic == nir_intrinsic_bindless_image_size;
inst->size_written = 4 * REG_SIZE * reg_unit(devinfo);
@ -7359,11 +7329,6 @@ static void
brw_from_nir_emit_texture(nir_to_brw_state &ntb,
nir_tex_instr *instr)
{
const intel_device_info *devinfo = ntb.devinfo;
const brw_builder &bld = ntb.bld;
brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
/* SKL PRMs: Volume 7: 3D-Media-GPGPU:
*
* "The Pixel Null Mask field, when enabled via the Pixel Null Mask
@ -7373,270 +7338,170 @@ brw_from_nir_emit_texture(nir_to_brw_state &ntb,
*
* We'll take care of this in NIR.
*/
assert(!instr->is_sparse || srcs[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
assert(!instr->is_sparse ||
nir_tex_instr_src_index(instr, nir_tex_src_comparator) == -1);
int lod_components = 0;
const intel_device_info *devinfo = ntb.devinfo;
const brw_builder &bld = ntb.bld;
/* The hardware requires a LOD for buffer textures */
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
ASSERTED bool got_lod = false;
ASSERTED bool got_bias = false;
bool pack_lod_bias_and_offset = false;
uint32_t header_bits = 0;
const enum brw_sampler_opcode sampler_opcode =
(enum brw_sampler_opcode)(instr->backend_flags &
~BRW_TEX_INSTR_FUSED_EU_DISABLE);
const struct brw_sampler_payload_desc *payload_desc =
brw_get_sampler_payload_desc(sampler_opcode);
brw_reg_type default_src_type;
switch (instr->op) {
case nir_texop_txf_ms:
case nir_texop_txf_ms_mcs_intel:
default_src_type = devinfo->verx10 >= 125 ? BRW_TYPE_W : BRW_TYPE_D;
break;
case nir_texop_txf:
case nir_texop_txs:
default_src_type = BRW_TYPE_D;
break;
default:
default_src_type = BRW_TYPE_F;
break;
}
for (unsigned i = 0; i < instr->num_srcs; i++) {
nir_src nir_src = instr->src[i].src;
brw_reg src = get_nir_src(ntb, nir_src, -1);
/* If the source is not a vector (e.g., a 1D texture coordinate), then
* the eventual LOAD_PAYLOAD lowering will not properly adjust the
* stride, etc., so do it now.
*/
if (nir_tex_instr_src_size(instr, i) == 1)
src = offset(src, bld, 0);
brw_reg_type src_type = BRW_TYPE_F;
switch (instr->src[i].src_type) {
case nir_tex_src_sampler_offset:
case nir_tex_src_texture_offset:
case nir_tex_src_sampler_handle:
case nir_tex_src_texture_handle:
case nir_tex_src_offset:
src_type = BRW_TYPE_D;
break;
default:
src_type = default_src_type;
break;
}
switch (instr->src[i].src_type) {
case nir_tex_src_bias:
assert(!got_lod);
got_bias = true;
srcs[TEX_LOGICAL_SRC_LOD] =
retype(get_nir_src_imm(ntb, instr->src[i].src), src_type);
break;
case nir_tex_src_comparator:
srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, src_type);
break;
case nir_tex_src_coord:
srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, src_type);
break;
case nir_tex_src_ddx:
srcs[TEX_LOGICAL_SRC_LOD] = retype(src, src_type);
lod_components = nir_tex_instr_src_size(instr, i);
break;
case nir_tex_src_ddy:
srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, src_type);
break;
case nir_tex_src_lod:
assert(!got_bias);
got_lod = true;
srcs[TEX_LOGICAL_SRC_LOD] =
retype(get_nir_src_imm(ntb, instr->src[i].src), src_type);
break;
case nir_tex_src_min_lod:
srcs[TEX_LOGICAL_SRC_MIN_LOD] =
retype(get_nir_src_imm(ntb, instr->src[i].src), src_type);
break;
case nir_tex_src_ms_index:
srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, src_type);
break;
case nir_tex_src_offset: {
uint32_t offset_bits = 0;
if (brw_texture_offset(instr, i, &offset_bits)) {
header_bits |= offset_bits;
} else {
/* On gfx12.5+, if the offsets are not both constant and in the
* {-8,7} range, nir_lower_tex() will have already lowered the
* source offset. So we should never reach this point.
*/
assert(devinfo->verx10 < 125);
srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
retype(src, src_type);
}
break;
}
case nir_tex_src_projector:
UNREACHABLE("should be lowered");
case nir_tex_src_texture_offset:
assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE);
/* Emit code to evaluate the actual indexing expression */
srcs[TEX_LOGICAL_SRC_SURFACE] =
bld.emit_uniformize(bld.ADD(retype(src, BRW_TYPE_UD),
brw_imm_ud(instr->texture_index)));
break;
case nir_tex_src_sampler_offset:
assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle) == -1);
assert(srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE);
/* Emit code to evaluate the actual indexing expression */
srcs[TEX_LOGICAL_SRC_SAMPLER] =
bld.emit_uniformize(bld.ADD(retype(src, BRW_TYPE_UD),
brw_imm_ud(instr->sampler_index)));
break;
case nir_tex_src_texture_handle:
assert(nir_tex_instr_src_index(instr, nir_tex_src_texture_offset) == -1);
assert(srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE);
srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(src);
break;
case nir_tex_src_sampler_handle:
assert(nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset) == -1);
assert(srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE);
srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(src);
break;
case nir_tex_src_ms_mcs_intel:
assert(instr->op == nir_texop_txf_ms);
srcs[TEX_LOGICAL_SRC_MCS] = retype(src, src_type);
break;
/* If this parameter is present, we are packing offset U, V and LOD/Bias
* into a single (32-bit) value.
*/
case nir_tex_src_backend2:
assert(instr->op == nir_texop_tg4);
pack_lod_bias_and_offset = true;
srcs[TEX_LOGICAL_SRC_LOD] =
retype(get_nir_src_imm(ntb, instr->src[i].src), src_type);
break;
/* If this parameter is present, we are packing either the explicit LOD
* or LOD bias and the array index into a single (32-bit) value when
* 32-bit texture coordinates are used.
*/
case nir_tex_src_backend1:
assert(!got_lod && !got_bias);
got_lod = true;
assert(instr->op == nir_texop_txl || instr->op == nir_texop_txb);
srcs[TEX_LOGICAL_SRC_LOD] =
retype(get_nir_src_imm(ntb, instr->src[i].src), src_type);
break;
default:
UNREACHABLE("unknown texture source");
}
}
const bool surface_bindless = nir_tex_instr_src_index(
instr, nir_tex_src_texture_handle) >= 0;
const bool sampler_bindless = nir_tex_instr_src_index(
instr, nir_tex_src_sampler_handle) >= 0;
/* If the surface or sampler were not specified through sources, use the
* instruction index.
*/
if (srcs[TEX_LOGICAL_SRC_SURFACE].file == BAD_FILE)
srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(instr->texture_index);
if (srcs[TEX_LOGICAL_SRC_SAMPLER].file == BAD_FILE)
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(instr->sampler_index);
assert(srcs[TEX_LOGICAL_SRC_MCS].file != BAD_FILE ||
instr->op != nir_texop_txf_ms);
enum sampler_opcode opcode;
switch (instr->op) {
case nir_texop_tex:
opcode = SAMPLER_OPCODE_TEX_LOGICAL;
break;
case nir_texop_txb:
opcode = SAMPLER_OPCODE_TXB_LOGICAL;
break;
case nir_texop_txl:
opcode = SAMPLER_OPCODE_TXL_LOGICAL;
break;
case nir_texop_txd:
opcode = SAMPLER_OPCODE_TXD_LOGICAL;
break;
case nir_texop_txf:
opcode = SAMPLER_OPCODE_TXF_LOGICAL;
break;
case nir_texop_txf_ms:
/* On Gfx12HP there is only CMS_W available. From the Bspec: Shared
* Functions - 3D Sampler - Messages - Message Format:
*
* ld2dms REMOVEDBY(GEN:HAS:1406788836)
*/
if (devinfo->verx10 >= 125)
opcode = SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL;
else
opcode = SAMPLER_OPCODE_TXF_CMS_W_LOGICAL;
break;
case nir_texop_txf_ms_mcs_intel:
opcode = SAMPLER_OPCODE_TXF_MCS_LOGICAL;
break;
case nir_texop_query_levels:
case nir_texop_txs:
opcode = SAMPLER_OPCODE_TXS_LOGICAL;
break;
case nir_texop_lod:
opcode = SAMPLER_OPCODE_LOD_LOGICAL;
break;
case nir_texop_tg4: {
if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE) {
opcode = SAMPLER_OPCODE_TG4_OFFSET_LOGICAL;
/* First deal with surface & sampler */
bool surface_bindless = false;
bool sampler_bindless = false;
int src_idx;
{
if ((src_idx = nir_tex_instr_src_index(instr, nir_tex_src_texture_handle)) >= 0) {
srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(
get_nir_src(ntb, instr->src[src_idx].src, -1));
surface_bindless = true;
} else if ((src_idx = nir_tex_instr_src_index(instr, nir_tex_src_texture_offset)) >= 0) {
srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(
bld.ADD(get_nir_src(ntb, instr->src[src_idx].src, -1),
brw_imm_ud(instr->texture_index)));
} else {
opcode = SAMPLER_OPCODE_TG4_LOGICAL;
if (devinfo->ver >= 20) {
/* If SPV_AMD_texture_gather_bias_lod extension is enabled, all
* texture gather functions (ie. the ones which do not take the
* extra bias argument and the ones that do) fetch texels from
* implicit LOD in fragment shader stage. In all other shader
* stages, base level is used instead.
*/
if (instr->is_gather_implicit_lod)
opcode = SAMPLER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL;
if (got_bias)
opcode = SAMPLER_OPCODE_TG4_BIAS_LOGICAL;
if (got_lod)
opcode = SAMPLER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL;
if (pack_lod_bias_and_offset) {
if (got_lod)
opcode = SAMPLER_OPCODE_TG4_OFFSET_LOD_LOGICAL;
if (got_bias)
opcode = SAMPLER_OPCODE_TG4_OFFSET_BIAS_LOGICAL;
}
}
srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(instr->texture_index);
}
if ((src_idx = nir_tex_instr_src_index(instr, nir_tex_src_sampler_handle)) >= 0) {
srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(
get_nir_src(ntb, instr->src[src_idx].src, -1));
sampler_bindless = true;
} else if ((src_idx = nir_tex_instr_src_index(instr, nir_tex_src_sampler_offset)) >= 0) {
srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(
bld.ADD(get_nir_src(ntb, instr->src[src_idx].src, -1),
brw_imm_ud(instr->sampler_index)));
} else {
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(instr->sampler_index);
}
break;
}
case nir_texop_texture_samples:
opcode = SAMPLER_OPCODE_SAMPLEINFO_LOGICAL;
break;
default:
UNREACHABLE("unknown texture opcode");
}
if (instr->op == nir_texop_tg4) {
header_bits |= instr->component << 16;
/* Now the sampler payload */
bool has_offset_in_payload = false;
uint32_t n_sources = TEX_LOGICAL_SRC_PAYLOAD0;
uint16_t required_params = 0;
for (uint32_t i = 0; payload_desc->sources[i].param != BRW_SAMPLER_PAYLOAD_PARAM_INVALID; i++) {
nir_tex_src_type nir_source;
unsigned nir_comp;
#define P(name) BRW_SAMPLER_PAYLOAD_PARAM_##name
#define S(name, component) do { \
nir_source = nir_tex_src_##name; \
nir_comp = component; \
} while (0)
struct brw_sampler_payload_src sampler_src =
payload_desc->sources[i];
switch (sampler_src.param) {
case P(U): S(coord, 0); break;
case P(V): S(coord, 1); break;
case P(R): S(coord, 2); break;
case P(AI): S(coord, 3); break;
case P(BIAS): S(bias, 0); break;
case P(LOD): S(lod, 0); break;
case P(MLOD): S(min_lod, 0); break;
case P(REF): S(comparator, 0); break;
case P(DUDX): S(ddx, 0); break;
case P(DUDY): S(ddy, 0); break;
case P(DVDX): S(ddx, 1); break;
case P(DVDY): S(ddy, 1); break;
case P(DRDX): S(ddx, 2); break;
case P(DRDY): S(ddy, 2); break;
case P(SI): S(ms_index, 0); break;
case P(MCSL): S(ms_mcs_intel, 0); break;
case P(MCSH): S(ms_mcs_intel, 1); break;
case P(MCS0): S(ms_mcs_intel, 0); break;
case P(MCS1): S(ms_mcs_intel, 1); break;
case P(MCS2): S(ms_mcs_intel, 2); break;
case P(MCS3): S(ms_mcs_intel, 3); break;
case P(OFFU):
S(offset, 0);
has_offset_in_payload = true;
break;
case P(OFFV):
S(offset, 1);
has_offset_in_payload = true;
break;
case P(OFFUV4):
case P(OFFUVR4):
case P(OFFUV6):
case P(OFFUVR6):
case P(BIAS_OFFUV6):
case P(BIAS_OFFUVR4):
case P(LOD_OFFUV6):
case P(LOD_OFFUVR4):
/* There is no payload with 2 packed entries, so backend1 is always
* the one payload parameter packed. */
S(backend1, 0);
has_offset_in_payload = true;
break;
case P(BIAS_AI):
case P(LOD_AI):
case P(MLOD_R):
/* There is no payload with 2 packed entries, so backend1 is always
* the one payload parameter packed. */
S(backend1, 0);
break;
default: UNREACHABLE("unhandled sampler param");
}
#undef P
#undef S
/* TODO: make sure sources have consistent bit sizes */
brw_reg param_val = brw_imm_ud(0);
src_idx = nir_tex_instr_src_index(instr, nir_source);
if (src_idx >= 0 &&
nir_comp < instr->src[src_idx].src.ssa->num_components) {
param_val =
get_nir_src(ntb, instr->src[src_idx].src, nir_comp);
}
/* The hardware requires a LOD for buffer textures */
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF &&
sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_LOD) {
sampler_src.optional = false;
}
/* Wa_14012688258:
*
* Don't trim zeros at the end of payload for sample operations
* in cube and cube arrays.
*
* Compiler should send U,V,R parameters even if V,R are 0.
*/
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
intel_needs_workaround(devinfo, 14012688258) &&
(sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_U ||
sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_V ||
sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_R)) {
sampler_src.optional = false;
}
srcs[TEX_LOGICAL_SRC_PAYLOAD0 + i] = param_val;
/* The last source present in the payload dictates the number of
* sources, unless it's required.
*
* We can skip the last source if it's zero.
*/
if (!sampler_src.optional ||
!(param_val.file == IMM && param_val.ud == 0))
n_sources = TEX_LOGICAL_SRC_PAYLOAD0 + i + 1;
if (!sampler_src.optional)
required_params |= BITFIELD_BIT(i);
}
brw_reg nir_def_reg = get_nir_def(ntb, instr->def);
@ -7669,31 +7534,32 @@ brw_from_nir_emit_texture(nir_to_brw_state &ntb,
brw_allocate_vgrf_units(*bld.shader, total_regs * reg_unit(devinfo)),
dst_type);
brw_tex_inst *tex = bld.emit(SHADER_OPCODE_SAMPLER, dst, srcs, ARRAY_SIZE(srcs))->as_tex();
tex->sampler_opcode = opcode;
brw_tex_inst *tex = bld.emit(SHADER_OPCODE_SAMPLER, dst, srcs, n_sources)->as_tex();
tex->sampler_opcode = (enum brw_sampler_opcode) instr->backend_flags;
tex->surface_bindless = surface_bindless;
tex->sampler_bindless = sampler_bindless;
tex->offset = header_bits;
tex->size_written = total_regs * grf_size;
tex->residency = instr->is_sparse;
tex->required_params = required_params;
tex->coord_components = instr->coord_components;
tex->grad_components = lod_components;
tex->fused_eu_disable = (instr->backend_flags & BRW_TEX_INSTR_FUSED_EU_DISABLE) != 0;
tex->gather_component = instr->component;
/* Wa_14012688258:
/* If the NIR instruction has an offset param but the sampler payload
* doesn't, we can put the offset into the header of the message.
*
* Don't trim zeros at the end of payload for sample operations
* in cube and cube arrays.
* The restriction though is that it should be a constant value.
*/
if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
intel_needs_workaround(devinfo, 14012688258)) {
if ((src_idx = nir_tex_instr_src_index(instr, nir_tex_src_offset)) != -1 &&
!has_offset_in_payload) {
assert(nir_src_is_const(instr->src[src_idx].src));
/* Compiler should send U,V,R parameters even if V,R are 0. */
if (srcs[TEX_LOGICAL_SRC_COORDINATE].file != BAD_FILE)
assert(instr->coord_components >= 3u);
/* See opt_zero_samples(). */
tex->keep_payload_trailing_zeros = true;
const unsigned num_components = nir_tex_instr_src_size(instr, src_idx);
for (unsigned i = 0; i < num_components; i++) {
int offset = nir_src_comp_as_int(instr->src[src_idx].src, i);
tex->const_offsets[i] = offset;
}
tex->has_const_offsets = true;
}
/* With half-floats returns, the stride into a GRF allocation for each

View file

@ -502,29 +502,8 @@ brw_inst::components_read(unsigned i) const
else
return 1;
case SHADER_OPCODE_SAMPLER: {
const brw_tex_inst *tex = as_tex();
/* Texture coordinates. */
if (i == TEX_LOGICAL_SRC_COORDINATE)
return tex->coord_components;
/* Texture derivatives. */
else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
tex->sampler_opcode == SAMPLER_OPCODE_TXD_LOGICAL)
return tex->grad_components;
/* Texture offset. */
else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
return 2;
/* MCS */
else if (i == TEX_LOGICAL_SRC_MCS) {
if (tex->sampler_opcode == SAMPLER_OPCODE_TXF_CMS_W_LOGICAL)
return 2;
else if (tex->sampler_opcode == SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
return 4;
else
return 1;
} else
return 1;
}
case SHADER_OPCODE_SAMPLER:
return 1;
case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
if (i == MEMORY_LOGICAL_DATA0)

View file

@ -27,6 +27,7 @@
#include <assert.h>
#include "brw_reg.h"
#include "compiler/brw_list.h"
#include "brw_sampler.h"
#define MAX_SAMPLER_MESSAGE_SIZE 11
@ -202,7 +203,6 @@ struct brw_inst : brw_exec_node {
*/
bool predicate_trivial:1;
bool eot:1;
bool keep_payload_trailing_zeros:1;
/**
* Whether the parameters of the SEND instructions are build with
* NoMask (for A32 messages this covers only the surface handle, for
@ -218,7 +218,7 @@ struct brw_inst : brw_exec_node {
*/
bool fused_eu_disable:1;
uint8_t pad:4;
uint8_t pad:5;
};
uint16_t bits;
};
@ -285,10 +285,7 @@ struct brw_send_inst : brw_inst {
};
struct brw_tex_inst : brw_inst {
enum sampler_opcode sampler_opcode;
uint32_t offset;
uint8_t coord_components;
uint8_t grad_components;
enum brw_sampler_opcode sampler_opcode;
union {
struct {
/**
@ -308,9 +305,31 @@ struct brw_tex_inst : brw_inst {
* Whether the sampler handle is bindless
*/
bool sampler_bindless:1;
/**
* Whether const_offsets holds meaningful values
*/
bool has_const_offsets:1;
/**
* Coord components
*/
uint8_t coord_components:2;
/**
* Gather component
*/
uint8_t gather_component:2;
/**
* Bitfields payload parameters that cannot be optimized by
* brw_opt_zero_samples()
*/
uint16_t required_params:13;
};
uint8_t bits;
uint32_t bits;
};
/**
* Constant offsets
*/
int8_t const_offsets[3];
};
struct brw_mem_inst : brw_inst {

View file

@ -28,6 +28,7 @@
#include "brw_eu.h"
#include "brw_shader.h"
#include "brw_builder.h"
#include "brw_sampler.h"
#include "util/bitpack_helpers.h"
@ -604,92 +605,6 @@ is_high_sampler(const struct intel_device_info *devinfo, const brw_reg &sampler)
return sampler.file != IMM || sampler.ud >= 16;
}
/**
 * Translate a logical sampler opcode into the HW sampler message type
 * value placed in the send descriptor.
 *
 * @param devinfo        device info; selects generation-specific message
 *                       types (e.g. the Xe2 gather and MLOD variants)
 * @param opcode         logical sampler opcode being lowered
 * @param shadow_compare whether a shadow comparator is present (selects
 *                       the *_COMPARE / *_C message variants)
 * @param lod_is_zero    whether the LOD parameter is a known zero (selects
 *                       the *_LZ "level zero" message variants)
 * @param has_min_lod    whether a min-LOD parameter is present; only the
 *                       plain sample message on Xe2+ supports it, every
 *                       other path asserts it is absent
 */
static unsigned
sampler_msg_type(const intel_device_info *devinfo,
sampler_opcode opcode, bool shadow_compare,
bool lod_is_zero, bool has_min_lod)
{
switch (opcode) {
case SAMPLER_OPCODE_TEX_LOGICAL:
/* Xe2+ has a dedicated message carrying the min-LOD parameter. */
if (devinfo->ver >= 20 && has_min_lod) {
return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD :
XE2_SAMPLER_MESSAGE_SAMPLE_MLOD;
} else {
return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
GFX5_SAMPLER_MESSAGE_SAMPLE;
}
case SAMPLER_OPCODE_TXB_LOGICAL:
return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
GFX5_SAMPLER_MESSAGE_SAMPLE_BIAS;
case SAMPLER_OPCODE_TXL_LOGICAL:
assert(!has_min_lod);
/* A known-zero LOD lets us use the more compact LZ variant. */
if (lod_is_zero) {
return shadow_compare ? GFX9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
GFX9_SAMPLER_MESSAGE_SAMPLE_LZ;
}
return shadow_compare ? GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
GFX5_SAMPLER_MESSAGE_SAMPLE_LOD;
case SAMPLER_OPCODE_TXS_LOGICAL:
case SAMPLER_OPCODE_IMAGE_SIZE_LOGICAL:
/* Both size queries map onto the same resinfo message. */
assert(!has_min_lod);
return GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
case SAMPLER_OPCODE_TXD_LOGICAL:
return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
case SAMPLER_OPCODE_TXF_LOGICAL:
assert(!has_min_lod);
return lod_is_zero ? GFX9_SAMPLER_MESSAGE_SAMPLE_LD_LZ :
GFX5_SAMPLER_MESSAGE_SAMPLE_LD;
case SAMPLER_OPCODE_TXF_CMS_W_LOGICAL:
case SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
assert(!has_min_lod);
return GFX9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
case SAMPLER_OPCODE_TXF_MCS_LOGICAL:
assert(!has_min_lod);
return GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
case SAMPLER_OPCODE_LOD_LOGICAL:
assert(!has_min_lod);
return GFX5_SAMPLER_MESSAGE_LOD;
case SAMPLER_OPCODE_TG4_LOGICAL:
assert(!has_min_lod);
return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
break;
case SAMPLER_OPCODE_TG4_OFFSET_LOGICAL:
assert(!has_min_lod);
return shadow_compare ? GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
/* The gather4 variants below (with LOD, bias, implicit LOD, or packed
 * offset+LOD/bias) only exist on Xe2+ (ver >= 20), hence the asserts.
 */
case SAMPLER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
assert(!has_min_lod);
assert(devinfo->ver >= 20);
return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L_C:
XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_L;
case SAMPLER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
assert(!has_min_lod);
assert(devinfo->ver >= 20);
return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_B;
case SAMPLER_OPCODE_TG4_BIAS_LOGICAL:
assert(!has_min_lod);
assert(devinfo->ver >= 20);
return XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_B;
case SAMPLER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
assert(!has_min_lod);
assert(devinfo->ver >= 20);
return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L_C :
XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_L;
case SAMPLER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
assert(!has_min_lod);
assert(devinfo->ver >= 20);
return shadow_compare ? XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I_C :
XE2_SAMPLER_MESSAGE_SAMPLE_GATHER4_I;
case SAMPLER_OPCODE_SAMPLEINFO_LOGICAL:
assert(!has_min_lod);
return GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
default:
UNREACHABLE("not reached");
}
}
/**
* Emit a LOAD_PAYLOAD instruction while ensuring the sources are aligned to
* the given requested_alignment_sz.
@ -733,20 +648,28 @@ emit_load_payload_with_padding(const brw_builder &bld, const brw_reg &dst,
}
static bool
shader_opcode_needs_header(sampler_opcode op,
const struct intel_device_info *devinfo)
sampler_op_needs_header(enum brw_sampler_opcode op,
const struct intel_device_info *devinfo)
{
switch (op) {
case SAMPLER_OPCODE_TG4_LOGICAL:
case SAMPLER_OPCODE_TG4_OFFSET_LOGICAL:
case SAMPLER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
case SAMPLER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
case SAMPLER_OPCODE_TG4_BIAS_LOGICAL:
case SAMPLER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
case SAMPLER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
case SAMPLER_OPCODE_SAMPLEINFO_LOGICAL:
case BRW_SAMPLER_OPCODE_GATHER4:
case BRW_SAMPLER_OPCODE_GATHER4_B:
case BRW_SAMPLER_OPCODE_GATHER4_C:
case BRW_SAMPLER_OPCODE_GATHER4_I:
case BRW_SAMPLER_OPCODE_GATHER4_I_C:
case BRW_SAMPLER_OPCODE_GATHER4_L:
case BRW_SAMPLER_OPCODE_GATHER4_L_C:
case BRW_SAMPLER_OPCODE_GATHER4_PO:
case BRW_SAMPLER_OPCODE_GATHER4_PO_PACKED:
case BRW_SAMPLER_OPCODE_GATHER4_PO_B:
case BRW_SAMPLER_OPCODE_GATHER4_PO_C:
case BRW_SAMPLER_OPCODE_GATHER4_PO_C_PACKED:
case BRW_SAMPLER_OPCODE_GATHER4_PO_L:
case BRW_SAMPLER_OPCODE_GATHER4_PO_L_C:
case BRW_SAMPLER_OPCODE_SAMPLEINFO:
return true;
case SAMPLER_OPCODE_TXF_LOGICAL:
case BRW_SAMPLER_OPCODE_LD:
case BRW_SAMPLER_OPCODE_LD_LZ:
/* Xe3 HW does not seem to work unless we force a header. */
return devinfo->ver >= 30;
default:
@ -757,13 +680,13 @@ shader_opcode_needs_header(sampler_opcode op,
}
static bool
sampler_opcode_uses_sampler_state(sampler_opcode op)
sampler_opcode_uses_sampler_state(enum brw_sampler_opcode op)
{
switch (op) {
case SAMPLER_OPCODE_SAMPLEINFO_LOGICAL:
case SAMPLER_OPCODE_TXF_LOGICAL:
case SAMPLER_OPCODE_TXS_LOGICAL:
case SAMPLER_OPCODE_IMAGE_SIZE_LOGICAL:
case BRW_SAMPLER_OPCODE_RESINFO:
case BRW_SAMPLER_OPCODE_SAMPLEINFO:
case BRW_SAMPLER_OPCODE_LD:
case BRW_SAMPLER_OPCODE_LD_LZ:
return false;
default:
@ -777,12 +700,12 @@ get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
{
assert(inst);
const brw_reg *src = inst->src;
unsigned src_type_size = 0;
unsigned src_type_size = 4; /* SAMPLEINFO has no payload source */
/* All sources need to have the same size, therefore seek the first valid
* and take the size from there.
*/
for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
for (unsigned i = TEX_LOGICAL_SRC_PAYLOAD0; i < inst->sources; i++) {
if (src[i].file != BAD_FILE) {
src_type_size = brw_type_size_bytes(src[i].type);
break;
@ -797,15 +720,9 @@ get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
* which is already in 16-bits unlike the other parameters that need forced
* conversion.
*/
if (inst->sampler_opcode != SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) {
for (unsigned i = 0; i < TEX_LOGICAL_NUM_SRCS; i++) {
/* surface/sampler don't go in the payload */
if (i == TEX_LOGICAL_SRC_SURFACE ||
i == TEX_LOGICAL_SRC_SAMPLER)
continue;
assert(src[i].file == BAD_FILE ||
brw_type_size_bytes(src[i].type) == src_type_size);
}
for (unsigned i = TEX_LOGICAL_SRC_PAYLOAD0; i < inst->sources; i++) {
assert(src[i].file == BAD_FILE ||
brw_type_size_bytes(src[i].type) == src_type_size);
}
#endif
@ -820,8 +737,8 @@ get_sampler_msg_payload_type_bit_size(const intel_device_info *devinfo,
* ld_mcs SIMD8H and SIMD16H Only
* ld2dms REMOVEDBY(GEN:HAS:1406788836)
*/
if (inst->sampler_opcode == SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL ||
inst->sampler_opcode == SAMPLER_OPCODE_TXF_MCS_LOGICAL)
if (inst->sampler_opcode == BRW_SAMPLER_OPCODE_LD2DMS_W_GFX125 ||
inst->sampler_opcode == BRW_SAMPLER_OPCODE_LD_MCS)
src_type_size = 2;
return src_type_size * 8;
@ -833,16 +750,11 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
const intel_device_info *devinfo = bld.shader->devinfo;
const brw_compiler *compiler = bld.shader->compiler;
const brw_reg coordinate = tex->src[TEX_LOGICAL_SRC_COORDINATE];
const brw_reg shadow_c = tex->src[TEX_LOGICAL_SRC_SHADOW_C];
const brw_reg lod = tex->src[TEX_LOGICAL_SRC_LOD];
const brw_reg lod2 = tex->src[TEX_LOGICAL_SRC_LOD2];
const brw_reg min_lod = tex->src[TEX_LOGICAL_SRC_MIN_LOD];
const brw_reg sample_index = tex->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
const brw_reg mcs = tex->src[TEX_LOGICAL_SRC_MCS];
const enum brw_sampler_opcode op = tex->sampler_opcode;
const bool surface_bindless = tex->surface_bindless;
const bool sampler_bindless = tex->sampler_bindless;
const brw_reg surface = tex->src[TEX_LOGICAL_SRC_SURFACE];
const brw_reg sampler = tex->src[TEX_LOGICAL_SRC_SAMPLER];
const brw_reg tg4_offset = tex->src[TEX_LOGICAL_SRC_TG4_OFFSET];
const unsigned payload_type_bit_size =
get_sampler_msg_payload_type_bit_size(devinfo, tex);
@ -853,23 +765,22 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
/* We never generate EOT sampler messages */
assert(!tex->eot);
const bool surface_bindless = tex->surface_bindless;
const bool sampler_bindless = tex->sampler_bindless;
const enum brw_reg_type payload_type =
brw_type_with_size(BRW_TYPE_F, payload_type_bit_size);
const enum brw_reg_type payload_unsigned_type =
brw_type_with_size(BRW_TYPE_UD, payload_type_bit_size);
const enum brw_reg_type payload_signed_type =
brw_type_with_size(BRW_TYPE_D, payload_type_bit_size);
unsigned header_size = 0, length = 0;
sampler_opcode op = tex->sampler_opcode;
brw_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE];
for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
sources[i] = bld.vgrf(payload_type);
if (shader_opcode_needs_header(op, devinfo) || tex->offset != 0 ||
sampler_bindless || is_high_sampler(devinfo, sampler) ||
tex->residency) {
const bool needs_header =
sampler_op_needs_header(op, devinfo) ||
tex->has_const_offsets ||
sampler_bindless || is_high_sampler(devinfo, sampler) ||
tex->residency;
unsigned header_size = needs_header ? reg_unit(devinfo) : 0, length = 0;
brw_reg sources[1 + MAX_SAMPLER_MESSAGE_SIZE];
for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
sources[i] = bld.vgrf((i == 0 && needs_header) ? BRW_TYPE_UD : payload_type);
if (needs_header) {
/* For general texture offsets (no txf workaround), we need a header to
* put them in.
*
@ -882,6 +793,12 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
for (header_size = 0; header_size < reg_unit(devinfo); header_size++)
sources[length++] = byte_offset(header, REG_SIZE * header_size);
uint32_t g0_2 = 0;
if (tex->gather_component)
g0_2 |= tex->gather_component << 16;
if (tex->residency)
g0_2 |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */
/* If we're requesting fewer than four channels worth of response,
* and we have an explicit header, we need to set up the sampler
* writemask. It's reversed from normal: 1 means "don't write".
@ -895,11 +812,14 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
if (comps_regs < 4 * comp_regs) {
assert(comps_regs % comp_regs == 0);
unsigned mask = ~((1 << (comps_regs / comp_regs)) - 1) & 0xf;
tex->offset |= mask << 12;
g0_2 |= mask << 12;
}
if (tex->residency)
tex->offset |= 1 << 23; /* g0.2 bit23 : Pixel Null Mask Enable */
if (tex->has_const_offsets) {
g0_2 |= ((tex->const_offsets[2] & 0xf) << 0) |
((tex->const_offsets[1] & 0xf) << 4) |
((tex->const_offsets[0] & 0xf) << 8);
}
/* Build the actual header */
const brw_builder ubld = bld.exec_all().group(8 * reg_unit(devinfo), 0);
@ -908,8 +828,9 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
ubld.MOV(header, brw_imm_ud(0));
else
ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_TYPE_UD));
if (tex->offset) {
ubld1.MOV(component(header, 2), brw_imm_ud(tex->offset));
if (g0_2) {
ubld1.MOV(component(header, 2), brw_imm_ud(g0_2));
} else if (devinfo->ver < 11 &&
bld.shader->stage != MESA_SHADER_VERTEX &&
bld.shader->stage != MESA_SHADER_FRAGMENT) {
@ -976,218 +897,14 @@ lower_sampler_logical_send(const brw_builder &bld, brw_tex_inst *tex)
}
}
const bool lod_is_zero = lod.is_zero();
const unsigned msg_type = brw_get_sampler_hw_opcode(op);
/* On Xe2 and newer platforms, min_lod is the first parameter specifically
* so that a bunch of other, possibly unused, parameters don't need to also
* be included.
*/
const unsigned msg_type =
sampler_msg_type(devinfo, op, shadow_c.file != BAD_FILE, lod_is_zero,
min_lod.file != BAD_FILE);
const bool min_lod_is_first = devinfo->ver >= 20 &&
(msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_MLOD ||
msg_type == XE2_SAMPLER_MESSAGE_SAMPLE_COMPARE_MLOD);
if (min_lod_is_first) {
assert(min_lod.file != BAD_FILE);
bld.MOV(sources[length++], min_lod);
}
if (shadow_c.file != BAD_FILE) {
bld.MOV(sources[length], shadow_c);
length++;
}
bool coordinate_done = false;
/* Set up the LOD info */
switch (op) {
case SAMPLER_OPCODE_TXL_LOGICAL:
if (lod_is_zero)
break;
FALLTHROUGH;
case SAMPLER_OPCODE_TXB_LOGICAL:
case SAMPLER_OPCODE_TG4_BIAS_LOGICAL:
case SAMPLER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
case SAMPLER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
case SAMPLER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
bld.MOV(sources[length], lod);
length++;
break;
case SAMPLER_OPCODE_TXD_LOGICAL:
/* TXD should have been lowered in SIMD16 mode (in SIMD32 mode in
* Xe2+).
*/
assert(bld.dispatch_width() == (8 * reg_unit(devinfo)));
/* Load dPdx and the coordinate together:
* [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
*/
for (unsigned i = 0; i < tex->coord_components; i++) {
bld.MOV(sources[length++], offset(coordinate, bld, i));
/* For cube map array, the coordinate is (u,v,r,ai) but there are
* only derivatives for (u, v, r).
*/
if (i < tex->grad_components) {
bld.MOV(sources[length++], offset(lod, bld, i));
bld.MOV(sources[length++], offset(lod2, bld, i));
}
}
coordinate_done = true;
break;
case SAMPLER_OPCODE_TXS_LOGICAL:
sources[length] = retype(sources[length], payload_unsigned_type);
bld.MOV(sources[length++], lod);
break;
case SAMPLER_OPCODE_IMAGE_SIZE_LOGICAL:
/* We need an LOD; just use 0 */
sources[length] = retype(sources[length], payload_unsigned_type);
bld.MOV(sources[length++], brw_imm_ud(0));
break;
case SAMPLER_OPCODE_TXF_LOGICAL:
/* On Gfx9 the parameters are intermixed they are u, v, lod, r. */
sources[length] = retype(sources[length], payload_signed_type);
bld.MOV(sources[length++], offset(coordinate, bld, 0));
if (tex->coord_components >= 2) {
sources[length] = retype(sources[length], payload_signed_type);
bld.MOV(sources[length], offset(coordinate, bld, 1));
} else {
sources[length] = brw_imm_d(0);
}
length++;
if (!lod_is_zero) {
sources[length] = retype(sources[length], payload_signed_type);
bld.MOV(sources[length++], lod);
}
for (unsigned i = 2; i < tex->coord_components; i++) {
sources[length] = retype(sources[length], payload_signed_type);
bld.MOV(sources[length++], offset(coordinate, bld, i));
}
coordinate_done = true;
break;
case SAMPLER_OPCODE_TXF_CMS_W_LOGICAL:
case SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
sources[length] = retype(sources[length], payload_unsigned_type);
bld.MOV(sources[length++], sample_index);
/* Data from the multisample control surface. */
for (unsigned i = 0; i < 2; ++i) {
/* Sampler always writes 4/8 register worth of data but for ld_mcs
* only valid data is in first two register. So with 16-bit
* payload, we need to split 2-32bit register into 4-16-bit
* payload.
*
* From the Gfx12HP BSpec: Render Engine - 3D and GPGPU Programs -
* Shared Functions - 3D Sampler - Messages - Message Format:
*
* ld2dms_w si mcs0 mcs1 mcs2 mcs3 u v r
*/
if (op == SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL) {
sources[length] = retype(sources[length], payload_unsigned_type);
bld.MOV(sources[length++],
mcs.file == IMM ? mcs : offset(mcs, bld, 2 * i + 0));
sources[length] = retype(sources[length], payload_unsigned_type);
bld.MOV(sources[length++],
mcs.file == IMM ? mcs : offset(mcs, bld, 2 * i + 1));
} else {
sources[length] = retype(sources[length], payload_unsigned_type);
bld.MOV(sources[length++],
mcs.file == IMM ? mcs : offset(mcs, bld, i));
}
}
FALLTHROUGH;
case SAMPLER_OPCODE_TXF_MCS_LOGICAL:
/* There is no offsetting for this message; just copy in the integer
* texture coordinates.
*/
for (unsigned i = 0; i < tex->coord_components; i++) {
sources[length] = retype(sources[length], payload_signed_type);
bld.MOV(sources[length++], offset(coordinate, bld, i));
}
coordinate_done = true;
break;
case SAMPLER_OPCODE_TG4_OFFSET_LOGICAL:
/* More crazy intermixing */
for (unsigned i = 0; i < 2; i++) /* u, v */
bld.MOV(sources[length++], offset(coordinate, bld, i));
for (unsigned i = 0; i < 2; i++) { /* offu, offv */
sources[length] = retype(sources[length], payload_signed_type);
bld.MOV(sources[length++], offset(tg4_offset, bld, i));
}
if (tex->coord_components == 3) /* r if present */
bld.MOV(sources[length++], offset(coordinate, bld, 2));
coordinate_done = true;
break;
default:
break;
}
/* Set up the coordinate (except for cases where it was done above) */
if (!coordinate_done) {
for (unsigned i = 0; i < tex->coord_components; i++)
bld.MOV(retype(sources[length++], payload_type),
offset(coordinate, bld, i));
}
if (min_lod.file != BAD_FILE && !min_lod_is_first) {
/* Account for all of the missing coordinate sources */
if (op == SAMPLER_OPCODE_TXB_LOGICAL && devinfo->ver >= 20) {
/* Bspec 64985:
*
* For sample_b sampler message format:
*
* SIMD16H/SIMD32H
* Param Number 0 1 2 3 4 5
* Param BIAS U V R Ai MLOD
*
* SIMD16/SIMD32
* Param Number 0 1 2 3 4
* Param BIAS_AI U V R MLOD
*/
length += 3 - tex->coord_components;
} else if (op == SAMPLER_OPCODE_TXD_LOGICAL && devinfo->verx10 >= 125) {
/* On DG2 and newer platforms, sample_d can only be used with 1D and
* 2D surfaces, so the maximum number of gradient components is 2.
* In spite of this limitation, the Bspec lists a mysterious R
* component before the min_lod, so the maximum coordinate components
* is 3.
*
* See bspec 45942, "Enable new message layout for cube array"
*/
length += 3 - tex->coord_components;
length += (2 - tex->grad_components) * 2;
} else {
length += 4 - tex->coord_components;
if (op == SAMPLER_OPCODE_TXD_LOGICAL)
length += (3 - tex->grad_components) * 2;
}
bld.MOV(sources[length++], min_lod);
/* Wa_14014595444: Populate MLOD as parameter 5 (twice). */
if (intel_needs_workaround(devinfo, 14014595444) &&
op == SAMPLER_OPCODE_TXB_LOGICAL && shadow_c.file == BAD_FILE)
bld.MOV(sources[length++], min_lod);
}
for (uint32_t i = TEX_LOGICAL_SRC_PAYLOAD0; i < tex->sources; i++)
bld.MOV(retype(sources[length++], payload_type), retype(tex->src[i], payload_type));
const brw_reg src_payload =
retype(brw_allocate_vgrf_units(*bld.shader, length * bld.dispatch_width() / 8),
BRW_TYPE_F);
BRW_TYPE_UD);
/* In case of 16-bit payload each component takes one full register in
* both SIMD8H and SIMD16H modes. In both cases one reg can hold 16
* elements. In SIMD8H case hardware simply expects the components to be

View file

@ -162,69 +162,35 @@ static unsigned
get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
const brw_tex_inst *tex)
{
/* On gfx12 parameters are fixed to 16-bit values and therefore they all
* always fit regardless of the execution size.
*/
if (tex->sampler_opcode == SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
return MIN2(16, tex->exec_size);
/* TXD is unsupported in SIMD16 mode previous to Xe2. SIMD32 is still
* unsuppported on Xe2.
*/
if (tex->sampler_opcode == SAMPLER_OPCODE_TXD_LOGICAL)
if (tex->sampler_opcode == BRW_SAMPLER_OPCODE_SAMPLE_D ||
tex->sampler_opcode == BRW_SAMPLER_OPCODE_SAMPLE_D_REDUCED ||
tex->sampler_opcode == BRW_SAMPLER_OPCODE_SAMPLE_D_C ||
tex->sampler_opcode == BRW_SAMPLER_OPCODE_SAMPLE_D_C_PACKED)
return devinfo->ver < 20 ? 8 : 16;
/* If we have a min_lod parameter on anything other than a simple sample
* message, it will push it over 5 arguments and we have to fall back to
* SIMD8.
*/
if (tex->sampler_opcode != SAMPLER_OPCODE_TEX_LOGICAL &&
tex->components_read(TEX_LOGICAL_SRC_MIN_LOD))
return devinfo->ver < 20 ? 8 : 16;
const unsigned max_payload_size =
MAX_SAMPLER_MESSAGE_SIZE *
(reg_unit(devinfo) * 8) /* min SIMD */ *
4 /* dword */;
const unsigned payload_param_size =
brw_type_size_bytes(tex->src[TEX_LOGICAL_SRC_PAYLOAD0].type);
unsigned payload_size =
(tex->sources - TEX_LOGICAL_SRC_PAYLOAD0) *
tex->exec_size *
payload_param_size;
/* On Gfx9+ the LOD argument is for free if we're able to use the LZ
* variant of the TXL or TXF message.
*/
const bool implicit_lod = (tex->sampler_opcode == SAMPLER_OPCODE_TXL_LOGICAL ||
tex->sampler_opcode == SAMPLER_OPCODE_TXF_LOGICAL) &&
tex->src[TEX_LOGICAL_SRC_LOD].is_zero();
/* Calculate the total number of argument components that need to be passed
* to the sampler unit.
*/
unsigned num_payload_components =
tex->coord_components +
tex->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
(implicit_lod ? 0 : tex->components_read(TEX_LOGICAL_SRC_LOD)) +
tex->components_read(TEX_LOGICAL_SRC_LOD2) +
tex->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
(tex->sampler_opcode == SAMPLER_OPCODE_TG4_OFFSET_LOGICAL ?
tex->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
tex->components_read(TEX_LOGICAL_SRC_MCS) +
tex->components_read(TEX_LOGICAL_SRC_MIN_LOD);
if (tex->sampler_opcode == SAMPLER_OPCODE_TXB_LOGICAL && devinfo->ver >= 20) {
num_payload_components += 3 - tex->coord_components;
} else if (tex->sampler_opcode == SAMPLER_OPCODE_TXD_LOGICAL &&
devinfo->verx10 >= 125 && devinfo->ver < 20) {
num_payload_components +=
3 - tex->coord_components + (2 - tex->grad_components) * 2;
} else {
num_payload_components += 4 - tex->coord_components;
if (tex->sampler_opcode == SAMPLER_OPCODE_TXD_LOGICAL)
num_payload_components += (3 - tex->grad_components) * 2;
unsigned simd_width = tex->exec_size;
while (payload_size > max_payload_size) {
payload_size /= 2;
simd_width /= 2;
}
const unsigned max_hw_simd = devinfo->ver < 20 ? 16 : 32;
const unsigned simd_limit = reg_unit(devinfo) *
(num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
/* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the
* maximum message size supported by the sampler, regardless of whether a
* header is provided or not.
*/
return MIN2(tex->exec_size, simd_limit);
return MIN2(simd_width, max_hw_simd);
}
static bool

View file

@ -141,11 +141,30 @@ pack_lod_and_array_index(nir_builder *b, nir_tex_instr *tex)
return true;
}
static nir_def *
build_packed_offset(nir_builder *b,
nir_def *offset,
unsigned offset_bits,
unsigned offset_count)
{
offset = nir_iand_imm(b, offset, BITFIELD_MASK(offset_bits));
nir_def *offuvr = nir_channel(b, offset, 0);
for (unsigned i = 1; i < MIN2(offset->num_components, offset_count); i++) {
nir_def *chan = nir_channel(b, offset, i);
offuvr = nir_ior(b, offuvr, nir_ishl_imm(b, chan, i * offset_bits));
}
return offuvr;
}
/**
* Pack either the explicit LOD/Bias and the offset together.
*/
static bool
pack_lod_or_bias_and_offset(nir_builder *b, nir_tex_instr *tex)
pack_lod_or_bias_and_offset(nir_builder *b, nir_tex_instr *tex,
unsigned offset_bits,
unsigned offset_count)
{
int offset_index = nir_tex_instr_src_index(tex, nir_tex_src_offset);
if (offset_index < 0)
@ -175,7 +194,6 @@ pack_lod_or_bias_and_offset(nir_builder *b, nir_tex_instr *tex)
}
nir_def *lod = tex->src[lod_index].src.ssa;
nir_def *offset = tex->src[offset_index].src.ssa;
b->cursor = nir_before_instr(&tex->instr);
@ -192,16 +210,20 @@ pack_lod_or_bias_and_offset(nir_builder *b, nir_tex_instr *tex)
* ------------------------------------------
* |OffsetUV | LOD/Bias | OffsetV | OffsetU |
* ------------------------------------------
*
* Or
* ---------------------------------------------------
* |Bits | [31:12] | [11:9] | [8:5] | [4:0] |
* ----------------------------------------------------
* |OffsetUV | LOD/Bias | OffsetR | OffsetV | OffsetU |
* ----------------------------------------------------
*/
nir_def *offu = nir_iand_imm(b, nir_channel(b, offset, 0), 0x3F);
nir_def *offv = nir_iand_imm(b, nir_channel(b, offset, 1), 0x3F);
nir_def *offuvr = build_packed_offset(
b, tex->src[offset_index].src.ssa, offset_bits, offset_count);
nir_def *offsetUV = nir_ior(b, offu, nir_ishl_imm(b, offv, 6));
nir_def *lod_offsetUV = nir_ior(b, offsetUV,
nir_iand_imm(b, lod, 0xFFFFF000));
nir_def *packed = nir_ior(b, offuvr, nir_iand_imm(b, lod, 0xFFFFF000));
nir_tex_instr_remove_src(tex, offset_index);
nir_tex_instr_add_src(tex, nir_tex_src_backend2, lod_offsetUV);
nir_tex_instr_add_src(tex, nir_tex_src_backend1, packed);
return true;
}
@ -219,9 +241,15 @@ brw_nir_lower_texture_instr(nir_builder *b, nir_tex_instr *tex, void *cb_data)
if (brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_BIAS_OFFUV6) != -1 ||
brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUV6) != -1)
return pack_lod_or_bias_and_offset(b, tex);
brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUV6) != -1)
return pack_lod_or_bias_and_offset(b, tex, 6, 2);
if (brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_BIAS_OFFUVR4) != -1 ||
brw_sampler_opcode_param_index(sampler_opcode,
BRW_SAMPLER_PAYLOAD_PARAM_LOD_OFFUVR4) != -1)
return pack_lod_or_bias_and_offset(b, tex, 4, 3);
return false;
}
@ -321,7 +349,7 @@ brw_nir_lower_mcs_fetch_instr(nir_builder *b, nir_tex_instr *tex, void *cb_data)
break;
default:
continue;
break;
}
}

View file

@ -110,6 +110,12 @@ brw_optimize(brw_shader &s)
OPT(brw_lower_simd_width);
OPT(brw_lower_scalar_fp64_MAD);
OPT(brw_lower_barycentrics);
/* Identify trailing zeros LOAD_PAYLOAD of sampler messages. Do this before
* lowering the send messages.
*/
OPT(brw_opt_zero_samples);
OPT(brw_lower_logical_sends);
brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_EARLY_LOWERING);
@ -119,15 +125,6 @@ brw_optimize(brw_shader &s)
if (!OPT(brw_opt_copy_propagation_defs))
OPT(brw_opt_copy_propagation);
/* Identify trailing zeros LOAD_PAYLOAD of sampler messages.
* Do this before splitting SENDs.
*/
if (OPT(brw_opt_zero_samples)) {
if (!OPT(brw_opt_copy_propagation_defs)) {
OPT(brw_opt_copy_propagation);
}
}
if (s.devinfo->ver >= 30)
OPT(brw_opt_send_to_send_gather);
@ -264,56 +261,21 @@ brw_opt_zero_samples(brw_shader &s)
bool progress = false;
foreach_block_and_inst(block, brw_inst, inst, s.cfg) {
if (inst->opcode != SHADER_OPCODE_SEND)
brw_tex_inst *tex = inst->as_tex();
if (tex == NULL)
continue;
brw_send_inst *send = inst->as_send();
if (send->sfid != BRW_SFID_SAMPLER)
continue;
int last_req_param = util_last_bit(tex->required_params) - 1;
assert(last_req_param <= (tex->sources - TEX_LOGICAL_SRC_PAYLOAD0));
/* Wa_14012688258:
*
* Don't trim zeros at the end of payload for sample operations
* in cube and cube arrays.
*/
if (send->keep_payload_trailing_zeros)
continue;
int last_param = tex->sources - 1 - TEX_LOGICAL_SRC_PAYLOAD0;
/* This pass works on SENDs before splitting. */
if (send->ex_mlen > 0)
continue;
brw_inst *prev = (brw_inst *) send->prev;
if (prev->is_head_sentinel() || prev->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
continue;
brw_load_payload_inst *lp = prev->as_load_payload();
/* How much of the payload are actually read by this SEND. */
const unsigned params =
load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
/* We don't want to remove the message header or the first parameter.
* Removing the first parameter is not allowed, see the Haswell PRM
* volume 7, page 149:
*
* "Parameter 0 is required except for the sampleinfo message, which
* has no parameter 0"
*/
const unsigned first_param_idx = lp->header_size;
unsigned zero_size = 0;
for (unsigned i = params - 1; i > first_param_idx; i--) {
if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
for (int i = last_param; i > last_req_param; i--) {
if (tex->src[TEX_LOGICAL_SRC_PAYLOAD0 + i].file != IMM ||
tex->src[TEX_LOGICAL_SRC_PAYLOAD0 + i].ud != 0)
break;
zero_size += lp->exec_size * brw_type_size_bytes(lp->src[i].type) * lp->dst.stride;
}
/* Round down to ensure to only consider full registers. */
const unsigned zero_len = ROUND_DOWN_TO(zero_size / REG_SIZE, reg_unit(s.devinfo));
if (zero_len > 0) {
/* Note mlen is in REG_SIZE units. */
send->mlen -= zero_len;
tex->sources = TEX_LOGICAL_SRC_PAYLOAD0 + i;
progress = true;
}
}

View file

@ -239,12 +239,16 @@ static bool
tex_inst_match(brw_tex_inst *a, brw_tex_inst *b)
{
return a->sampler_opcode == b->sampler_opcode &&
a->offset == b->offset &&
a->surface_bindless == b->surface_bindless &&
a->sampler_bindless == b->sampler_bindless &&
a->residency == b->residency &&
a->required_params == b->required_params &&
a->coord_components == b->coord_components &&
a->grad_components == b->grad_components &&
a->residency == b->residency;
a->gather_component == b->gather_component &&
a->has_const_offsets == b->has_const_offsets &&
a->const_offsets[0] == b->const_offsets[0] &&
a->const_offsets[1] == b->const_offsets[1] &&
a->const_offsets[2] == b->const_offsets[2];
}
static bool
@ -389,12 +393,13 @@ hash_inst(const void *v)
case BRW_KIND_TEX: {
const brw_tex_inst *tex = inst->as_tex();
const uint8_t tex_u8data[] = {
tex->coord_components,
tex->grad_components,
tex->bits,
tex->sampler_opcode,
(uint8_t)tex->const_offsets[0],
(uint8_t)tex->const_offsets[1],
(uint8_t)tex->const_offsets[2],
};
const uint32_t tex_u32data[] = {
tex->sampler_opcode,
tex->bits,
};
hash = HASH(hash, tex_u8data);
hash = HASH(hash, tex_u32data);

View file

@ -48,6 +48,87 @@ sources_match(ASSERTED const brw_def_analysis &defs,
return brw_regs_equal(&a->src[src], &b->src[src]);
}
static void
merge_instructions(brw_shader &s, brw_tex_inst **txfs, unsigned count)
{
const unsigned min_simd = 8 * reg_unit(s.devinfo);
const unsigned max_simd = 16 * reg_unit(s.devinfo);
const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo);
for (unsigned curr = 0; curr < count; curr += max_simd) {
const unsigned lanes = CLAMP(count - curr, min_simd, max_simd);
const unsigned width = util_next_power_of_two(lanes);
const brw_builder ubld =
brw_builder(&s).before(txfs[curr]).exec_all().group(width, 0);
const brw_builder ubld1 = ubld.group(1, 0);
enum brw_reg_type coord_type =
txfs[curr]->src[TEX_LOGICAL_SRC_PAYLOAD0].type;
brw_reg coord = ubld.vgrf(coord_type);
brw_reg coord_comps[32];
for (unsigned i = 0; i < width; i++) {
/* Our block size might be larger than the number of convergent
* loads we're combining. If so, repeat the last component.
*/
if (txfs[curr+i])
coord_comps[i] = txfs[curr+i]->src[TEX_LOGICAL_SRC_PAYLOAD0];
else
coord_comps[i] = coord_comps[i-1];
}
ubld1.VEC(coord, coord_comps, width);
brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
srcs[TEX_LOGICAL_SRC_SURFACE] = txfs[0]->src[TEX_LOGICAL_SRC_SURFACE];
srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(0);
srcs[TEX_LOGICAL_SRC_PAYLOAD0] = coord;
for (unsigned i = TEX_LOGICAL_SRC_PAYLOAD1; i < txfs[0]->sources; i++)
srcs[i] = txfs[0]->src[i];
/* Each of our txf may have a reduced response length if some
* components are never read. Use the maximum of the sizes.
*/
unsigned new_dest_comps = 0;
for (unsigned i = 0; i < width; i++) {
const unsigned this_comps = dest_comps_for_txf(s, txfs[curr+i]);
new_dest_comps = MAX2(new_dest_comps, this_comps);
}
/* Emit the new divergent TXF */
brw_reg div = ubld.vgrf(BRW_TYPE_UD, new_dest_comps);
brw_tex_inst *div_txf =
ubld.emit(SHADER_OPCODE_SAMPLER, div, srcs, txfs[0]->sources)->as_tex();
div_txf->surface_bindless = txfs[0]->surface_bindless;
div_txf->sampler_opcode = txfs[0]->sampler_opcode;
div_txf->residency = false;
/* Update it to also use response length reduction */
const unsigned per_component_regs =
DIV_ROUND_UP(brw_type_size_bytes(div.type) * div_txf->exec_size,
grf_size);
div_txf->size_written = new_dest_comps * per_component_regs * grf_size;
for (unsigned i = 0; i < width; i++) {
brw_inst *txf = txfs[curr+i];
if (!txf)
break;
const brw_builder ibld = brw_builder(txf);
/* Replace each of the original TXFs with MOVs from our new one */
const unsigned dest_comps = dest_comps_for_txf(s, txf);
assert(dest_comps <= 4);
brw_reg v[4];
for (unsigned c = 0; c < dest_comps; c++)
v[c] = component(offset(div, ubld, c), i);
ibld.VEC(retype(txf->dst, BRW_TYPE_UD), v, dest_comps);
txf->remove();
}
}
}
/**
* Look for a series of convergent texture buffer fetches within a basic
* block and combine them into a single divergent load with one lane for
@ -82,23 +163,22 @@ brw_opt_combine_convergent_txf(brw_shader &s)
{
const brw_def_analysis &defs = s.def_analysis.require();
const unsigned min_simd = 8 * reg_unit(s.devinfo);
const unsigned max_simd = 16 * reg_unit(s.devinfo);
const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo);
bool progress = false;
foreach_block(block, s.cfg) {
/* Gather a list of convergent TXFs to the same surface in this block */
brw_tex_inst *txfs[32] = {};
unsigned count = 0;
brw_tex_inst *txfs_ld[32] = {};
brw_tex_inst *txfs_ld_lz[32] = {};
unsigned ld_count = 0;
unsigned ld_lz_count = 0;
foreach_inst_in_block(brw_inst, inst, block) {
brw_tex_inst *tex = inst->as_tex();
if (tex == NULL)
continue;
if (tex->sampler_opcode != SAMPLER_OPCODE_TXF_LOGICAL)
if (tex->sampler_opcode != BRW_SAMPLER_OPCODE_LD &&
tex->sampler_opcode != BRW_SAMPLER_OPCODE_LD_LZ)
continue;
/* Only handle buffers or single miplevel 1D images for now */
@ -111,120 +191,48 @@ brw_opt_combine_convergent_txf(brw_shader &s)
if (tex->predicate || tex->force_writemask_all)
continue;
if (!is_uniform_def(defs, tex->src[TEX_LOGICAL_SRC_LOD]) ||
!is_uniform_def(defs, tex->src[TEX_LOGICAL_SRC_SURFACE]))
if (!is_uniform_def(defs, tex->src[TEX_LOGICAL_SRC_SURFACE]))
continue;
/* Only handle immediates for now: we could check is_uniform(),
* but we'd need to ensure the coordinate's definition reaches
* txfs[0] which is where we'll insert the combined coordinate.
*/
if (tex->src[TEX_LOGICAL_SRC_COORDINATE].file != IMM)
if (tex->src[TEX_LOGICAL_SRC_PAYLOAD0].file != IMM)
continue;
/* texelFetch from 1D buffers shouldn't have any of these */
assert(tex->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
assert(tex->src[TEX_LOGICAL_SRC_LOD2].file == BAD_FILE);
assert(tex->src[TEX_LOGICAL_SRC_MIN_LOD].file == BAD_FILE);
assert(tex->src[TEX_LOGICAL_SRC_SAMPLE_INDEX].file == BAD_FILE);
assert(tex->src[TEX_LOGICAL_SRC_MCS].file == BAD_FILE);
assert(tex->src[TEX_LOGICAL_SRC_TG4_OFFSET].file == BAD_FILE);
assert(tex->grad_components == 0);
brw_tex_inst *tex0 = tex->sampler_opcode == BRW_SAMPLER_OPCODE_LD ?
txfs_ld[0] : txfs_ld_lz[0];
if (count > 0 &&
(!sources_match(defs, tex, txfs[0], TEX_LOGICAL_SRC_LOD) ||
!sources_match(defs, tex, txfs[0], TEX_LOGICAL_SRC_SURFACE) ||
tex->surface_bindless != txfs[0]->surface_bindless ||
!sources_match(defs, tex, txfs[0], TEX_LOGICAL_SRC_SAMPLER) ||
tex->sampler_bindless != txfs[0]->sampler_bindless))
continue;
if (tex0 != NULL) {
if (!sources_match(defs, tex, tex0, TEX_LOGICAL_SRC_SURFACE) ||
tex->surface_bindless != tex0->surface_bindless)
continue;
txfs[count++] = tex;
if (tex->sampler_opcode == BRW_SAMPLER_OPCODE_LD) {
if (ld_count > 0 &&
!sources_match(defs, tex, tex0, TEX_LOGICAL_SRC_PAYLOAD2))
continue;
}
}
if (count == ARRAY_SIZE(txfs))
if (tex->sampler_opcode == BRW_SAMPLER_OPCODE_LD)
txfs_ld[ld_count++] = tex;
if (tex->sampler_opcode == BRW_SAMPLER_OPCODE_LD_LZ)
txfs_ld_lz[ld_lz_count++] = tex;
if (ld_count == ARRAY_SIZE(txfs_ld) ||
ld_lz_count == ARRAY_SIZE(txfs_ld_lz))
break;
}
/* Need at least two things to combine. */
if (count < 2)
continue;
/* Emit divergent TXFs and replace the original ones with MOVs */
for (unsigned curr = 0; curr < count; curr += max_simd) {
const unsigned lanes = CLAMP(count - curr, min_simd, max_simd);
const unsigned width = util_next_power_of_two(lanes);
const brw_builder ubld =
brw_builder(&s).before(txfs[curr]).exec_all().group(width, 0);
const brw_builder ubld1 = ubld.group(1, 0);
enum brw_reg_type coord_type =
txfs[curr]->src[TEX_LOGICAL_SRC_COORDINATE].type;
brw_reg coord = ubld.vgrf(coord_type);
brw_reg coord_comps[32];
for (unsigned i = 0; i < width; i++) {
/* Our block size might be larger than the number of convergent
* loads we're combining. If so, repeat the last component.
*/
if (txfs[curr+i])
coord_comps[i] = txfs[curr+i]->src[TEX_LOGICAL_SRC_COORDINATE];
else
coord_comps[i] = coord_comps[i-1];
}
ubld1.VEC(coord, coord_comps, width);
brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
srcs[TEX_LOGICAL_SRC_COORDINATE] = coord;
srcs[TEX_LOGICAL_SRC_LOD] = txfs[0]->src[TEX_LOGICAL_SRC_LOD];
srcs[TEX_LOGICAL_SRC_SURFACE] = txfs[0]->src[TEX_LOGICAL_SRC_SURFACE];
srcs[TEX_LOGICAL_SRC_SAMPLER] = txfs[0]->src[TEX_LOGICAL_SRC_SAMPLER];
/* Each of our txf may have a reduced response length if some
* components are never read. Use the maximum of the sizes.
*/
unsigned new_dest_comps = 0;
for (unsigned i = 0; i < width; i++) {
const unsigned this_comps = dest_comps_for_txf(s, txfs[curr+i]);
new_dest_comps = MAX2(new_dest_comps, this_comps);
}
/* Emit the new divergent TXF */
brw_reg div = ubld.vgrf(BRW_TYPE_UD, new_dest_comps);
brw_tex_inst *div_txf =
ubld.emit(SHADER_OPCODE_SAMPLER, div, srcs,
TEX_LOGICAL_NUM_SRCS)->as_tex();
div_txf->surface_bindless = txfs[0]->surface_bindless;
div_txf->sampler_bindless = txfs[0]->sampler_bindless;
div_txf->sampler_opcode = SAMPLER_OPCODE_TXF_LOGICAL;
div_txf->coord_components = 1;
div_txf->grad_components = 0;
div_txf->residency = false;
/* Update it to also use response length reduction */
const unsigned per_component_regs =
DIV_ROUND_UP(brw_type_size_bytes(div.type) * div_txf->exec_size,
grf_size);
div_txf->size_written = new_dest_comps * per_component_regs * grf_size;
for (unsigned i = 0; i < width; i++) {
brw_inst *txf = txfs[curr+i];
if (!txf)
break;
const brw_builder ibld = brw_builder(txf);
/* Replace each of the original TXFs with MOVs from our new one */
const unsigned dest_comps = dest_comps_for_txf(s, txf);
assert(dest_comps <= 4);
brw_reg v[4];
for (unsigned c = 0; c < dest_comps; c++)
v[c] = component(offset(div, ubld, c), i);
ibld.VEC(retype(txf->dst, BRW_TYPE_UD), v, dest_comps);
txf->remove();
}
if (ld_count >= 2) {
merge_instructions(s, txfs_ld, ld_count);
progress = true;
}
if (ld_lz_count >= 2) {
merge_instructions(s, txfs_ld_lz, ld_lz_count);
progress = true;
}
}

View file

@ -74,32 +74,6 @@ brw_print_instructions(const brw_shader &s, FILE *file)
}
}
static const char *
brw_sampler_opcode_name(sampler_opcode opcode) {
switch (opcode) {
case SAMPLER_OPCODE_TEX_LOGICAL: return "tex_logical";
case SAMPLER_OPCODE_TXD_LOGICAL: return "txd_logical";
case SAMPLER_OPCODE_TXF_LOGICAL: return "txf_logical";
case SAMPLER_OPCODE_TXL_LOGICAL: return "txl_logical";
case SAMPLER_OPCODE_TXS_LOGICAL: return "txs_logical";
case SAMPLER_OPCODE_TXB_LOGICAL: return "txb_logical";
case SAMPLER_OPCODE_TXF_CMS_W_LOGICAL: return "txf_cms_w_logical";
case SAMPLER_OPCODE_TXF_CMS_W_GFX12_LOGICAL: return "txf_cms_w_gfx12_logical";
case SAMPLER_OPCODE_TXF_MCS_LOGICAL: return "txf_mcs_logical";
case SAMPLER_OPCODE_LOD_LOGICAL: return "lod_logical";
case SAMPLER_OPCODE_TG4_LOGICAL: return "tg4_logical";
case SAMPLER_OPCODE_TG4_OFFSET_LOGICAL: return "tg4_offset_logical";
case SAMPLER_OPCODE_TG4_OFFSET_LOD_LOGICAL: return "tg4_offset_lod_logical";
case SAMPLER_OPCODE_TG4_OFFSET_BIAS_LOGICAL: return "tg4_offset_bias_logical";
case SAMPLER_OPCODE_TG4_BIAS_LOGICAL: return "tg4_b_logical";
case SAMPLER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL: return "tg4_l_logical";
case SAMPLER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL: return "tg4_i_logical";
case SAMPLER_OPCODE_SAMPLEINFO_LOGICAL: return "sampleinfo_logical";
case SAMPLER_OPCODE_IMAGE_SIZE_LOGICAL: return "image_size_logical";
default: UNREACHABLE("invalid sampler opcode");
}
}
static const char *
brw_instruction_name(const struct brw_isa_info *isa, const brw_inst *inst)
{
@ -474,12 +448,34 @@ brw_print_instruction(const brw_shader &s, const brw_inst *inst, FILE *file, con
fprintf(file, " coherent");
}
const brw_tex_inst *tex = inst->as_tex();
const struct brw_sampler_payload_desc *tex_payload = NULL;
if (tex)
tex_payload = brw_get_sampler_payload_desc(tex->sampler_opcode);
for (int i = 0; i < inst->sources; i++) {
if (mem) {
if (print_memory_logical_source(file, inst, i))
continue;
} else {
fprintf(file, ", ");
}
fprintf(file, ", ");
if (tex_payload) {
switch (i) {
case TEX_LOGICAL_SRC_SURFACE:
fprintf(file, "surf: ");
break;
case TEX_LOGICAL_SRC_SAMPLER:
fprintf(file, "smpl: ");
break;
default:
fprintf(file, "%s: ",
brw_sampler_payload_param_name(
tex_payload->sources[
i - TEX_LOGICAL_SRC_PAYLOAD0].param));
break;
}
}
if (inst->src[i].negate)
@ -634,10 +630,16 @@ brw_print_instruction(const brw_shader &s, const brw_inst *inst, FILE *file, con
fprintf(file, ", surface bindless");
if (tex->sampler_bindless)
fprintf(file, ", sampler bindless");
fprintf(file, ", grad_comps: %uu", tex->grad_components);
fprintf(file, ", coord_comps: %uu", tex->coord_components);
fprintf(file, ", grad_comps: %uu", tex->grad_components);
fprintf(file, ", residency: %s", tex->residency ? "true" : "false");
if (brw_sampler_opcode_is_gather(tex->sampler_opcode))
fprintf(file, ", gather_comp: %hhu", tex->gather_component);
if (tex->has_const_offsets) {
fprintf(file, ", offsets: %hhi,%hhi,%hhi",
tex->const_offsets[0],
tex->const_offsets[1],
tex->const_offsets[2]);
}
if (tex->residency)
fprintf(file, ", residency");
}
fprintf(file, " ");

View file

@ -199,32 +199,6 @@ TEST_F(cmod_propagation_test, intervening_mismatch_flag_read)
EXPECT_SHADERS_MATCH(bld, exp);
}
TEST_F(cmod_propagation_test, intervening_dest_write)
{
brw_builder bld = make_shader();
brw_reg dest = bld.vgrf(BRW_TYPE_F, 4);
brw_reg src0 = bld.vgrf(BRW_TYPE_F);
brw_reg src1 = bld.vgrf(BRW_TYPE_F);
brw_reg src2 = bld.vgrf(BRW_TYPE_F, 2);
brw_reg zero(brw_imm_f(0.0f));
brw_reg tex_srcs[TEX_LOGICAL_NUM_SRCS];
tex_srcs[TEX_LOGICAL_SRC_COORDINATE] = src2;
tex_srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(0);
bld.ADD(offset(dest, bld, 2), src0, src1);
brw_tex_inst *tex =
bld.emit(SHADER_OPCODE_SAMPLER, dest, tex_srcs, TEX_LOGICAL_NUM_SRCS)->as_tex();
tex->size_written = 4 * REG_SIZE;
tex->coord_components = 2;
bld.CMP(bld.null_reg_f(), offset(dest, bld, 2), zero, BRW_CONDITIONAL_GE);
EXPECT_NO_PROGRESS(brw_opt_cmod_propagation, bld);
}
TEST_F(cmod_propagation_test, intervening_flag_read_same_value)
{
brw_builder bld = make_shader();

View file

@ -264,32 +264,6 @@ TEST_F(saturate_propagation_test, producer_saturates)
EXPECT_SHADERS_MATCH(bld, exp);
}
TEST_F(saturate_propagation_test, intervening_dest_write)
{
brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);
brw_reg dst0 = bld.vgrf(BRW_TYPE_F, 4);
brw_reg dst1 = bld.vgrf(BRW_TYPE_F);
brw_reg src0 = bld.LOAD_REG(bld.vgrf(BRW_TYPE_F));
brw_reg src1 = bld.LOAD_REG(bld.vgrf(BRW_TYPE_F));
brw_reg src2 = bld.LOAD_REG(bld.vgrf(BRW_TYPE_F, 2));
brw_reg tex_srcs[TEX_LOGICAL_NUM_SRCS] = {};
tex_srcs[TEX_LOGICAL_SRC_COORDINATE] = src2;
tex_srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(0);
bld.ADD(offset(dst0, bld, 2), src0, src1);
brw_tex_inst *tex =
bld.emit(SHADER_OPCODE_SAMPLER, dst0, tex_srcs, TEX_LOGICAL_NUM_SRCS)->as_tex();
tex->size_written = 8 * REG_SIZE;
tex->coord_components = 2;
bld.MOV(dst1, offset(dst0, bld, 2))->saturate = true;
EXPECT_NO_PROGRESS(brw_opt_saturate_propagation, bld);
}
TEST_F(saturate_propagation_test, mul_neg_mov_sat_mov_sat)
{
brw_builder bld = make_shader(MESA_SHADER_FRAGMENT, 16);