mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-21 13:40:16 +01:00
intel/brw: Handle 16-bit sampler return payloads
The API requires samplers to return 32-bit values even though the hardware can return 16-bit floating point, so we detect the cases where a 16-bit result suffices and request a 16-bit return payload, making more efficient use of memory bandwidth. This improves performance of token encode and decode in LLM workloads by at least 5% across multiple platforms. Thank you to Kenneth Graunke for suggesting this and guiding me throughout the implementation. Signed-off-by: Sushma Venkatesh Reddy <sushma.venkatesh.reddy@intel.com> Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30447>
This commit is contained in:
parent
ddd9e043dc
commit
0116430d39
3 changed files with 27 additions and 9 deletions
|
|
@ -8623,7 +8623,12 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
|
||||||
|
|
||||||
brw_reg nir_def_reg = get_nir_def(ntb, instr->def);
|
brw_reg nir_def_reg = get_nir_def(ntb, instr->def);
|
||||||
|
|
||||||
brw_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4 + instr->is_sparse);
|
bool is_simd8_16bit = nir_alu_type_get_type_size(instr->dest_type) == 16
|
||||||
|
&& bld.dispatch_width() == 8;
|
||||||
|
|
||||||
|
brw_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type),
|
||||||
|
(is_simd8_16bit ? 8 : 4) + instr->is_sparse);
|
||||||
|
|
||||||
fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
|
fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
|
||||||
inst->offset = header_bits;
|
inst->offset = header_bits;
|
||||||
|
|
||||||
|
|
@ -8635,15 +8640,18 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
|
||||||
if (instr->is_sparse) {
|
if (instr->is_sparse) {
|
||||||
read_size = util_last_bit(write_mask) - 1;
|
read_size = util_last_bit(write_mask) - 1;
|
||||||
inst->size_written =
|
inst->size_written =
|
||||||
read_size * inst->dst.component_size(inst->exec_size) +
|
(is_simd8_16bit ? 2 : 1) * read_size *
|
||||||
|
inst->dst.component_size(inst->exec_size) +
|
||||||
(reg_unit(devinfo) * REG_SIZE);
|
(reg_unit(devinfo) * REG_SIZE);
|
||||||
} else {
|
} else {
|
||||||
read_size = util_last_bit(write_mask);
|
read_size = util_last_bit(write_mask);
|
||||||
inst->size_written =
|
inst->size_written =
|
||||||
read_size * inst->dst.component_size(inst->exec_size);
|
(is_simd8_16bit ? 2 : 1) * read_size *
|
||||||
|
inst->dst.component_size(inst->exec_size);
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
inst->size_written = 4 * inst->dst.component_size(inst->exec_size) +
|
inst->size_written = (is_simd8_16bit ? 2 : 1) * 4 *
|
||||||
|
inst->dst.component_size(inst->exec_size) +
|
||||||
(instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
|
(instr->is_sparse ? (reg_unit(devinfo) * REG_SIZE) : 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -8666,7 +8674,8 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
|
||||||
inst->keep_payload_trailing_zeros = true;
|
inst->keep_payload_trailing_zeros = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (instr->op != nir_texop_query_levels && !instr->is_sparse) {
|
if (instr->op != nir_texop_query_levels && !instr->is_sparse
|
||||||
|
&& !is_simd8_16bit) {
|
||||||
/* In most cases we can write directly to the result. */
|
/* In most cases we can write directly to the result. */
|
||||||
inst->dst = nir_def_reg;
|
inst->dst = nir_def_reg;
|
||||||
} else {
|
} else {
|
||||||
|
|
@ -8675,7 +8684,7 @@ fs_nir_emit_texture(nir_to_brw_state &ntb,
|
||||||
*/
|
*/
|
||||||
brw_reg nir_dest[5];
|
brw_reg nir_dest[5];
|
||||||
for (unsigned i = 0; i < read_size; i++)
|
for (unsigned i = 0; i < read_size; i++)
|
||||||
nir_dest[i] = offset(dst, bld, i);
|
nir_dest[i] = offset(dst, bld, (is_simd8_16bit ? 2 : 1) * i);
|
||||||
|
|
||||||
if (instr->op == nir_texop_query_levels) {
|
if (instr->op == nir_texop_query_levels) {
|
||||||
/* # levels is in .w */
|
/* # levels is in .w */
|
||||||
|
|
|
||||||
|
|
@ -1124,13 +1124,16 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
|
||||||
inst->mlen = mlen;
|
inst->mlen = mlen;
|
||||||
inst->header_size = header_size;
|
inst->header_size = header_size;
|
||||||
inst->sfid = BRW_SFID_SAMPLER;
|
inst->sfid = BRW_SFID_SAMPLER;
|
||||||
|
uint sampler_ret_type = brw_type_size_bits(inst->dst.type) == 16
|
||||||
|
? GFX8_SAMPLER_RETURN_FORMAT_16BITS
|
||||||
|
: GFX8_SAMPLER_RETURN_FORMAT_32BITS;
|
||||||
if (surface.file == IMM &&
|
if (surface.file == IMM &&
|
||||||
(sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
|
(sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
|
||||||
inst->desc = brw_sampler_desc(devinfo, surface.ud,
|
inst->desc = brw_sampler_desc(devinfo, surface.ud,
|
||||||
sampler.file == IMM ? sampler.ud % 16 : 0,
|
sampler.file == IMM ? sampler.ud % 16 : 0,
|
||||||
msg_type,
|
msg_type,
|
||||||
simd_mode,
|
simd_mode,
|
||||||
0 /* return_format unused on gfx7+ */);
|
sampler_ret_type);
|
||||||
inst->src[0] = brw_imm_ud(0);
|
inst->src[0] = brw_imm_ud(0);
|
||||||
inst->src[1] = brw_imm_ud(0);
|
inst->src[1] = brw_imm_ud(0);
|
||||||
} else if (surface_handle.file != BAD_FILE) {
|
} else if (surface_handle.file != BAD_FILE) {
|
||||||
|
|
@ -1140,7 +1143,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
|
||||||
sampler.file == IMM ? sampler.ud % 16 : 0,
|
sampler.file == IMM ? sampler.ud % 16 : 0,
|
||||||
msg_type,
|
msg_type,
|
||||||
simd_mode,
|
simd_mode,
|
||||||
0 /* return_format unused on gfx7+ */);
|
sampler_ret_type);
|
||||||
|
|
||||||
/* For bindless samplers, the entire address is included in the message
|
/* For bindless samplers, the entire address is included in the message
|
||||||
* header so we can leave the portion in the message descriptor 0.
|
* header so we can leave the portion in the message descriptor 0.
|
||||||
|
|
@ -1166,7 +1169,7 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst,
|
||||||
0, /* sampler */
|
0, /* sampler */
|
||||||
msg_type,
|
msg_type,
|
||||||
simd_mode,
|
simd_mode,
|
||||||
0 /* return_format unused on gfx7+ */);
|
sampler_ret_type);
|
||||||
const fs_builder ubld = bld.group(1, 0).exec_all();
|
const fs_builder ubld = bld.group(1, 0).exec_all();
|
||||||
brw_reg desc = ubld.vgrf(BRW_TYPE_UD);
|
brw_reg desc = ubld.vgrf(BRW_TYPE_UD);
|
||||||
if (surface.equals(sampler)) {
|
if (surface.equals(sampler)) {
|
||||||
|
|
|
||||||
|
|
@ -996,6 +996,12 @@ brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir,
|
||||||
|
|
||||||
OPT(nir_lower_alu_to_scalar, NULL, NULL);
|
OPT(nir_lower_alu_to_scalar, NULL, NULL);
|
||||||
|
|
||||||
|
struct nir_opt_16bit_tex_image_options options = {
|
||||||
|
.rounding_mode = nir_rounding_mode_undef,
|
||||||
|
.opt_tex_dest_types = nir_type_float | nir_type_int | nir_type_uint,
|
||||||
|
};
|
||||||
|
OPT(nir_opt_16bit_tex_image, &options);
|
||||||
|
|
||||||
if (nir->info.stage == MESA_SHADER_GEOMETRY)
|
if (nir->info.stage == MESA_SHADER_GEOMETRY)
|
||||||
OPT(nir_lower_gs_intrinsics, 0);
|
OPT(nir_lower_gs_intrinsics, 0);
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue