mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-23 09:00:10 +01:00
Also drop the 'fs' prefix when applicable. Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/33048>
236 lines
8.7 KiB
C++
236 lines
8.7 KiB
C++
/*
|
|
* Copyright © 2024 Intel Corporation
|
|
* SPDX-License-Identifier: MIT
|
|
*/
|
|
|
|
#include "brw_eu.h"
|
|
#include "brw_fs.h"
|
|
#include "brw_builder.h"
|
|
|
|
using namespace brw;
|
|
|
|
static unsigned
|
|
dest_comps_for_txf(const fs_visitor &s, const brw_inst *txf)
|
|
{
|
|
if (!txf)
|
|
return 0;
|
|
|
|
const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo);
|
|
const unsigned per_component_regs =
|
|
DIV_ROUND_UP(brw_type_size_bytes(txf->dst.type) *
|
|
txf->exec_size, grf_size);
|
|
const unsigned dest_regs = txf->size_written / grf_size;
|
|
const unsigned dest_comps = dest_regs / per_component_regs;
|
|
return dest_comps;
|
|
}
|
|
|
|
static bool
|
|
is_def(const brw_def_analysis &defs, const brw_reg &r)
|
|
{
|
|
return r.file == IMM || r.file == BAD_FILE || defs.get(r) != NULL;
|
|
}
|
|
|
|
static bool
|
|
is_uniform_def(const brw_def_analysis &defs, const brw_reg &r)
|
|
{
|
|
return is_def(defs, r) && is_uniform(r);
|
|
}
|
|
|
|
/**
|
|
* Check if two texture instructions have a matching source (either the same
|
|
* immediate value, or both references to the same immutable SSA def and
|
|
* with matching source modifiers and regions).
|
|
*/
|
|
static bool
|
|
sources_match(ASSERTED const brw_def_analysis &defs,
|
|
const brw_inst *a, const brw_inst *b, enum tex_logical_srcs src)
|
|
{
|
|
assert(is_def(defs, a->src[src]));
|
|
assert(is_def(defs, b->src[src]));
|
|
return brw_regs_equal(&a->src[src], &b->src[src]);
|
|
}
|
|
|
|
/**
|
|
* Look for a series of convergent texture buffer fetches within a basic
|
|
* block and combine them into a single divergent load with one lane for
|
|
* each original fetch. For example, this series of convergent fetches:
|
|
*
|
|
* txf(16) %12:UD, coord = 12d, lod = 0u, handle = %1<0>:D
|
|
* txf(16) %13:UD, coord = 13d, lod = 0u, handle = %1<0>:D
|
|
* txf(16) %14:UD, coord = 14d, lod = 0u, handle = %1<0>:D
|
|
* txf(16) %15:UD, coord = 15d, lod = 0u, handle = %1<0>:D
|
|
* txf(16) %16:UD, coord = 16d, lod = 0u, handle = %1<0>:D
|
|
* txf(16) %17:UD, coord = 17d, lod = 0u, handle = %1<0>:D
|
|
* txf(16) %18:UD, coord = 18d, lod = 0u, handle = %1<0>:D
|
|
* txf(16) %19:UD, coord = 19d, lod = 0u, handle = %1<0>:D
|
|
*
|
|
* can be combined into a single divergent load and scalar-expansion moves
|
|
* (which can easily be copy propagated away):
|
|
*
|
|
* load_payload(1) %2:D 12d, 13d, 14d, 15d, 16d, 17d, 18d, 19d
|
|
* txf(8) %3:UD, coord = %2, lod = 0u, handle = %1<0>:D
|
|
* mov(16) %12:UD, %3+0.0<0>:UD
|
|
* ...
|
|
* mov(16) %19:UD, %3+0.28<0>:UD
|
|
*
|
|
* Our sampler hardware doesn't have any special support for convergent
|
|
* loads (like LSC transpose/block loads), and always performs SIMD8/16/32
|
|
* per-channel loads. But with this trick, we can still combine multiple
|
|
* convergent loads into a single message with fewer round-trips, and much
|
|
* lower register pressure.
|
|
*/
|
|
bool
|
|
brw_opt_combine_convergent_txf(fs_visitor &s)
|
|
{
|
|
const brw_def_analysis &defs = s.def_analysis.require();
|
|
|
|
const unsigned min_simd = 8 * reg_unit(s.devinfo);
|
|
const unsigned max_simd = 16 * reg_unit(s.devinfo);
|
|
const unsigned grf_size = REG_SIZE * reg_unit(s.devinfo);
|
|
|
|
bool progress = false;
|
|
|
|
foreach_block(block, s.cfg) {
|
|
/* Gather a list of convergent TXFs to the same surface in this block */
|
|
brw_inst *txfs[32] = {};
|
|
unsigned count = 0;
|
|
|
|
foreach_inst_in_block(brw_inst, inst, block) {
|
|
if (inst->opcode != SHADER_OPCODE_TXF_LOGICAL)
|
|
continue;
|
|
|
|
/* Only handle buffers or single miplevel 1D images for now */
|
|
if (inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud > 1)
|
|
continue;
|
|
|
|
if (inst->src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0)
|
|
continue;
|
|
|
|
if (inst->predicate || inst->force_writemask_all)
|
|
continue;
|
|
|
|
if (!is_uniform_def(defs, inst->src[TEX_LOGICAL_SRC_LOD]) ||
|
|
!is_uniform_def(defs, inst->src[TEX_LOGICAL_SRC_SURFACE]) ||
|
|
!is_uniform_def(defs, inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE]))
|
|
continue;
|
|
|
|
/* Only handle immediates for now: we could check is_uniform(),
|
|
* but we'd need to ensure the coordinate's definition reaches
|
|
* txfs[0] which is where we'll insert the combined coordinate.
|
|
*/
|
|
if (inst->src[TEX_LOGICAL_SRC_COORDINATE].file != IMM)
|
|
continue;
|
|
|
|
/* texelFetch from 1D buffers shouldn't have any of these */
|
|
assert(inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE);
|
|
assert(inst->src[TEX_LOGICAL_SRC_LOD2].file == BAD_FILE);
|
|
assert(inst->src[TEX_LOGICAL_SRC_MIN_LOD].file == BAD_FILE);
|
|
assert(inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX].file == BAD_FILE);
|
|
assert(inst->src[TEX_LOGICAL_SRC_MCS].file == BAD_FILE);
|
|
assert(inst->src[TEX_LOGICAL_SRC_TG4_OFFSET].file == BAD_FILE);
|
|
assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
|
|
inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud == 0);
|
|
|
|
if (count > 0 &&
|
|
(!sources_match(defs, inst, txfs[0], TEX_LOGICAL_SRC_LOD) ||
|
|
!sources_match(defs, inst, txfs[0], TEX_LOGICAL_SRC_SURFACE) ||
|
|
!sources_match(defs, inst, txfs[0],
|
|
TEX_LOGICAL_SRC_SURFACE_HANDLE)))
|
|
continue;
|
|
|
|
txfs[count++] = inst;
|
|
|
|
if (count == ARRAY_SIZE(txfs))
|
|
break;
|
|
}
|
|
|
|
/* Need at least two things to combine. */
|
|
if (count < 2)
|
|
continue;
|
|
|
|
/* Emit divergent TXFs and replace the original ones with MOVs */
|
|
for (unsigned curr = 0; curr < count; curr += max_simd) {
|
|
const unsigned lanes = CLAMP(count - curr, min_simd, max_simd);
|
|
const unsigned width = util_next_power_of_two(lanes);
|
|
const brw_builder ubld =
|
|
brw_builder(&s).at(block, txfs[curr]).exec_all().group(width, 0);
|
|
const brw_builder ubld1 = ubld.group(1, 0);
|
|
|
|
enum brw_reg_type coord_type =
|
|
txfs[curr]->src[TEX_LOGICAL_SRC_COORDINATE].type;
|
|
brw_reg coord = ubld.vgrf(coord_type);
|
|
brw_reg coord_comps[32];
|
|
|
|
for (unsigned i = 0; i < width; i++) {
|
|
/* Our block size might be larger than the number of convergent
|
|
* loads we're combining. If so, repeat the last component.
|
|
*/
|
|
if (txfs[curr+i])
|
|
coord_comps[i] = txfs[curr+i]->src[TEX_LOGICAL_SRC_COORDINATE];
|
|
else
|
|
coord_comps[i] = coord_comps[i-1];
|
|
}
|
|
ubld1.VEC(coord, coord_comps, width);
|
|
|
|
brw_reg srcs[TEX_LOGICAL_NUM_SRCS];
|
|
srcs[TEX_LOGICAL_SRC_COORDINATE] = coord;
|
|
srcs[TEX_LOGICAL_SRC_LOD] = txfs[0]->src[TEX_LOGICAL_SRC_LOD];
|
|
srcs[TEX_LOGICAL_SRC_SURFACE] = txfs[0]->src[TEX_LOGICAL_SRC_SURFACE];
|
|
srcs[TEX_LOGICAL_SRC_SURFACE_HANDLE] =
|
|
txfs[0]->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
|
|
srcs[TEX_LOGICAL_SRC_SAMPLER] = txfs[0]->src[TEX_LOGICAL_SRC_SAMPLER];
|
|
srcs[TEX_LOGICAL_SRC_SAMPLER_HANDLE] =
|
|
txfs[0]->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
|
|
srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_ud(1);
|
|
srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_ud(0);
|
|
srcs[TEX_LOGICAL_SRC_RESIDENCY] = brw_imm_ud(0);
|
|
|
|
/* Each of our txf may have a reduced response length if some
|
|
* components are never read. Use the maximum of the sizes.
|
|
*/
|
|
unsigned new_dest_comps = 0;
|
|
for (unsigned i = 0; i < width; i++) {
|
|
const unsigned this_comps = dest_comps_for_txf(s, txfs[curr+i]);
|
|
new_dest_comps = MAX2(new_dest_comps, this_comps);
|
|
}
|
|
|
|
/* Emit the new divergent TXF */
|
|
brw_reg div = ubld.vgrf(BRW_TYPE_UD, new_dest_comps);
|
|
brw_inst *div_txf =
|
|
ubld.emit(SHADER_OPCODE_TXF_LOGICAL, div, srcs,
|
|
TEX_LOGICAL_NUM_SRCS);
|
|
|
|
/* Update it to also use response length reduction */
|
|
const unsigned per_component_regs =
|
|
DIV_ROUND_UP(brw_type_size_bytes(div.type) * div_txf->exec_size,
|
|
grf_size);
|
|
div_txf->size_written = new_dest_comps * per_component_regs * grf_size;
|
|
|
|
for (unsigned i = 0; i < width; i++) {
|
|
brw_inst *txf = txfs[curr+i];
|
|
if (!txf)
|
|
break;
|
|
|
|
const brw_builder ibld = brw_builder(&s, block, txf);
|
|
|
|
/* Replace each of the original TXFs with MOVs from our new one */
|
|
const unsigned dest_comps = dest_comps_for_txf(s, txf);
|
|
assert(dest_comps <= 4);
|
|
|
|
brw_reg v[4];
|
|
for (unsigned c = 0; c < dest_comps; c++)
|
|
v[c] = component(offset(div, ubld, c), i);
|
|
ibld.VEC(retype(txf->dst, BRW_TYPE_UD), v, dest_comps);
|
|
|
|
txf->remove(block);
|
|
}
|
|
|
|
progress = true;
|
|
}
|
|
}
|
|
|
|
if (progress)
|
|
s.invalidate_analysis(BRW_DEPENDENCY_INSTRUCTIONS);
|
|
|
|
return progress;
|
|
}
|