mesa/src/intel/compiler/brw_lower_simd_width.cpp

/*
* Copyright © 2010 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "brw_fs.h"
#include "brw_builder.h"
using namespace brw;
static bool
is_mixed_float_with_fp32_dst(const fs_inst *inst)
{
if (inst->dst.type != BRW_TYPE_F)
return false;
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].type == BRW_TYPE_HF)
return true;
}
return false;
}
static bool
is_mixed_float_with_packed_fp16_dst(const fs_inst *inst)
{
if (inst->dst.type != BRW_TYPE_HF || inst->dst.stride != 1)
return false;
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].type == BRW_TYPE_F)
return true;
}
return false;
}
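/* Illustrative examples (a sketch, not drawn from the PRMs): an instruction
* like add(16) dst:F, src0:HF, src1:F matches is_mixed_float_with_fp32_dst(),
* while mov(16) dst:HF<1>, src:F (packed, stride-1 HF destination) matches
* is_mixed_float_with_packed_fp16_dst(). A strided HF destination does not
* count as packed and is therefore not caught by the latter helper.
*/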
/**
* Get the closest allowed SIMD width for instruction \p inst accounting for
* some common regioning and execution control restrictions that apply to FPU
* instructions. These restrictions don't necessarily have any relevance to
* instructions not executed by the FPU pipeline like extended math, control
* flow or send message instructions.
*
* For virtual opcodes it's really up to the instruction -- in some cases
* (e.g. where a virtual instruction unrolls into a simple sequence of FPU
* instructions) it may simplify virtual instruction lowering if we can
* enforce FPU-like regioning restrictions already on the virtual
* instruction, while in other cases (e.g. virtual send-like instructions)
* this may be excessively restrictive.
*/
static unsigned
get_fpu_lowered_simd_width(const fs_visitor *shader,
const fs_inst *inst)
{
const struct brw_compiler *compiler = shader->compiler;
const struct intel_device_info *devinfo = compiler->devinfo;
/* Maximum execution size representable in the instruction controls. */
unsigned max_width = MIN2(32, inst->exec_size);
/* Number of channels per polygon handled by a multipolygon PS shader. */
const unsigned poly_width = shader->dispatch_width /
MAX2(1, shader->max_polygons);
/* Number of registers that will be read by an ATTR source if
* present for multipolygon PS shaders, since the PS vertex setup
* data for each polygon is stored in different contiguous GRFs.
*/
const unsigned attr_reg_count = (shader->stage != MESA_SHADER_FRAGMENT ||
shader->max_polygons < 2 ? 0 :
DIV_ROUND_UP(inst->exec_size,
poly_width) * reg_unit(devinfo));
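/* Worked example (a sketch): for a SIMD16 fragment shader dispatched with
* max_polygons == 2, poly_width is 16 / 2 == 8, so an ATTR source of a
* SIMD16 instruction counts DIV_ROUND_UP(16, 8) * reg_unit(devinfo) ==
* 2 * reg_unit(devinfo) registers toward the region-size limit below.
*/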
/* According to the PRMs:
* "A. In Direct Addressing mode, a source cannot span more than 2
* adjacent GRF registers.
* B. A destination cannot span more than 2 adjacent GRF registers."
*
* Look for the source or destination with the largest register region
* which is the one that is going to limit the overall execution size of
* the instruction due to this rule.
*/
unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
for (unsigned i = 0; i < inst->sources; i++)
reg_count = MAX3(reg_count, DIV_ROUND_UP(inst->size_read(devinfo, i), REG_SIZE),
(inst->src[i].file == ATTR ? attr_reg_count : 0));
/* Calculate the maximum execution size of the instruction based on the
* factor by which it goes over the hardware limit of 2 GRFs.
*/
const unsigned max_reg_count = 2 * reg_unit(devinfo);
if (reg_count > max_reg_count)
max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count));
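/* Worked example (a sketch, assuming 32-byte GRFs and reg_unit(devinfo) ==
* 1): a SIMD16 instruction with an 8-byte (DF/Q) source reads 16 * 8 == 128
* bytes, so reg_count == 4 > max_reg_count == 2 and max_width becomes
* 16 / DIV_ROUND_UP(4, 2) == 8, i.e. the instruction is split into SIMD8
* halves that each span two GRFs.
*/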
/* From the IVB PRMs (applies to HSW too):
* "Instructions with condition modifiers must not use SIMD32."
*
* From the BDW PRMs (applies to later hardware too):
* "Ternary instruction with condition modifiers must not use SIMD32."
*/
if (inst->conditional_mod && inst->is_3src(compiler) && devinfo->ver < 12)
max_width = MIN2(max_width, 16);
/* From the IVB PRMs (applies to other devices that don't have the
* intel_device_info::supports_simd16_3src flag set):
* "In Align16 access mode, SIMD16 is not allowed for DW operations and
* SIMD8 is not allowed for DF operations."
*/
if (inst->is_3src(compiler) && !devinfo->supports_simd16_3src)
max_width = MIN2(max_width, inst->exec_size / reg_count);
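/* E.g. (a sketch, assuming 32-byte GRFs): a SIMD16 MAD with 4-byte operands
* has reg_count == 2, so max_width is capped at 16 / 2 == 8, keeping each
* operand of the lowered instructions within a single GRF.
*/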
if (inst->opcode != BRW_OPCODE_MOV) {
/* From the SKL PRM, Special Restrictions for Handling Mixed Mode
* Float Operations:
*
* "No SIMD16 in mixed mode when destination is f32. Instruction
* execution size must be no more than 8."
*
* Testing indicates that this restriction does not apply to MOVs.
*/
if (is_mixed_float_with_fp32_dst(inst) && devinfo->ver < 20)
max_width = MIN2(max_width, 8);
/* From the SKL PRM, Special Restrictions for Handling Mixed Mode
* Float Operations:
*
* "No SIMD16 in mixed mode when destination is packed f16 for both
* Align1 and Align16."
*/
if (is_mixed_float_with_packed_fp16_dst(inst) && devinfo->ver < 20)
max_width = MIN2(max_width, 8);
}
/* Only power-of-two execution sizes are representable in the instruction
* control fields.
*/
return 1 << util_logbase2(max_width);
}
/**
* Get the maximum allowed SIMD width for instruction \p inst accounting for
* various payload size restrictions that apply to sampler message
* instructions.
*
* This is only intended to provide a maximum theoretical bound for the
* execution size of the message based on the number of argument components
* alone, which in most cases will determine whether the SIMD8 or SIMD16
* variant of the message can be used, though some messages may have
* additional restrictions not accounted for here (e.g. pre-ILK hardware uses
* the message length to determine the exact SIMD width and argument count,
* which makes a number of sampler message combinations impossible to
* represent).
*
* Note: On platforms with monolithic SIMD16, the possible SIMD widths double,
* changing from (SIMD8, SIMD16) to (SIMD16, SIMD32).
*/
static unsigned
get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
const fs_inst *inst)
{
/* If we have a min_lod parameter on anything other than a simple sample
* message, it will push it over 5 arguments and we have to fall back to
* SIMD8.
*/
if (inst->opcode != SHADER_OPCODE_TEX_LOGICAL &&
inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
return devinfo->ver < 20 ? 8 : 16;
/* On Gfx9+ the LOD argument is for free if we're able to use the LZ
* variant of the TXL or TXF message.
*/
const bool implicit_lod = (inst->opcode == SHADER_OPCODE_TXL_LOGICAL ||
inst->opcode == SHADER_OPCODE_TXF_LOGICAL) &&
inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
/* Calculate the total number of argument components that need to be passed
* to the sampler unit.
*/
assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
const unsigned grad_components =
inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
const unsigned coord_components =
inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
unsigned num_payload_components =
coord_components +
inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
(implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
inst->components_read(TEX_LOGICAL_SRC_LOD2) +
inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
(inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
inst->components_read(TEX_LOGICAL_SRC_MCS) +
inst->components_read(TEX_LOGICAL_SRC_MIN_LOD);
if (inst->opcode == FS_OPCODE_TXB_LOGICAL && devinfo->ver >= 20) {
num_payload_components += 3 - coord_components;
} else if (inst->opcode == SHADER_OPCODE_TXD_LOGICAL &&
devinfo->verx10 >= 125 && devinfo->ver < 20) {
num_payload_components +=
3 - coord_components + (2 - grad_components) * 2;
} else {
num_payload_components += 4 - coord_components;
if (inst->opcode == SHADER_OPCODE_TXD_LOGICAL)
num_payload_components += (3 - grad_components) * 2;
}
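/* Worked example (a sketch): a 2D sample_d hitting the final else branch
* passes 2 coordinate and 2 + 2 gradient components; padding the
* coordinates to 4 and each gradient to 3 gives 10 payload components,
* more than the five-argument limit described below, so the SIMD width is
* capped at reg_unit(devinfo) * 8.
*/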
const unsigned simd_limit = reg_unit(devinfo) *
(num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
/* SIMD16 (SIMD32 on Xe2) messages with more than five arguments exceed the
* maximum message size supported by the sampler, regardless of whether a
* header is provided or not.
*/
return MIN2(inst->exec_size, simd_limit);
}
static bool
is_half_float_src_dst(const fs_inst *inst)
{
if (inst->dst.type == BRW_TYPE_HF)
return true;
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].type == BRW_TYPE_HF)
return true;
}
return false;
}
/**
* Get the closest native SIMD width supported by the hardware for instruction
* \p inst. The instruction will be left untouched by
* fs_visitor::lower_simd_width() if the returned value is equal to the
* original execution size.
*/
unsigned
brw_get_lowered_simd_width(const fs_visitor *shader, const fs_inst *inst)
{
const struct brw_compiler *compiler = shader->compiler;
const struct intel_device_info *devinfo = compiler->devinfo;
switch (inst->opcode) {
case BRW_OPCODE_DP4A:
case BRW_OPCODE_MOV:
case BRW_OPCODE_SEL:
case BRW_OPCODE_NOT:
case BRW_OPCODE_AND:
case BRW_OPCODE_OR:
case BRW_OPCODE_XOR:
case BRW_OPCODE_SHR:
case BRW_OPCODE_SHL:
case BRW_OPCODE_ASR:
case BRW_OPCODE_ROR:
case BRW_OPCODE_ROL:
case BRW_OPCODE_CMPN:
case BRW_OPCODE_CSEL:
case BRW_OPCODE_BFREV:
case BRW_OPCODE_BFE:
case BRW_OPCODE_ADD:
case BRW_OPCODE_MUL:
case BRW_OPCODE_AVG:
case BRW_OPCODE_FRC:
case BRW_OPCODE_RNDU:
case BRW_OPCODE_RNDD:
case BRW_OPCODE_RNDE:
case BRW_OPCODE_RNDZ:
case BRW_OPCODE_LZD:
case BRW_OPCODE_FBH:
case BRW_OPCODE_FBL:
case BRW_OPCODE_CBIT:
case BRW_OPCODE_MAD:
case BRW_OPCODE_LRP:
case BRW_OPCODE_ADD3:
case FS_OPCODE_PACK:
case SHADER_OPCODE_SEL_EXEC:
case SHADER_OPCODE_CLUSTER_BROADCAST:
case SHADER_OPCODE_MOV_RELOC_IMM:
case BRW_OPCODE_CMP:
case BRW_OPCODE_BFI1:
case BRW_OPCODE_BFI2:
return get_fpu_lowered_simd_width(shader, inst);
case SHADER_OPCODE_RCP:
case SHADER_OPCODE_RSQ:
case SHADER_OPCODE_SQRT:
case SHADER_OPCODE_EXP2:
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_SIN:
case SHADER_OPCODE_COS: {
/* Xe2+: BSpec 56797
*
* Math operation rules when half-floats are used on both source and
* destination operands and both source and destination are packed.
*
* The execution size must be 16.
*/
if (is_half_float_src_dst(inst))
return devinfo->ver < 20 ? MIN2(8, inst->exec_size) :
MIN2(16, inst->exec_size);
if (devinfo->ver < 20)
return MIN2(16, inst->exec_size);
return inst->exec_size;
}
case SHADER_OPCODE_POW: {
/* SIMD16 is only allowed on Gfx7+. The Extended Math Function is limited
* to SIMD8 when half-float types are used.
*/
if (is_half_float_src_dst(inst))
return MIN2(8, inst->exec_size);
return MIN2(16, inst->exec_size);
}
case SHADER_OPCODE_USUB_SAT:
case SHADER_OPCODE_ISUB_SAT:
return get_fpu_lowered_simd_width(shader, inst);
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
/* Integer division is limited to SIMD8 on all generations. */
return MIN2(8, inst->exec_size);
case BRW_OPCODE_PLN:
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
case FS_OPCODE_DDX_COARSE:
case FS_OPCODE_DDX_FINE:
case FS_OPCODE_DDY_COARSE:
case FS_OPCODE_DDY_FINE:
return MIN2(16, inst->exec_size);
case SHADER_OPCODE_MULH:
/* MULH is lowered to the MUL/MACH sequence using the accumulator, which
* is 8-wide on Gfx7+ and 16-wide on Xe2+.
*/
return devinfo->ver >= 20 ? 16 : 8;
case FS_OPCODE_FB_WRITE_LOGICAL:
if (devinfo->ver >= 20) {
/* Dual-source FB writes are unsupported in SIMD32 mode. */
return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
16 : MIN2(32, inst->exec_size));
} else {
/* Dual-source FB writes are unsupported in SIMD16 mode. */
return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
8 : MIN2(16, inst->exec_size));
}
case FS_OPCODE_FB_READ_LOGICAL:
return MIN2(16, inst->exec_size);
case SHADER_OPCODE_TEX_LOGICAL:
case SHADER_OPCODE_TXF_MCS_LOGICAL:
case SHADER_OPCODE_LOD_LOGICAL:
case SHADER_OPCODE_TG4_LOGICAL:
case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
case SHADER_OPCODE_TG4_BIAS_LOGICAL:
case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
case SHADER_OPCODE_TXL_LOGICAL:
case FS_OPCODE_TXB_LOGICAL:
case SHADER_OPCODE_TXF_LOGICAL:
case SHADER_OPCODE_TXS_LOGICAL:
return get_sampler_lowered_simd_width(devinfo, inst);
case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
if (devinfo->ver >= 20)
return inst->exec_size;
if (inst->src[MEMORY_LOGICAL_MODE].ud == MEMORY_MODE_TYPED)
return 8;
/* HDC A64 atomics are limited to SIMD8 */
if (!devinfo->has_lsc &&
inst->src[MEMORY_LOGICAL_BINDING_TYPE].ud == LSC_ADDR_SURFTYPE_FLAT
&& lsc_opcode_is_atomic((enum lsc_opcode)
inst->src[MEMORY_LOGICAL_OPCODE].ud))
return 8;
return MIN2(16, inst->exec_size);
/* On Gfx12 the parameters are fixed to 16-bit values, so they always fit
* regardless of the execution size.
*/
case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
return MIN2(16, inst->exec_size);
case SHADER_OPCODE_TXD_LOGICAL:
/* TXD is unsupported in SIMD16 mode prior to Xe2. SIMD32 is still
* unsupported on Xe2.
*/
return devinfo->ver < 20 ? 8 : 16;
case SHADER_OPCODE_URB_READ_LOGICAL:
case SHADER_OPCODE_URB_WRITE_LOGICAL:
return MIN2(devinfo->ver < 20 ? 8 : 16, inst->exec_size);
case SHADER_OPCODE_QUAD_SWIZZLE: {
const unsigned swiz = inst->src[1].ud;
return (is_uniform(inst->src[0]) ?
get_fpu_lowered_simd_width(shader, inst) :
devinfo->ver < 11 && brw_type_size_bytes(inst->src[0].type) == 4 ? 8 :
swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
get_fpu_lowered_simd_width(shader, inst));
}
case SHADER_OPCODE_MOV_INDIRECT: {
/* From IVB and HSW PRMs:
*
* "2.When the destination requires two registers and the sources are
* indirect, the sources must use 1x1 regioning mode.
*
* In case of DF instructions in HSW/IVB, the exec_size is limited by
* the EU decompression logic not handling VxH indirect addressing
* correctly.
*/
const unsigned max_size = 2 * REG_SIZE;
/* Prior to Broadwell, we only have 8 address subregisters. */
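/* Worked sketch, assuming REG_SIZE == 32: with a 4-byte, stride-1
* destination max_size / (1 * 4) == 16 allows up to SIMD16, while an
* 8-byte destination type drops the limit to 64 / 8 == 8.
*/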
return MIN3(16,
max_size / (inst->dst.stride * brw_type_size_bytes(inst->dst.type)),
inst->exec_size);
}
case SHADER_OPCODE_LOAD_PAYLOAD: {
const unsigned reg_count =
DIV_ROUND_UP(inst->dst.component_size(inst->exec_size),
REG_SIZE * reg_unit(devinfo));
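/* E.g. (a sketch, assuming 32-byte GRFs and reg_unit(devinfo) == 1): a
* SIMD16 LOAD_PAYLOAD with an 8-byte per-channel destination writes
* 16 * 8 == 128 bytes per component, so reg_count == 4 and the execution
* size below is lowered to 16 / DIV_ROUND_UP(4, 2) == 8.
*/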
if (reg_count > 2) {
/* Only LOAD_PAYLOAD instructions with per-channel destination region
* can be easily lowered (which excludes headers and heterogeneous
* types).
*/
assert(!inst->header_size);
for (unsigned i = 0; i < inst->sources; i++)
assert(brw_type_size_bits(inst->dst.type) == brw_type_size_bits(inst->src[i].type) ||
inst->src[i].file == BAD_FILE);
return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
} else {
return inst->exec_size;
}
}
default:
return inst->exec_size;
}
}
/**
* Return true if splitting out the group of channels of instruction \p inst
* given by lbld.group() requires allocating a temporary for the i-th source
* of the lowered instruction.
*/
static inline bool
needs_src_copy(const brw_builder &lbld, const fs_inst *inst, unsigned i)
{
/* The indirectly indexed register stays the same even if we split the
* instruction.
*/
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
return false;
return !inst->src[i].is_scalar &&
(!(is_periodic(inst->src[i], lbld.dispatch_width()) ||
(inst->components_read(i) == 1 &&
lbld.dispatch_width() <= inst->exec_size)) ||
(inst->flags_written(lbld.shader->devinfo) &
brw_fs_flag_mask(inst->src[i], brw_type_size_bytes(inst->src[i].type))));
}
/**
* Extract the data that would be consumed by the channel group given by
* lbld.group() from the i-th source region of instruction \p inst and return
* it as result in packed form.
*/
static brw_reg
emit_unzip(const brw_builder &lbld, fs_inst *inst, unsigned i)
{
assert(lbld.group() >= inst->group);
/* Specified channel group from the source region. */
const brw_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
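/* E.g. (a sketch): when a SIMD16 instruction is lowered to two SIMD8
* halves, the builder for the second half has lbld.group() == inst->group
* + 8, so horiz_offset() advances the region by 8 channels and the lowered
* instruction reads channels 8..15 of the original source.
*/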
if (needs_src_copy(lbld, inst, i)) {
const unsigned num_components = inst->components_read(i);
const brw_reg tmp = lbld.vgrf(inst->src[i].type, num_components);
assert(num_components <= NIR_MAX_VEC_COMPONENTS);
brw_reg comps[NIR_MAX_VEC_COMPONENTS];
for (unsigned k = 0; k < num_components; ++k)
comps[k] = offset(src, inst->exec_size, k);
lbld.VEC(tmp, comps, num_components);
return tmp;
} else if (is_periodic(inst->src[i], lbld.dispatch_width()) ||
(inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) ||
inst->src[i].is_scalar) {
/* The source is invariant for all dispatch_width-wide groups of the
* original region.
*
* The src[0] of MOV_INDIRECT is invariant regardless of the execution
* size.
*/
return inst->src[i];
} else {
/* We can just point the lowered instruction at the right channel group
* from the original region.
*/
return src;
}
}
/**
* Return true if splitting out the group of channels of instruction \p inst
* given by lbld.group() requires allocating a temporary for the destination
* of the lowered instruction and copying the data back to the original
* destination region.
*/
static inline bool
needs_dst_copy(const brw_builder &lbld, const fs_inst *inst)
{
if (inst->dst.is_null())
return false;
/* If the instruction writes more than one component we'll have to shuffle
* the results of multiple lowered instructions in order to make sure that
* they end up arranged correctly in the original destination region.
*/
if (inst->size_written > inst->dst.component_size(inst->exec_size))
return true;
for (unsigned i = 0; i < inst->sources; i++) {
/* If we already made a copy of the source for other reasons there won't
* be any overlap with the destination.
*/
if (needs_src_copy(lbld, inst, i))
continue;
/* In order to keep the logic simple we emit a copy whenever the
* destination region doesn't exactly match an overlapping source. A
* mismatch may mean the source and destination aren't aligned group by
* group, which could cause one of the lowered instructions to overwrite
* data read from the same source by other lowered instructions.
*/
if (regions_overlap(inst->dst, inst->size_written,
inst->src[i], inst->size_read(lbld.shader->devinfo, i)) &&
!inst->dst.equals(inst->src[i]))
return true;
}
return false;
}
/**
* Insert data from a packed temporary into the channel group given by
* lbld.group() of the destination region of instruction \p inst and return
* the temporary as result. Any copy instructions that are required for
* unzipping the previous value (in the case of partial writes) will be
* inserted using \p lbld_before and any copy instructions required for
* zipping up the destination of \p inst will be inserted using \p lbld_after.
*/
static brw_reg
emit_zip(const brw_builder &lbld_before, const brw_builder &lbld_after,
fs_inst *inst)
{
assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
assert(lbld_before.group() == lbld_after.group());
assert(lbld_after.group() >= inst->group);
const struct intel_device_info *devinfo = lbld_before.shader->devinfo;
/* Specified channel group from the destination region. */
const brw_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);
if (!needs_dst_copy(lbld_after, inst)) {
/* No need to allocate a temporary for the lowered instruction, just
* take the right group of channels from the original region.
*/
return dst;
}
/* Deal with the residency data part later */
const unsigned residency_size = inst->has_sampler_residency() ?
(reg_unit(devinfo) * REG_SIZE) : 0;
const unsigned dst_size = (inst->size_written - residency_size) /
inst->dst.component_size(inst->exec_size);
const brw_reg tmp = lbld_after.vgrf(inst->dst.type,
dst_size + inst->has_sampler_residency());
if (inst->predicate) {
/* Handle predication by copying the original contents of the
* destination into the temporary before emitting the lowered
* instruction.
*/
for (unsigned k = 0; k < dst_size; ++k) {
lbld_before.MOV(offset(tmp, lbld_before, k),
offset(dst, inst->exec_size, k));
}
}
for (unsigned k = 0; k < dst_size; ++k) {
/* Copy the (split) temp into the original (larger) destination */
lbld_after.MOV(offset(dst, inst->exec_size, k),
offset(tmp, lbld_after, k));
}
if (inst->has_sampler_residency()) {
/* Sampler messages with residency need special attention. The first
* lane of the last component holds the Pixel Null Mask (bits 0:15) along
* with some upper bits we need to discard (bits 16:31). We have to build
* a single 32-bit value for the SIMD32 message out of 2 SIMD16 16-bit
* values.
*/
const brw_builder rbld = lbld_after.exec_all().group(1, 0);
brw_reg local_res_reg = component(
retype(offset(tmp, lbld_before, dst_size), BRW_TYPE_UW), 0);
brw_reg final_res_reg =
retype(byte_offset(inst->dst,
inst->size_written - residency_size +
lbld_after.group() / 8), BRW_TYPE_UW);
rbld.MOV(final_res_reg, local_res_reg);
}
return tmp;
}
bool
brw_lower_simd_width(fs_visitor &s)
{
bool progress = false;
foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) {
const unsigned lower_width = brw_get_lowered_simd_width(&s, inst);
/* No splitting required */
if (lower_width == inst->exec_size)
continue;
assert(lower_width < inst->exec_size);
/* Builder matching the original instruction. */
const brw_builder bld = brw_builder(&s).at_end();
const brw_builder ibld =
bld.at(block, inst).exec_all(inst->force_writemask_all)
.group(inst->exec_size, inst->group / inst->exec_size);
/* Split the copies in chunks of the execution width of either the
* original or the lowered instruction, whichever is lower.
*/
const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
const unsigned residency_size = inst->has_sampler_residency() ?
(reg_unit(s.devinfo) * REG_SIZE) : 0;
const unsigned dst_size =
(inst->size_written - residency_size) /
inst->dst.component_size(inst->exec_size);
assert(!inst->writes_accumulator && !inst->mlen);
/* Inserting the zip, unzip, and duplicated instructions in all of
* the right spots is somewhat tricky. All of the unzip and any
* instructions from the zip which unzip the destination prior to
* writing need to happen before all of the per-group instructions
* and the zip instructions need to happen after. In order to sort
* this all out, we insert the unzip instructions before \p inst,
* insert the per-group instructions after \p inst (i.e. before
* inst->next), and insert the zip instructions before the
* instruction after \p inst. Since we are inserting instructions
* after \p inst, inst->next is a moving target and we need to save
* it off here so that we insert the zip instructions in the right
* place.
*
* Since each split instruction is inserted right after \p inst, the
* instructions will end up in the reverse order that we insert them.
* However, certain render target writes require that the low group
* instructions come before the high group. From the Ivy Bridge PRM
* Vol. 4, Pt. 1, Section 3.9.11:
*
* "If multiple SIMD8 Dual Source messages are delivered by the
* pixel shader thread, each SIMD8_DUALSRC_LO message must be
* issued before the SIMD8_DUALSRC_HI message with the same Slot
* Group Select setting."
*
* And, from Section 3.9.11.1 of the same PRM:
*
* "When SIMD32 or SIMD16 PS threads send render target writes
* with multiple SIMD8 and SIMD16 messages, the following must
* hold:
*
* All the slots (as described above) must have a corresponding
* render target write irrespective of the slot's validity. A slot
* is considered valid when at least one sample is enabled. For
* example, a SIMD16 PS thread must send two SIMD8 render target
* writes to cover all the slots.
*
* PS thread must send SIMD render target write messages with
* increasing slot numbers. For example, SIMD16 thread has
* Slot[15:0] and if two SIMD8 render target writes are used, the
* first SIMD8 render target write must send Slot[7:0] and the
* next one must send Slot[15:8]."
*
* In order to make low group instructions come before high group
* instructions (this is required for some render target writes), we
* split from the highest group to lowest.
*/
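/* Resulting layout for n == 2 (a sketch):
*
*   (unzip / pre-zip copies for both groups)   <- emitted before \p inst
*   split instruction, group 0                 <- emitted last, placed first
*   split instruction, group 1
*   (zip copies for both groups)               <- emitted before after_inst
*   after_inst
*/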
exec_node *const after_inst = inst->next;
for (int i = n - 1; i >= 0; i--) {
/* Emit a copy of the original instruction with the lowered width.
* If the EOT flag was set throw it away except for the last
* instruction to avoid killing the thread prematurely.
*/
fs_inst split_inst = *inst;
split_inst.exec_size = lower_width;
split_inst.eot = inst->eot && i == int(n - 1);
/* Select the correct channel enables for the i-th group, then
* transform the sources and destination and emit the lowered
* instruction.
*/
const brw_builder lbld = ibld.group(lower_width, i);
for (unsigned j = 0; j < inst->sources; j++)
split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
split_inst.dst = emit_zip(lbld.at(block, inst),
lbld.at(block, after_inst), inst);
split_inst.size_written =
split_inst.dst.component_size(lower_width) * dst_size +
residency_size;
lbld.at(block, inst->next).emit(split_inst);
}
inst->remove(block);
progress = true;
}
if (progress)
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
return progress;
}