mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-21 15:50:11 +01:00
Because we emit the final send message form during code generation, a lot of
emissions look like this:

   add(8)  vgrf0, u0, 0x100
   mov(1)  a0.1, vgrf0      # emitted by the generator
   send(8) ..., a0.1

By moving address register manipulation into the IR, we can get this down to:

   add(1)  a0.1, u0, 0x100
   send(8) ..., a0.1

This reduces register pressure around some send messages by 1 VGRF.

All lost shaders in the results below are fragment SIMD32, due to the
throughput estimator. With it turned off, we lose no SIMD32 shaders with this
change.

DG2 results:

Assassin's Creed Valhalla:
Totals from 2044 (96.87% of 2110) affected shaders:
Instrs: 852879 -> 832044 (-2.44%); split: -2.45%, +0.00%
Subgroup size: 23832 -> 23824 (-0.03%)
Cycle count: 53345742 -> 52144277 (-2.25%); split: -5.08%, +2.82%
Spill count: 729 -> 554 (-24.01%); split: -28.40%, +4.39%
Fill count: 2005 -> 1256 (-37.36%)
Scratch Memory Size: 25600 -> 19456 (-24.00%); split: -32.00%, +8.00%
Max live registers: 116765 -> 115058 (-1.46%)
Max dispatch width: 19152 -> 18872 (-1.46%); split: +0.21%, -1.67%

Cyberpunk 2077:
Totals from 1181 (93.43% of 1264) affected shaders:
Instrs: 667192 -> 663615 (-0.54%); split: -0.55%, +0.01%
Subgroup size: 13016 -> 13032 (+0.12%)
Cycle count: 17383539 -> 17986073 (+3.47%); split: -0.93%, +4.39%
Spill count: 12 -> 8 (-33.33%)
Fill count: 9 -> 6 (-33.33%)

Dota2:
Totals from 173 (11.59% of 1493) affected shaders:
Cycle count: 274403 -> 280817 (+2.34%); split: -0.01%, +2.34%
Max live registers: 5787 -> 5779 (-0.14%)
Max dispatch width: 1344 -> 1152 (-14.29%)

Hitman3:
Totals from 5072 (95.39% of 5317) affected shaders:
Instrs: 2879952 -> 2841804 (-1.32%); split: -1.32%, +0.00%
Cycle count: 153208505 -> 165860401 (+8.26%); split: -2.22%, +10.48%
Spill count: 3942 -> 3200 (-18.82%)
Fill count: 10158 -> 8846 (-12.92%)
Scratch Memory Size: 257024 -> 223232 (-13.15%)
Max live registers: 328467 -> 324631 (-1.17%)
Max dispatch width: 43928 -> 42768 (-2.64%); split: +0.09%, -2.73%

Fortnite:
Totals from 360 (4.82% of 7472) affected shaders:
Instrs: 778068 -> 777925 (-0.02%)
Subgroup size: 3128 -> 3136 (+0.26%)
Cycle count: 38684183 -> 38734579 (+0.13%); split: -0.06%, +0.19%
Max live registers: 50689 -> 50658 (-0.06%)

Hogwarts Legacy:
Totals from 1376 (84.00% of 1638) affected shaders:
Instrs: 758810 -> 749727 (-1.20%); split: -1.23%, +0.03%
Cycle count: 27778983 -> 28805469 (+3.70%); split: -1.42%, +5.12%
Spill count: 2475 -> 2299 (-7.11%); split: -7.47%, +0.36%
Fill count: 2677 -> 2445 (-8.67%); split: -9.90%, +1.23%
Scratch Memory Size: 99328 -> 89088 (-10.31%)
Max live registers: 84969 -> 84671 (-0.35%); split: -0.58%, +0.23%
Max dispatch width: 11848 -> 11920 (+0.61%)

Metro Exodus:
Totals from 92 (0.21% of 43072) affected shaders:
Instrs: 262995 -> 262968 (-0.01%)
Cycle count: 13818007 -> 13851266 (+0.24%); split: -0.01%, +0.25%
Max live registers: 11152 -> 11140 (-0.11%)

Red Dead Redemption 2:
Totals from 451 (7.71% of 5847) affected shaders:
Instrs: 754178 -> 753811 (-0.05%); split: -0.05%, +0.00%
Cycle count: 3484078523 -> 3484111965 (+0.00%); split: -0.00%, +0.00%
Max live registers: 42294 -> 42185 (-0.26%)

Spiderman Remastered:
Totals from 6820 (98.02% of 6958) affected shaders:
Instrs: 6921500 -> 6747933 (-2.51%); split: -4.16%, +1.65%
Cycle count: 234400692460 -> 236846720707 (+1.04%); split: -0.20%, +1.25%
Spill count: 72971 -> 72622 (-0.48%); split: -8.08%, +7.61%
Fill count: 212921 -> 198483 (-6.78%); split: -12.37%, +5.58%
Scratch Memory Size: 3491840 -> 3410944 (-2.32%); split: -12.05%, +9.74%
Max live registers: 493149 -> 487458 (-1.15%)
Max dispatch width: 56936 -> 56856 (-0.14%); split: +0.06%, -0.20%

Strange Brigade:
Totals from 3769 (91.21% of 4132) affected shaders:
Instrs: 1354476 -> 1321474 (-2.44%)
Cycle count: 25351530 -> 25339190 (-0.05%); split: -1.64%, +1.59%
Max live registers: 199057 -> 193656 (-2.71%)
Max dispatch width: 30272 -> 30240 (-0.11%)

Witcher 3:
Totals from 25 (2.40% of 1041) affected shaders:
Instrs: 24621 -> 24606 (-0.06%)
Cycle count: 2218793 -> 2217503 (-0.06%); split: -0.11%, +0.05%
Max live registers: 1963 -> 1955 (-0.41%)

LNL results:

Assassin's Creed Valhalla:
Totals from 1928 (98.02% of 1967) affected shaders:
Instrs: 856107 -> 835756 (-2.38%); split: -2.48%, +0.11%
Subgroup size: 41264 -> 41280 (+0.04%)
Cycle count: 64606590 -> 62371700 (-3.46%); split: -5.57%, +2.11%
Spill count: 915 -> 669 (-26.89%); split: -32.79%, +5.90%
Fill count: 2414 -> 1617 (-33.02%); split: -36.62%, +3.60%
Scratch Memory Size: 62464 -> 44032 (-29.51%); split: -36.07%, +6.56%
Max live registers: 205483 -> 202192 (-1.60%)

Cyberpunk 2077:
Totals from 1177 (96.40% of 1221) affected shaders:
Instrs: 682237 -> 678931 (-0.48%); split: -0.51%, +0.03%
Subgroup size: 24912 -> 24944 (+0.13%)
Cycle count: 24355928 -> 25089292 (+3.01%); split: -0.80%, +3.81%
Spill count: 8 -> 3 (-62.50%)
Fill count: 6 -> 3 (-50.00%)
Max live registers: 126922 -> 125472 (-1.14%)

Dota2:
Totals from 428 (32.47% of 1318) affected shaders:
Instrs: 89355 -> 89740 (+0.43%)
Cycle count: 1152412 -> 1152706 (+0.03%); split: -0.52%, +0.55%
Max live registers: 32863 -> 32847 (-0.05%)

Fortnite:
Totals from 5354 (81.72% of 6552) affected shaders:
Instrs: 4135059 -> 4239015 (+2.51%); split: -0.01%, +2.53%
Cycle count: 132557506 -> 132427302 (-0.10%); split: -0.75%, +0.65%
Spill count: 7144 -> 7234 (+1.26%); split: -0.46%, +1.72%
Fill count: 12086 -> 12403 (+2.62%); split: -0.73%, +3.35%
Scratch Memory Size: 600064 -> 604160 (+0.68%); split: -1.02%, +1.71%

Hitman3:
Totals from 4912 (97.09% of 5059) affected shaders:
Instrs: 2952124 -> 2916824 (-1.20%); split: -1.20%, +0.00%
Cycle count: 179985656 -> 189175250 (+5.11%); split: -2.44%, +7.55%
Spill count: 3739 -> 3136 (-16.13%)
Fill count: 10657 -> 9564 (-10.26%)
Scratch Memory Size: 373760 -> 318464 (-14.79%)
Max live registers: 597566 -> 589460 (-1.36%)

Hogwarts Legacy:
Totals from 1471 (96.33% of 1527) affected shaders:
Instrs: 748749 -> 766214 (+2.33%); split: -0.71%, +3.05%
Cycle count: 33301528 -> 34426308 (+3.38%); split: -1.30%, +4.68%
Spill count: 3278 -> 3070 (-6.35%); split: -8.30%, +1.95%
Fill count: 4553 -> 4097 (-10.02%); split: -10.85%, +0.83%
Scratch Memory Size: 251904 -> 217088 (-13.82%)
Max live registers: 168911 -> 168106 (-0.48%); split: -0.59%, +0.12%

Metro Exodus:
Totals from 18356 (49.81% of 36854) affected shaders:
Instrs: 7559386 -> 7621591 (+0.82%); split: -0.01%, +0.83%
Cycle count: 195240612 -> 196455186 (+0.62%); split: -1.22%, +1.84%
Spill count: 595 -> 546 (-8.24%)
Fill count: 1604 -> 1408 (-12.22%)
Max live registers: 2086937 -> 2086933 (-0.00%)

Red Dead Redemption 2:
Totals from 4171 (79.31% of 5259) affected shaders:
Instrs: 2619392 -> 2719587 (+3.83%); split: -0.00%, +3.83%
Subgroup size: 86416 -> 86432 (+0.02%)
Cycle count: 8542836160 -> 8531976886 (-0.13%); split: -0.65%, +0.53%
Fill count: 12949 -> 12970 (+0.16%); split: -0.43%, +0.59%
Scratch Memory Size: 401408 -> 385024 (-4.08%)

Spiderman Remastered:
Totals from 6639 (98.94% of 6710) affected shaders:
Instrs: 6877980 -> 6800592 (-1.13%); split: -3.11%, +1.98%
Cycle count: 282183352210 -> 282100051824 (-0.03%); split: -0.62%, +0.59%
Spill count: 63147 -> 64218 (+1.70%); split: -7.12%, +8.82%
Fill count: 184931 -> 175591 (-5.05%); split: -10.81%, +5.76%
Scratch Memory Size: 5318656 -> 5970944 (+12.26%); split: -5.91%, +18.17%
Max live registers: 918240 -> 906604 (-1.27%)

Strange Brigade:
Totals from 3675 (92.24% of 3984) affected shaders:
Instrs: 1462231 -> 1429345 (-2.25%); split: -2.25%, +0.00%
Cycle count: 37404050 -> 37345292 (-0.16%); split: -1.25%, +1.09%
Max live registers: 361849 -> 351265 (-2.92%)

Witcher 3:
Totals from 13 (46.43% of 28) affected shaders:
Instrs: 593 -> 660 (+11.30%)
Cycle count: 28302 -> 28714 (+1.46%)

Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28199>
1664 lines
50 KiB
C++
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_fs_live_variables.h"
#include "brw_nir.h"
#include "brw_cfg.h"
#include "brw_private.h"
#include "intel_nir.h"
#include "shader_enums.h"
#include "dev/intel_debug.h"
#include "dev/intel_wa.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"

using namespace brw;
static void
initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources);

void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
              const brw_reg *src, unsigned sources)
{
   memset((void*)this, 0, sizeof(*this));

   initialize_sources(this, src, sources);

   for (unsigned i = 0; i < sources; i++)
      this->src[i] = src[i];

   this->opcode = opcode;
   this->dst = dst;
   this->exec_size = exec_size;

   assert(dst.file != IMM && dst.file != UNIFORM);

   assert(this->exec_size != 0);

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case VGRF:
   case ADDRESS:
   case ARF:
   case FIXED_GRF:
   case ATTR:
      this->size_written = dst.component_size(exec_size);
      break;
   case BAD_FILE:
      this->size_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   }

   this->writes_accumulator = false;
}

fs_inst::fs_inst()
{
   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   init(opcode, exec_size, reg_undef, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst)
{
   init(opcode, exec_size, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
                 const brw_reg &src0)
{
   const brw_reg src[1] = { src0 };
   init(opcode, exec_size, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
                 const brw_reg &src0, const brw_reg &src1)
{
   const brw_reg src[2] = { src0, src1 };
   init(opcode, exec_size, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
                 const brw_reg &src0, const brw_reg &src1, const brw_reg &src2)
{
   const brw_reg src[3] = { src0, src1, src2 };
   init(opcode, exec_size, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const brw_reg &dst,
                 const brw_reg src[], unsigned sources)
{
   init(opcode, exec_width, dst, src, sources);
}

fs_inst::fs_inst(const fs_inst &that)
{
   memcpy((void*)this, &that, sizeof(that));
   initialize_sources(this, that.src, that.sources);
}

fs_inst::~fs_inst()
{
   if (this->src != this->builtin_src)
      delete[] this->src;
}

static void
initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources)
{
   if (num_sources > ARRAY_SIZE(inst->builtin_src))
      inst->src = new brw_reg[num_sources];
   else
      inst->src = inst->builtin_src;

   for (unsigned i = 0; i < num_sources; i++)
      inst->src[i] = src[i];

   inst->sources = num_sources;
}

void
fs_inst::resize_sources(uint8_t num_sources)
{
   if (this->sources == num_sources)
      return;

   brw_reg *old_src = this->src;
   brw_reg *new_src;

   const unsigned builtin_size = ARRAY_SIZE(this->builtin_src);

   if (old_src == this->builtin_src) {
      if (num_sources > builtin_size) {
         new_src = new brw_reg[num_sources];
         for (unsigned i = 0; i < this->sources; i++)
            new_src[i] = old_src[i];

      } else {
         new_src = old_src;
      }
   } else {
      if (num_sources <= builtin_size) {
         new_src = this->builtin_src;
         assert(this->sources > num_sources);
         for (unsigned i = 0; i < num_sources; i++)
            new_src[i] = old_src[i];

      } else if (num_sources < this->sources) {
         new_src = old_src;

      } else {
         new_src = new brw_reg[num_sources];
         for (unsigned i = 0; i < this->sources; i++)
            new_src[i] = old_src[i];
      }

      if (old_src != new_src)
         delete[] old_src;
   }

   this->sources = num_sources;
   this->src = new_src;
}
bool
fs_inst::is_send_from_grf() const
{
   switch (opcode) {
   case SHADER_OPCODE_SEND:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_INTERLOCK:
   case SHADER_OPCODE_MEMORY_FENCE:
   case SHADER_OPCODE_BARRIER:
      return true;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return src[1].file == VGRF;
   default:
      return false;
   }
}

bool
fs_inst::is_control_source(unsigned arg) const
{
   switch (opcode) {
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return arg == 0;

   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_QUAD_SWIZZLE:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return arg == 1;

   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
      return arg == 1 || arg == 2;

   case SHADER_OPCODE_SEND:
      return arg == 0 || arg == 1;

   case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
   case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
   case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
      return arg != MEMORY_LOGICAL_BINDING &&
             arg != MEMORY_LOGICAL_ADDRESS &&
             arg != MEMORY_LOGICAL_DATA0 &&
             arg != MEMORY_LOGICAL_DATA1;

   case SHADER_OPCODE_QUAD_SWAP:
   case SHADER_OPCODE_INCLUSIVE_SCAN:
   case SHADER_OPCODE_EXCLUSIVE_SCAN:
   case SHADER_OPCODE_VOTE_ANY:
   case SHADER_OPCODE_VOTE_ALL:
   case SHADER_OPCODE_REDUCE:
      return arg != 0;

   default:
      return false;
   }
}

bool
fs_inst::is_payload(unsigned arg) const
{
   switch (opcode) {
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case SHADER_OPCODE_INTERLOCK:
   case SHADER_OPCODE_MEMORY_FENCE:
   case SHADER_OPCODE_BARRIER:
      return arg == 0;

   case SHADER_OPCODE_SEND:
      return arg == 2 || arg == 3;

   default:
      return false;
   }
}
bool
fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
{
   if (is_send_from_grf())
      return false;

   /* From TGL PRM Vol 2a Pg. 1053 and Pg. 1069 MAD and MUL Instructions:
    *
    * "When multiplying a DW and any lower precision integer, source modifier
    *  is not supported."
    */
   if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
                              opcode == BRW_OPCODE_MAD)) {
      const brw_reg_type exec_type = get_exec_type(this);
      const unsigned min_brw_type_size_bytes = opcode == BRW_OPCODE_MAD ?
         MIN2(brw_type_size_bytes(src[1].type), brw_type_size_bytes(src[2].type)) :
         MIN2(brw_type_size_bytes(src[0].type), brw_type_size_bytes(src[1].type));

      if (brw_type_is_int(exec_type) &&
          brw_type_size_bytes(exec_type) >= 4 &&
          brw_type_size_bytes(exec_type) != min_brw_type_size_bytes)
         return false;
   }

   switch (opcode) {
   case BRW_OPCODE_ADDC:
   case BRW_OPCODE_BFE:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_BFI2:
   case BRW_OPCODE_BFREV:
   case BRW_OPCODE_CBIT:
   case BRW_OPCODE_FBH:
   case BRW_OPCODE_FBL:
   case BRW_OPCODE_ROL:
   case BRW_OPCODE_ROR:
   case BRW_OPCODE_SUBB:
   case BRW_OPCODE_DP4A:
   case BRW_OPCODE_DPAS:
   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_REDUCE:
   case SHADER_OPCODE_INCLUSIVE_SCAN:
   case SHADER_OPCODE_EXCLUSIVE_SCAN:
   case SHADER_OPCODE_VOTE_ANY:
   case SHADER_OPCODE_VOTE_ALL:
   case SHADER_OPCODE_VOTE_EQUAL:
   case SHADER_OPCODE_BALLOT:
   case SHADER_OPCODE_QUAD_SWAP:
   case SHADER_OPCODE_READ_FROM_LIVE_CHANNEL:
   case SHADER_OPCODE_READ_FROM_CHANNEL:
      return false;
   default:
      return true;
   }
}

bool
fs_inst::can_do_cmod() const
{
   switch (opcode) {
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_ADD3:
   case BRW_OPCODE_ADDC:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_AVG:
   case BRW_OPCODE_CMP:
   case BRW_OPCODE_CMPN:
   case BRW_OPCODE_DP2:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DPH:
   case BRW_OPCODE_FRC:
   case BRW_OPCODE_LINE:
   case BRW_OPCODE_LRP:
   case BRW_OPCODE_LZD:
   case BRW_OPCODE_MAC:
   case BRW_OPCODE_MACH:
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_MUL:
   case BRW_OPCODE_NOT:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_PLN:
   case BRW_OPCODE_RNDD:
   case BRW_OPCODE_RNDE:
   case BRW_OPCODE_RNDU:
   case BRW_OPCODE_RNDZ:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SUBB:
   case BRW_OPCODE_XOR:
      break;
   default:
      return false;
   }

   /* The accumulator result appears to get used for the conditional modifier
    * generation. When negating a UD value, there is a 33rd bit generated for
    * the sign in the accumulator value, so now you can't check, for example,
    * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
    */
   for (unsigned i = 0; i < sources; i++) {
      if (brw_type_is_uint(src[i].type) && src[i].negate)
         return false;
   }

   if (dst.file == ARF && dst.nr == BRW_ARF_SCALAR && src[0].file == IMM)
      return false;

   return true;
}
bool
fs_inst::can_change_types() const
{
   return dst.type == src[0].type &&
          !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
          (opcode == BRW_OPCODE_MOV ||
           (opcode == SHADER_OPCODE_LOAD_PAYLOAD && sources == 1) ||
           (opcode == BRW_OPCODE_SEL &&
            dst.type == src[1].type &&
            predicate != BRW_PREDICATE_NONE &&
            !src[1].abs && !src[1].negate && src[1].file != ATTR));
}

void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
                         dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);

   this->fail_msg = msg;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "%s", msg);
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}

/**
 * Mark this program as impossible to compile with dispatch width greater
 * than n.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
 * SIMD16+ compile altogether.
 *
 * During a compile of dispatch width greater than n (if one happens anyway),
 * this just calls fail().
 */
void
fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
{
   if (dispatch_width > n) {
      fail("%s", msg);
   } else {
      max_dispatch_width = MIN2(max_dispatch_width, n);
      brw_shader_perf_log(compiler, log_data,
                          "Shader dispatch width limited to SIMD%d: %s\n",
                          n, msg);
   }
}
/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write() const
{
   if (this->predicate && !this->predicate_trivial &&
       this->opcode != BRW_OPCODE_SEL)
      return true;

   if (!this->dst.is_contiguous())
      return true;

   if (this->dst.offset % REG_SIZE != 0)
      return true;

   return this->size_written % REG_SIZE != 0;
}
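
/* Illustrative examples (added note, not from the original source): a SIMD8
 * MOV writing packed 16-bit values only writes 8 * 2 = 16 bytes, which is not
 * a whole 32-byte register, so it counts as a partial write.  A predicated
 * non-SEL write is also partial, since disabled channels keep their previous
 * contents.
 */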

unsigned
fs_inst::components_read(unsigned i) const
{
   /* Return zero if the source is not present. */
   if (src[i].file == BAD_FILE)
      return 0;

   switch (opcode) {
   case BRW_OPCODE_PLN:
      return i == 0 ? 1 : 2;

   case FS_OPCODE_PIXEL_X:
   case FS_OPCODE_PIXEL_Y:
      assert(i < 2);
      if (i == 0)
         return 2;
      else
         return 1;

   case FS_OPCODE_FB_WRITE_LOGICAL:
      assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
      /* First/second FB write color. */
      if (i < 2)
         return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
      else
         return 1;

   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXD_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
      assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
             src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
             src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
      /* Texture coordinates. */
      if (i == TEX_LOGICAL_SRC_COORDINATE)
         return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
      /* Texture derivatives. */
      else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
               opcode == SHADER_OPCODE_TXD_LOGICAL)
         return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
      /* Texture offset. */
      else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
         return 2;
      /* MCS */
      else if (i == TEX_LOGICAL_SRC_MCS) {
         if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
            return 2;
         else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
            return 4;
         else
            return 1;
      } else
         return 1;

   case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
      if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
         return 0;
      /* fallthrough */
   case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
      if (i == MEMORY_LOGICAL_DATA1)
         return 0;
      /* fallthrough */
   case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
      if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
         return src[MEMORY_LOGICAL_COMPONENTS].ud;
      else if (i == MEMORY_LOGICAL_ADDRESS)
         return src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
      else
         return 1;

   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return (i == 0 ? 2 : 1);

   case SHADER_OPCODE_URB_WRITE_LOGICAL:
      assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);

      if (i == URB_LOGICAL_SRC_DATA)
         return src[URB_LOGICAL_SRC_COMPONENTS].ud;
      else
         return 1;

   case BRW_OPCODE_DPAS:
      unreachable("Do not use components_read() for DPAS.");

   default:
      return 1;
   }
}
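
/* Illustrative example (added note, not from the original source): for a
 * SHADER_OPCODE_TXD_LOGICAL sampling a 3D image with 3-component gradients,
 * components_read(TEX_LOGICAL_SRC_COORDINATE) returns 3 and
 * components_read() of the LOD/LOD2 slots returns the value of
 * TEX_LOGICAL_SRC_GRAD_COMPONENTS, i.e. 3 as well.
 */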

unsigned
fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const
{
   switch (opcode) {
   case SHADER_OPCODE_SEND:
      if (arg == 2) {
         return mlen * REG_SIZE;
      } else if (arg == 3) {
         return ex_mlen * REG_SIZE;
      }
      break;

   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      if (arg == 0)
         return mlen * REG_SIZE;
      break;

   case BRW_OPCODE_PLN:
      if (arg == 0)
         return 16;
      break;

   case SHADER_OPCODE_LOAD_PAYLOAD:
      if (arg < this->header_size)
         return retype(src[arg], BRW_TYPE_UD).component_size(8);
      break;

   case SHADER_OPCODE_BARRIER:
      return REG_SIZE;

   case SHADER_OPCODE_MOV_INDIRECT:
      if (arg == 0) {
         assert(src[2].file == IMM);
         return src[2].ud;
      }
      break;

   case BRW_OPCODE_DPAS: {
      /* This is a little bit sketchy. There's no way to get at devinfo from
       * here, so the regular reg_unit() cannot be used. However, on
       * reg_unit() == 1 platforms, DPAS exec_size must be 8, and on known
       * reg_unit() == 2 platforms, DPAS exec_size must be 16. This is not a
       * coincidence, so this isn't so bad.
       */
      const unsigned reg_unit = this->exec_size / 8;

      switch (arg) {
      case 0:
         if (src[0].type == BRW_TYPE_HF) {
            return rcount * reg_unit * REG_SIZE / 2;
         } else {
            return rcount * reg_unit * REG_SIZE;
         }
      case 1:
         return sdepth * reg_unit * REG_SIZE;
      case 2:
         /* This is simpler than the formula described in the Bspec, but it
          * covers all of the cases that we support. Each inner sdepth
          * iteration of the DPAS consumes a single dword for int8, uint8, or
          * float16 types. These are the one source types currently
          * supportable through Vulkan. This is independent of reg_unit.
          */
         return rcount * sdepth * 4;
      default:
         unreachable("Invalid source number.");
      }
      break;
   }

   default:
      break;
   }

   switch (src[arg].file) {
   case UNIFORM:
   case IMM:
      return components_read(arg) * brw_type_size_bytes(src[arg].type);
   case BAD_FILE:
   case ADDRESS:
   case ARF:
   case FIXED_GRF:
   case VGRF:
   case ATTR:
      /* Regardless of exec_size, values marked as scalar are SIMD8. */
      return components_read(arg) *
             src[arg].component_size(src[arg].is_scalar ? 8 * reg_unit(devinfo) : exec_size);
   }
   return 0;
}
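
/* Worked example (added for illustration): a DPAS with exec_size 16 (so the
 * local reg_unit above is 2), rcount 8 and an HF src0 reads
 * 8 * 2 * REG_SIZE / 2 = 256 bytes for argument 0, while argument 2 reads
 * rcount * sdepth * 4 bytes regardless of the platform's register size.
 */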

namespace {
   unsigned
   predicate_width(const intel_device_info *devinfo, brw_predicate predicate)
   {
      if (devinfo->ver >= 20) {
         return 1;
      } else {
         switch (predicate) {
         case BRW_PREDICATE_NONE:            return 1;
         case BRW_PREDICATE_NORMAL:          return 1;
         case BRW_PREDICATE_ALIGN1_ANY2H:    return 2;
         case BRW_PREDICATE_ALIGN1_ALL2H:    return 2;
         case BRW_PREDICATE_ALIGN1_ANY4H:    return 4;
         case BRW_PREDICATE_ALIGN1_ALL4H:    return 4;
         case BRW_PREDICATE_ALIGN1_ANY8H:    return 8;
         case BRW_PREDICATE_ALIGN1_ALL8H:    return 8;
         case BRW_PREDICATE_ALIGN1_ANY16H:   return 16;
         case BRW_PREDICATE_ALIGN1_ALL16H:   return 16;
         case BRW_PREDICATE_ALIGN1_ANY32H:   return 32;
         case BRW_PREDICATE_ALIGN1_ALL32H:   return 32;
         default: unreachable("Unsupported predicate");
         }
      }
   }
}

unsigned
fs_inst::flags_read(const intel_device_info *devinfo) const
{
   if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
                             predicate == BRW_PREDICATE_ALIGN1_ALLV)) {
      /* The vertical predication modes combine corresponding bits from
       * f0.0 and f1.0 on Gfx7+.
       */
      const unsigned shift = 4;
      return brw_fs_flag_mask(this, 1) << shift | brw_fs_flag_mask(this, 1);
   } else if (predicate) {
      return brw_fs_flag_mask(this, predicate_width(devinfo, predicate));
   } else {
      unsigned mask = 0;
      for (int i = 0; i < sources; i++) {
         mask |= brw_fs_flag_mask(src[i], size_read(devinfo, i));
      }
      return mask;
   }
}

unsigned
fs_inst::flags_written(const intel_device_info *devinfo) const
{
   if (conditional_mod && (opcode != BRW_OPCODE_SEL &&
                           opcode != BRW_OPCODE_CSEL &&
                           opcode != BRW_OPCODE_IF &&
                           opcode != BRW_OPCODE_WHILE)) {
      return brw_fs_flag_mask(this, 1);
   } else if (opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
      return brw_fs_flag_mask(this, 32);
   } else {
      return brw_fs_flag_mask(dst, size_written);
   }
}

bool
fs_inst::has_sampler_residency() const
{
   switch (opcode) {
   case SHADER_OPCODE_TEX_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case SHADER_OPCODE_TXD_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
      assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
      return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
   default:
      return false;
   }
}

/* \sa inst_is_raw_move in brw_eu_validate. */
bool
fs_inst::is_raw_move() const
{
   if (opcode != BRW_OPCODE_MOV)
      return false;

   if (src[0].file == IMM) {
      if (brw_type_is_vector_imm(src[0].type))
         return false;
   } else if (src[0].negate || src[0].abs) {
      return false;
   }

   if (saturate)
      return false;

   return src[0].type == dst.type ||
          (brw_type_is_int(src[0].type) &&
           brw_type_is_int(dst.type) &&
           brw_type_size_bits(src[0].type) == brw_type_size_bits(dst.type));
}

bool
fs_inst::uses_address_register_implicitly() const
{
   switch (opcode) {
   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_MOV_INDIRECT:
      return true;
   default:
      return false;
   }
}
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   this->uniforms = v->uniforms;
}

enum intel_barycentric_mode
brw_barycentric_mode(const struct brw_wm_prog_key *key,
                     nir_intrinsic_instr *intr)
{
   const glsl_interp_mode mode =
      (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);

   /* Barycentric modes don't make sense for flat inputs. */
   assert(mode != INTERP_MODE_FLAT);

   unsigned bary;
   switch (intr->intrinsic) {
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_at_offset:
      /* When per sample interpolation is dynamic, assume sample
       * interpolation. We'll dynamically remap things so that the FS thread
       * payload is not affected.
       */
      bary = key->persample_interp == INTEL_SOMETIMES ?
             INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE :
             INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL;
      break;
   case nir_intrinsic_load_barycentric_centroid:
      bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID;
      break;
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_at_sample:
      bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE;
      break;
   default:
      unreachable("invalid intrinsic");
   }

   if (mode == INTERP_MODE_NOPERSPECTIVE)
      bary += 3;

   return (enum intel_barycentric_mode) bary;
}
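
/* Added note: the "bary += 3" above relies on the layout of
 * enum intel_barycentric_mode, where each non-perspective mode is assumed to
 * sit exactly three entries after its perspective counterpart.
 */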

/**
 * Walk backwards from the end of the program looking for a URB write that
 * isn't in control flow, and mark it with EOT.
 *
 * Return true if successful or false if a separate EOT write is needed.
 */
bool
fs_visitor::mark_last_urb_write_with_eot()
{
   foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
      if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
         prev->eot = true;

         /* Delete now dead instructions. */
         foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
            if (dead == prev)
               break;
            dead->remove();
         }
         return true;
      } else if (prev->is_control_flow() || prev->has_side_effects()) {
         break;
      }
   }

   return false;
}

static unsigned
round_components_to_whole_registers(const intel_device_info *devinfo,
                                    unsigned c)
{
   return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo);
}
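
/* Worked example (added for illustration): with 20 push constant components,
 * a reg_unit() == 1 platform gets DIV_ROUND_UP(20, 8) * 1 = 3 registers,
 * while a reg_unit() == 2 platform gets DIV_ROUND_UP(20, 16) * 2 = 4, since a
 * physical register there spans two 32-byte units.
 */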

void
fs_visitor::assign_curb_setup()
{
   unsigned uniform_push_length =
      round_components_to_whole_registers(devinfo, prog_data->nr_params);

   unsigned ubo_push_length = 0;
   unsigned ubo_push_start[4];
   for (int i = 0; i < 4; i++) {
      ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
      ubo_push_length += prog_data->ubo_ranges[i].length;

      assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0);
      assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0);
   }

   prog_data->curb_read_length = uniform_push_length + ubo_push_length;
   if (stage == MESA_SHADER_FRAGMENT &&
       ((struct brw_wm_prog_key *)key)->null_push_constant_tbimr_workaround)
      prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);

   uint64_t used = 0;
   bool is_compute = gl_shader_stage_is_compute(stage);

   if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
      assert(devinfo->has_lsc);
      fs_builder ubld = fs_builder(this, 1).exec_all().at(
         cfg->first_block(), cfg->first_block()->start());

      /* The base offset for our push data is passed in as R0.0[31:6]. We have
       * to mask off the bottom 6 bits.
       */
      brw_reg base_addr =
         ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
                  brw_imm_ud(INTEL_MASK(31, 6)));

      /* On Gfx12-HP we load constants at the start of the program using A32
       * stateless messages.
       */
      for (unsigned i = 0; i < uniform_push_length;) {
         /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
         unsigned num_regs = MIN2(uniform_push_length - i, 8);
         assert(num_regs > 0);
         num_regs = 1 << util_logbase2(num_regs);
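         /* Added note: util_logbase2() rounds the transfer down to a power of
          * two, so e.g. 11 pending registers are fetched as loads of 8, 2 and
          * 1 GRFs on successive iterations (illustrative example).
          */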

         /* This pass occurs after all of the optimization passes, so don't
          * emit an 'ADD addr, base_addr, 0' instruction.
          */
         brw_reg addr = i == 0 ? base_addr :
            ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));

         brw_reg srcs[4] = {
            brw_imm_ud(0), /* desc */
            brw_imm_ud(0), /* ex_desc */
            addr,          /* payload */
            brw_reg(),     /* payload2 */
         };

         brw_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
                               BRW_TYPE_UD);
         fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);

         send->sfid = GFX12_SFID_UGM;
         uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                                      LSC_ADDR_SURFTYPE_FLAT,
                                      LSC_ADDR_SIZE_A32,
                                      LSC_DATA_SIZE_D32,
                                      num_regs * 8 /* num_channels */,
                                      true /* transpose */,
                                      LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
         send->header_size = 0;
         send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
         send->size_written =
            lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
         send->send_is_volatile = true;

         send->src[0] = brw_imm_ud(desc |
                                   brw_message_desc(devinfo,
                                                    send->mlen,
                                                    send->size_written / REG_SIZE,
                                                    send->header_size));

         i += num_regs;
      }

      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
            int constant_nr;
            if (inst->src[i].nr >= UBO_START) {
               /* constant_nr is in 32-bit units, the rest are in bytes */
               constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
                             inst->src[i].offset / 4;
            } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = uniform_nr;
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                *  values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            assert(constant_nr / 8 < 64);
            used |= BITFIELD64_BIT(constant_nr / 8);

            struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);
            brw_reg.abs = inst->src[i].abs;
            brw_reg.negate = inst->src[i].negate;

            /* The combination of is_scalar for load_uniform, copy prop, and
             * lower_btd_logical_send can generate a MOV from a UNIFORM with
             * exec size 2 and stride of 1.
             */
            assert(inst->src[i].stride == 0 || inst->exec_size == 2);
            inst->src[i] = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].offset % 4);
         }
      }
   }

   uint64_t want_zero = used & prog_data->zero_push_reg;
   if (want_zero) {
      fs_builder ubld = fs_builder(this, 8).exec_all().at(
         cfg->first_block(), cfg->first_block()->start());

      /* push_reg_mask_param is in 32-bit units */
      unsigned mask_param = prog_data->push_reg_mask_param;
      struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
                                         mask_param % 8);

      brw_reg b32;
      for (unsigned i = 0; i < 64; i++) {
         if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
            brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
            ubld.SHL(horiz_offset(shifted, 8),
                     byte_offset(retype(mask, BRW_TYPE_W), i / 8),
                     brw_imm_v(0x01234567));
            ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));

            fs_builder ubld16 = ubld.group(16, 0);
            b32 = ubld16.vgrf(BRW_TYPE_D);
            ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
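            /* Added note: the two SHL and the ASR above expand the 16 mask
             * bits covering this block of push registers into 16 dwords of
             * all-zeros or all-ones in b32, which the AND below uses to clear
             * registers whose mask bit is zero.
             */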
         }

         if (want_zero & BITFIELD64_BIT(i)) {
            assert(i < prog_data->curb_read_length);
            struct brw_reg push_reg =
               retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);

            ubld.AND(push_reg, push_reg, component(b32, i % 16));
         }
      }

      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
   this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
}

/*
 * Build up an array of indices into the urb_setup array that
 * references the active entries of the urb_setup array.
 * Used to accelerate walking the active entries of the urb_setup array
 * on each upload.
 */
void
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
{
   /* TODO(mesh): Review usage of this in the context of Mesh, we may want to
    * skip per-primitive attributes here.
    */

   /* Make sure uint8_t is sufficient */
   STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
   uint8_t index = 0;
   for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
      if (wm_prog_data->urb_setup[attr] >= 0) {
         wm_prog_data->urb_setup_attribs[index++] = attr;
      }
   }
   wm_prog_data->urb_setup_attribs_count = index;
}
void
fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
{
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file == ATTR) {
         assert(inst->src[i].nr == 0);
         int grf = payload().num_regs +
                   prog_data->curb_read_length +
                   inst->src[i].offset / REG_SIZE;

         /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
          *
          * VertStride must be used to cross GRF register boundaries. This
          * rule implies that elements within a 'Width' cannot cross GRF
          * boundaries.
          *
          * So, for registers that are large enough, we have to split the exec
          * size in two and trust the compression state to sort it out.
          */
         unsigned total_size = inst->exec_size *
                               inst->src[i].stride *
                               brw_type_size_bytes(inst->src[i].type);

         assert(total_size <= 2 * REG_SIZE);
         const unsigned exec_size =
            (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;

         unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
         struct brw_reg reg =
            stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                               inst->src[i].offset % REG_SIZE),
                   exec_size * inst->src[i].stride,
                   width, inst->src[i].stride);
         reg.abs = inst->src[i].abs;
         reg.negate = inst->src[i].negate;

         inst->src[i] = reg;
      }
   }
}

int
brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
                                const brw_stage_prog_data *prog_data)
{
   if (prog_data->nr_params == 0)
      return -1;

   if (devinfo->verx10 >= 125)
      return -1;

   /* The local thread id is always the last parameter in the list */
   uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
   if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
      return prog_data->nr_params - 1;

   return -1;
}

/**
 * Get the mask of SIMD channels enabled during dispatch and not yet disabled
 * by discard.  Due to the layout of the sample mask in the fragment shader
 * thread payload, \p bld is required to have a dispatch_width() not greater
 * than 16 for fragment shaders.
 */
brw_reg
brw_sample_mask_reg(const fs_builder &bld)
{
   const fs_visitor &s = *bld.shader;

   if (s.stage != MESA_SHADER_FRAGMENT) {
      return brw_imm_ud(0xffffffff);
   } else if (s.devinfo->ver >= 20 ||
              brw_wm_prog_data(s.prog_data)->uses_kill) {
      return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
   } else {
      assert(bld.dispatch_width() <= 16);
      assert(s.devinfo->ver < 20);
      return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
                    BRW_TYPE_UW);
   }
}

uint32_t
brw_fb_write_msg_control(const fs_inst *inst,
                         const struct brw_wm_prog_data *prog_data)
{
   uint32_t mctl;

   if (prog_data->dual_src_blend) {
      assert(inst->exec_size < 32);

      if (inst->group % 16 == 0)
         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
      else if (inst->group % 16 == 8)
         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
      else
         unreachable("Invalid dual-source FB write instruction group");
   } else {
      assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));

      if (inst->exec_size == 16)
         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
      else if (inst->exec_size == 8)
         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
      else if (inst->exec_size == 32)
         mctl = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
      else
         unreachable("Invalid FB write execution size");
   }

   return mctl;
}
/**
 * Predicate the specified instruction on the sample mask.
 */
void
brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
{
   assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
          bld.group() == inst->group &&
          bld.dispatch_width() == inst->exec_size);

   const fs_visitor &s = *bld.shader;
   const brw_reg sample_mask = brw_sample_mask_reg(bld);
   const unsigned subreg = sample_mask_flag_subreg(s);

   if (s.devinfo->ver >= 20 || brw_wm_prog_data(s.prog_data)->uses_kill) {
      assert(sample_mask.file == ARF &&
             sample_mask.nr == brw_flag_subreg(subreg).nr &&
             sample_mask.subnr == brw_flag_subreg(
                subreg + inst->group / 16).subnr);
   } else {
      bld.group(1, 0).exec_all()
         .MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
   }

   if (inst->predicate) {
      assert(inst->predicate == BRW_PREDICATE_NORMAL);
      assert(!inst->predicate_inverse);
      assert(inst->flag_subreg == 0);
      assert(s.devinfo->ver < 20);
      /* Combine the sample mask with the existing predicate by using a
       * vertical predication mode.
       */
      inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
   } else {
      inst->flag_subreg = subreg;
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->predicate_inverse = false;
   }
}

brw::register_pressure::register_pressure(const fs_visitor *v)
{
   const fs_live_variables &live = v->live_analysis.require();
   const unsigned num_instructions = v->cfg->num_blocks ?
      v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;

   regs_live_at_ip = new unsigned[num_instructions]();

   for (unsigned reg = 0; reg < v->alloc.count; reg++) {
      for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
         regs_live_at_ip[ip] += v->alloc.sizes[reg];
   }

   const unsigned payload_count = v->first_non_payload_grf;

   int *payload_last_use_ip = new int[payload_count];
   v->calculate_payload_ranges(true, payload_count, payload_last_use_ip);

   for (unsigned reg = 0; reg < payload_count; reg++) {
      for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
         ++regs_live_at_ip[ip];
   }

   delete[] payload_last_use_ip;
}

brw::register_pressure::~register_pressure()
{
   delete[] regs_live_at_ip;
}

void
fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
{
   live_analysis.invalidate(c);
   regpressure_analysis.invalidate(c);
   performance_analysis.invalidate(c);
   idom_analysis.invalidate(c);
   def_analysis.invalidate(c);
}

void
fs_visitor::debug_optimizer(const nir_shader *nir,
                            const char *pass_name,
                            int iteration, int pass_num) const
{
   if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
      return;

   char *filename;
   int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
                      debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
                      _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
                      iteration, pass_num, pass_name);
   if (ret == -1)
      return;

   FILE *file = stderr;
   if (__normal_user()) {
      file = fopen(filename, "w");
      if (!file)
         file = stderr;
   }

   brw_print_instructions(*this, file);

   if (file != stderr)
      fclose(file);

   free(filename);
}
static uint32_t
brw_compute_max_register_pressure(fs_visitor &s)
{
   const register_pressure &rp = s.regpressure_analysis.require();
   uint32_t ip = 0, max_pressure = 0;
   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
      max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
      ip++;
   }
   return max_pressure;
}

static fs_inst **
save_instruction_order(const struct cfg_t *cfg)
{
   /* Before we schedule anything, stash off the instruction order as an array
    * of fs_inst *.  This way, we can reset it between scheduling passes to
    * prevent dependencies between the different scheduling modes.
    */
   int num_insts = cfg->last_block()->end_ip + 1;
   fs_inst **inst_arr = new fs_inst * [num_insts];

   int ip = 0;
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      assert(ip >= block->start_ip && ip <= block->end_ip);
      inst_arr[ip++] = inst;
   }
   assert(ip == num_insts);

   return inst_arr;
}

static void
restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr)
{
   ASSERTED int num_insts = cfg->last_block()->end_ip + 1;

   int ip = 0;
   foreach_block (block, cfg) {
      block->instructions.make_empty();

      assert(ip == block->start_ip);
      for (; ip <= block->end_ip; ip++)
         block->instructions.push_tail(inst_arr[ip]);
   }
   assert(ip == num_insts);
}
/* Per-thread scratch space is a power-of-two multiple of 1KB. */
static inline unsigned
brw_get_scratch_size(int size)
{
   return MAX2(1024, util_next_power_of_two(size));
}
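
/* Worked example (added for illustration): brw_get_scratch_size(1500) returns
 * 2048, and anything at or below 1024 bytes is rounded up to the 1KB minimum.
 */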

void
brw_allocate_registers(fs_visitor &s, bool allow_spilling)
{
   const struct intel_device_info *devinfo = s.devinfo;
   const nir_shader *nir = s.nir;
   bool allocated;

   static const enum instruction_scheduler_mode pre_modes[] = {
      SCHEDULE_PRE,
      SCHEDULE_PRE_NON_LIFO,
      SCHEDULE_NONE,
      SCHEDULE_PRE_LIFO,
   };

   static const char *scheduler_mode_name[] = {
      [SCHEDULE_PRE] = "top-down",
      [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
      [SCHEDULE_PRE_LIFO] = "lifo",
      [SCHEDULE_POST] = "post",
      [SCHEDULE_NONE] = "none",
   };

   uint32_t best_register_pressure = UINT32_MAX;
   enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;

   brw_opt_compact_virtual_grfs(s);

   if (s.needs_register_pressure)
      s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);

   s.debug_optimizer(nir, "pre_register_allocate", 90, 90);

   bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);

   /* Before we schedule anything, stash off the instruction order as an array
    * of fs_inst *.  This way, we can reset it between scheduling passes to
    * prevent dependencies between the different scheduling modes.
    */
   fs_inst **orig_order = save_instruction_order(s.cfg);
   fs_inst **best_pressure_order = NULL;

   void *scheduler_ctx = ralloc_context(NULL);
   instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);

   /* Try each scheduling heuristic to see if it can successfully register
    * allocate without spilling.  They should be ordered by decreasing
    * performance but increasing likelihood of allocating.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
      enum instruction_scheduler_mode sched_mode = pre_modes[i];

      brw_schedule_instructions_pre_ra(s, sched, sched_mode);
      s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];

      s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);

      if (0) {
         brw_assign_regs_trivial(s);
         allocated = true;
         break;
      }

      /* We should only spill registers on the last scheduling. */
      assert(!s.spilled_any_registers);

      allocated = brw_assign_regs(s, false, spill_all);
      if (allocated)
         break;

      /* Save the maximum register pressure */
      uint32_t this_pressure = brw_compute_max_register_pressure(s);

      if (0) {
         fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
                 scheduler_mode_name[sched_mode], this_pressure);
      }

      if (this_pressure < best_register_pressure) {
         best_register_pressure = this_pressure;
         best_sched = sched_mode;
         delete[] best_pressure_order;
         best_pressure_order = save_instruction_order(s.cfg);
      }

      /* Reset back to the original order before trying the next mode */
      restore_instruction_order(s.cfg, orig_order);
      s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   ralloc_free(scheduler_ctx);

   if (!allocated) {
      if (0) {
         fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
                 scheduler_mode_name[best_sched]);
      }
      restore_instruction_order(s.cfg, best_pressure_order);
      s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];

      allocated = brw_assign_regs(s, allow_spilling, spill_all);
   }

   delete[] orig_order;
   delete[] best_pressure_order;

   if (!allocated) {
      s.fail("Failure to register allocate.  Reduce number of "
             "live scalar values to avoid this.");
   } else if (s.spilled_any_registers) {
      brw_shader_perf_log(s.compiler, s.log_data,
                          "%s shader triggered register spilling.  "
                          "Try reducing the number of live scalar "
                          "values to improve performance.\n",
                          _mesa_shader_stage_to_string(s.stage));
   }

   if (s.failed)
      return;

   int pass_num = 0;

   s.debug_optimizer(nir, "post_ra_alloc", 96, pass_num++);

   brw_opt_bank_conflicts(s);

   s.debug_optimizer(nir, "bank_conflict", 96, pass_num++);

   brw_schedule_instructions_post_ra(s);

   s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, pass_num++);

   /* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
    * of part of assign_regs since both bank conflicts optimization and post
    * RA scheduling take advantage of distinguishing references to registers
    * that were allocated from references that were already fixed.
    *
    * TODO: Change the passes above, then move this lowering to be part of
    * assign_regs.
    */
   brw_lower_vgrfs_to_fixed_grfs(s);

   s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, pass_num++);

   brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_REGALLOC);

   if (s.last_scratch > 0) {
      /* We currently only support up to 2MB of scratch space.  If we
       * need to support more eventually, the documentation suggests
       * that we could allocate a larger buffer, and partition it out
       * ourselves.  We'd just have to undo the hardware's address
       * calculation by subtracting (FFTID * Per Thread Scratch Space)
       * and then add FFTID * (Larger Per Thread Scratch Space).
       *
       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
       * Thread Group Tracking > Local Memory/Scratch Space.
       */
      if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
         /* Take the max of any previously compiled variant of the shader. In the
          * case of bindless shaders with return parts, this will also take the
          * max of all parts.
          */
         s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
                                           s.prog_data->total_scratch);
      } else {
         s.fail("Scratch space required is larger than supported");
      }
   }

   if (s.failed)
      return;

   brw_lower_scoreboard(s);

   s.debug_optimizer(nir, "scoreboard", 96, pass_num++);
}
unsigned
brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
                             unsigned threads)
{
   assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
   assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
   return cs_prog_data->push.per_thread.size * threads +
          cs_prog_data->push.cross_thread.size;
}

struct intel_cs_dispatch_info
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
                         const struct brw_cs_prog_data *prog_data,
                         const unsigned *override_local_size)
{
   struct intel_cs_dispatch_info info = {};

   const unsigned *sizes =
      override_local_size ? override_local_size :
      prog_data->local_size;

   const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
   assert(simd >= 0 && simd < 3);

   info.group_size = sizes[0] * sizes[1] * sizes[2];
   info.simd_size = 8u << simd;
   info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);

   const uint32_t remainder = info.group_size & (info.simd_size - 1);
   if (remainder > 0)
      info.right_mask = ~0u >> (32 - remainder);
   else
      info.right_mask = ~0u >> (32 - info.simd_size);

   return info;
}
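
/* Worked example (added for illustration): a 50-invocation workgroup
 * dispatched at SIMD16 needs DIV_ROUND_UP(50, 16) = 4 threads; the remainder
 * of 50 & 15 = 2 gives right_mask = 0x3, enabling only two channels in the
 * last thread.
 */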

void
brw_shader_phase_update(fs_visitor &s, enum brw_shader_phase phase)
{
   assert(phase == s.phase + 1);
   s.phase = phase;
   brw_fs_validate(s);
}

bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
{
   return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
}

namespace brw {
   brw_reg
   fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
                     brw_reg_type type, unsigned n)
   {
      if (!regs[0])
         return brw_reg();

      if (bld.dispatch_width() > 16) {
         const brw_reg tmp = bld.vgrf(type, n);
         const brw::fs_builder hbld = bld.exec_all().group(16, 0);
         const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
         brw_reg *const components = new brw_reg[m * n];

         for (unsigned c = 0; c < n; c++) {
            for (unsigned g = 0; g < m; g++)
               components[c * m + g] =
                  offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c);
         }

         hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);

         delete[] components;
         return tmp;

      } else {
         return brw_reg(retype(brw_vec8_grf(regs[0], 0), type));
      }
   }

   brw_reg
   fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
   {
      if (!regs[0])
         return brw_reg();
      else if (bld.shader->devinfo->ver >= 20)
         return fetch_payload_reg(bld, regs, BRW_TYPE_F, 2);

      const brw_reg tmp = bld.vgrf(BRW_TYPE_F, 2);
      const brw::fs_builder hbld = bld.exec_all().group(8, 0);
      const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
      brw_reg *const components = new brw_reg[2 * m];

      for (unsigned c = 0; c < 2; c++) {
         for (unsigned g = 0; g < m; g++)
            components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
                                           hbld, c + 2 * (g % 2));
      }

      hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);

      delete[] components;
      return tmp;
   }

   void
   check_dynamic_msaa_flag(const fs_builder &bld,
                           const struct brw_wm_prog_data *wm_prog_data,
                           enum intel_msaa_flags flag)
   {
      fs_inst *inst = bld.AND(bld.null_reg_ud(),
                              dynamic_msaa_flags(wm_prog_data),
                              brw_imm_ud(flag));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}