2010-08-10 20:39:06 -07:00
|
|
|
/*
|
|
|
|
|
* Copyright © 2010 Intel Corporation
|
|
|
|
|
*
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
*
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
* Software.
|
|
|
|
|
*
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
* IN THE SOFTWARE.
|
2011-05-24 16:45:17 -07:00
|
|
|
*/
|
|
|
|
|
|
2024-07-13 00:19:44 -07:00
|
|
|
/** @file
|
2010-08-10 20:39:06 -07:00
|
|
|
*
|
2011-05-24 16:45:17 -07:00
|
|
|
* This file drives the GLSL IR -> LIR translation, contains the
|
|
|
|
|
* optimizations on the LIR, and drives the generation of native code
|
|
|
|
|
* from the LIR.
|
2010-08-10 20:39:06 -07:00
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
#include "brw_eu.h"
|
2010-10-10 15:42:37 -07:00
|
|
|
#include "brw_fs.h"
|
2023-11-21 09:58:55 -08:00
|
|
|
#include "brw_fs_builder.h"
|
2020-01-23 22:55:33 -08:00
|
|
|
#include "brw_fs_live_variables.h"
|
2015-11-11 10:04:43 -08:00
|
|
|
#include "brw_nir.h"
|
2014-07-12 21:18:39 -07:00
|
|
|
#include "brw_cfg.h"
|
2021-10-07 00:23:07 -07:00
|
|
|
#include "brw_private.h"
|
2023-11-01 12:51:33 -07:00
|
|
|
#include "intel_nir.h"
|
2023-09-24 21:38:47 -07:00
|
|
|
#include "shader_enums.h"
|
2021-04-05 10:44:41 -07:00
|
|
|
#include "dev/intel_debug.h"
|
2023-01-20 23:19:34 -08:00
|
|
|
#include "dev/intel_wa.h"
|
2016-01-18 11:35:29 +02:00
|
|
|
#include "compiler/glsl_types.h"
|
2016-07-17 18:37:08 -07:00
|
|
|
#include "compiler/nir/nir_builder.h"
|
2018-08-21 09:46:46 -07:00
|
|
|
#include "util/u_math.h"
|
2010-08-10 20:39:06 -07:00
|
|
|
|
2015-06-03 20:36:47 +03:00
|
|
|
using namespace brw;
|
|
|
|
|
|
2024-03-21 15:42:44 -07:00
|
|
|
static void
|
2024-06-18 23:42:59 -07:00
|
|
|
initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources);
|
2024-03-21 15:42:44 -07:00
|
|
|
|
2012-07-04 13:12:50 -07:00
|
|
|
void
|
2024-06-18 23:42:59 -07:00
|
|
|
fs_inst::init(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
|
|
|
|
|
const brw_reg *src, unsigned sources)
|
2012-07-04 13:12:50 -07:00
|
|
|
{
|
2018-07-16 13:19:30 -07:00
|
|
|
memset((void*)this, 0, sizeof(*this));
|
2014-02-19 21:18:44 -08:00
|
|
|
|
2024-03-21 15:42:44 -07:00
|
|
|
initialize_sources(this, src, sources);
|
|
|
|
|
|
2015-02-06 01:14:51 +02:00
|
|
|
for (unsigned i = 0; i < sources; i++)
|
|
|
|
|
this->src[i] = src[i];
|
|
|
|
|
|
2014-05-27 10:25:05 -07:00
|
|
|
this->opcode = opcode;
|
|
|
|
|
this->dst = dst;
|
2014-08-14 13:56:24 -07:00
|
|
|
this->exec_size = exec_size;
|
|
|
|
|
|
|
|
|
|
assert(dst.file != IMM && dst.file != UNIFORM);
|
|
|
|
|
|
|
|
|
|
assert(this->exec_size != 0);
|
2014-02-19 21:18:44 -08:00
|
|
|
|
2012-07-04 13:12:50 -07:00
|
|
|
this->conditional_mod = BRW_CONDITIONAL_NONE;
|
|
|
|
|
|
2013-03-18 11:30:57 -07:00
|
|
|
/* This will be the case for almost all instructions. */
|
2014-08-18 14:27:55 -07:00
|
|
|
switch (dst.file) {
|
2015-10-26 17:09:25 -07:00
|
|
|
case VGRF:
|
2024-12-10 10:49:08 +02:00
|
|
|
case ADDRESS:
|
2015-10-26 17:52:57 -07:00
|
|
|
case ARF:
|
|
|
|
|
case FIXED_GRF:
|
2014-10-20 23:16:48 -07:00
|
|
|
case ATTR:
|
2016-09-07 13:38:20 -07:00
|
|
|
this->size_written = dst.component_size(exec_size);
|
2014-08-18 14:27:55 -07:00
|
|
|
break;
|
|
|
|
|
case BAD_FILE:
|
2016-09-07 13:38:20 -07:00
|
|
|
this->size_written = 0;
|
2014-08-18 14:27:55 -07:00
|
|
|
break;
|
|
|
|
|
case IMM:
|
|
|
|
|
case UNIFORM:
|
|
|
|
|
unreachable("Invalid destination register file");
|
|
|
|
|
}
|
2014-04-04 16:51:59 +03:00
|
|
|
|
|
|
|
|
this->writes_accumulator = false;
|
2012-07-04 13:12:50 -07:00
|
|
|
}
|
|
|
|
|
|
2014-08-14 13:56:24 -07:00
|
|
|
fs_inst::fs_inst()
|
|
|
|
|
{
|
2015-02-06 01:14:51 +02:00
|
|
|
init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
|
2014-08-14 13:56:24 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
|
|
|
|
|
{
|
2015-02-06 01:14:51 +02:00
|
|
|
init(opcode, exec_size, reg_undef, NULL, 0);
|
2014-08-14 13:56:24 -07:00
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst)
|
2012-07-04 13:12:50 -07:00
|
|
|
{
|
2015-06-18 12:30:43 -07:00
|
|
|
init(opcode, exec_size, dst, NULL, 0);
|
2014-08-14 13:56:24 -07:00
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
|
|
|
|
|
const brw_reg &src0)
|
2014-08-14 13:56:24 -07:00
|
|
|
{
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg src[1] = { src0 };
|
2014-08-14 13:56:24 -07:00
|
|
|
init(opcode, exec_size, dst, src, 1);
|
2012-07-04 13:12:50 -07:00
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
|
|
|
|
|
const brw_reg &src0, const brw_reg &src1)
|
2014-08-14 13:56:24 -07:00
|
|
|
{
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg src[2] = { src0, src1 };
|
2014-08-14 13:56:24 -07:00
|
|
|
init(opcode, exec_size, dst, src, 2);
|
2012-07-04 13:12:50 -07:00
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
|
|
|
|
|
const brw_reg &src0, const brw_reg &src1, const brw_reg &src2)
|
2014-08-14 13:56:24 -07:00
|
|
|
{
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg src[3] = { src0, src1, src2 };
|
2014-08-14 13:56:24 -07:00
|
|
|
init(opcode, exec_size, dst, src, 3);
|
2012-07-04 13:12:50 -07:00
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const brw_reg &dst,
|
|
|
|
|
const brw_reg src[], unsigned sources)
|
2014-08-14 13:56:24 -07:00
|
|
|
{
|
|
|
|
|
init(opcode, exec_width, dst, src, sources);
|
2014-05-26 18:44:17 -07:00
|
|
|
}
|
|
|
|
|
|
2014-02-20 09:40:02 -08:00
|
|
|
fs_inst::fs_inst(const fs_inst &that)
|
|
|
|
|
{
|
2018-07-16 13:19:30 -07:00
|
|
|
memcpy((void*)this, &that, sizeof(that));
|
2024-03-21 15:42:44 -07:00
|
|
|
initialize_sources(this, that.src, that.sources);
|
2014-02-20 09:40:02 -08:00
|
|
|
}
|
|
|
|
|
|
2015-02-06 01:14:51 +02:00
|
|
|
fs_inst::~fs_inst()
|
|
|
|
|
{
|
2024-03-21 15:42:44 -07:00
|
|
|
if (this->src != this->builtin_src)
|
|
|
|
|
delete[] this->src;
|
2015-02-06 01:14:51 +02:00
|
|
|
}
|
|
|
|
|
|
2024-03-21 15:42:44 -07:00
|
|
|
static void
|
2024-06-18 23:42:59 -07:00
|
|
|
initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources)
|
2014-02-20 13:14:05 -08:00
|
|
|
{
|
2024-03-21 15:42:44 -07:00
|
|
|
if (num_sources > ARRAY_SIZE(inst->builtin_src))
|
2024-06-18 23:42:59 -07:00
|
|
|
inst->src = new brw_reg[num_sources];
|
2024-03-21 15:42:44 -07:00
|
|
|
else
|
|
|
|
|
inst->src = inst->builtin_src;
|
2015-02-06 01:14:51 +02:00
|
|
|
|
2024-03-21 15:42:44 -07:00
|
|
|
for (unsigned i = 0; i < num_sources; i++)
|
|
|
|
|
inst->src[i] = src[i];
|
2015-02-06 01:14:51 +02:00
|
|
|
|
2024-03-21 15:42:44 -07:00
|
|
|
inst->sources = num_sources;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
fs_inst::resize_sources(uint8_t num_sources)
|
|
|
|
|
{
|
|
|
|
|
if (this->sources == num_sources)
|
|
|
|
|
return;
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg *old_src = this->src;
|
|
|
|
|
brw_reg *new_src;
|
2024-03-21 15:42:44 -07:00
|
|
|
|
|
|
|
|
const unsigned builtin_size = ARRAY_SIZE(this->builtin_src);
|
|
|
|
|
|
|
|
|
|
if (old_src == this->builtin_src) {
|
|
|
|
|
if (num_sources > builtin_size) {
|
2024-06-18 23:42:59 -07:00
|
|
|
new_src = new brw_reg[num_sources];
|
2024-03-21 15:42:44 -07:00
|
|
|
for (unsigned i = 0; i < this->sources; i++)
|
|
|
|
|
new_src[i] = old_src[i];
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
new_src = old_src;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
if (num_sources <= builtin_size) {
|
|
|
|
|
new_src = this->builtin_src;
|
|
|
|
|
assert(this->sources > num_sources);
|
|
|
|
|
for (unsigned i = 0; i < num_sources; i++)
|
|
|
|
|
new_src[i] = old_src[i];
|
|
|
|
|
|
|
|
|
|
} else if (num_sources < this->sources) {
|
|
|
|
|
new_src = old_src;
|
|
|
|
|
|
|
|
|
|
} else {
|
2024-06-18 23:42:59 -07:00
|
|
|
new_src = new brw_reg[num_sources];
|
2024-12-11 15:57:25 -08:00
|
|
|
for (unsigned i = 0; i < this->sources; i++)
|
2024-03-21 15:42:44 -07:00
|
|
|
new_src[i] = old_src[i];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (old_src != new_src)
|
|
|
|
|
delete[] old_src;
|
2014-02-20 13:14:05 -08:00
|
|
|
}
|
2024-03-21 15:42:44 -07:00
|
|
|
|
|
|
|
|
this->sources = num_sources;
|
|
|
|
|
this->src = new_src;
|
2014-02-20 13:14:05 -08:00
|
|
|
}
|
|
|
|
|
|
2012-11-09 11:48:20 -08:00
|
|
|
bool
|
2014-03-27 09:40:30 -07:00
|
|
|
fs_inst::is_send_from_grf() const
|
2012-11-09 11:48:20 -08:00
|
|
|
{
|
2014-09-13 11:49:55 -07:00
|
|
|
switch (opcode) {
|
2018-10-29 15:06:14 -05:00
|
|
|
case SHADER_OPCODE_SEND:
|
2014-09-13 11:49:55 -07:00
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
|
2019-04-26 17:11:42 -07:00
|
|
|
case SHADER_OPCODE_INTERLOCK:
|
|
|
|
|
case SHADER_OPCODE_MEMORY_FENCE:
|
|
|
|
|
case SHADER_OPCODE_BARRIER:
|
2014-09-13 11:49:55 -07:00
|
|
|
return true;
|
2024-03-26 02:15:47 -07:00
|
|
|
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
|
|
|
|
|
return src[1].file == VGRF;
|
2014-09-13 11:49:55 -07:00
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
2012-11-09 11:48:20 -08:00
|
|
|
}
|
|
|
|
|
|
2019-01-16 18:30:08 -08:00
|
|
|
bool
|
|
|
|
|
fs_inst::is_control_source(unsigned arg) const
|
|
|
|
|
{
|
|
|
|
|
switch (opcode) {
|
|
|
|
|
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
|
|
|
|
|
return arg == 0;
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_BROADCAST:
|
|
|
|
|
case SHADER_OPCODE_SHUFFLE:
|
|
|
|
|
case SHADER_OPCODE_QUAD_SWIZZLE:
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
|
|
|
|
|
return arg == 1;
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_MOV_INDIRECT:
|
|
|
|
|
case SHADER_OPCODE_CLUSTER_BROADCAST:
|
|
|
|
|
return arg == 1 || arg == 2;
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_SEND:
|
|
|
|
|
return arg == 0 || arg == 1;
|
|
|
|
|
|
intel/brw: Introduce new MEMORY_*_LOGICAL opcodes
This is a new unified set of opcodes for memory access loosely patterned
after the new LSC-style data port messages introduced on Alchemist GPUs.
Rather than creating an opcode for every type of memory access, it has
only three opcodes: load, store, and atomic. It has various sources to
indicate the rest:
- Binding type (raw pointer, pointer to surface state, or BT index)
- Address size (A64, A32, A16)
- Data size (bit size, number of components)
- Opcode (atomic opcode, or LOAD/STORE vs. LOAD_CMASK/STORE_CMASK)
- Mode (typed vs. untyped vs. shared-local vs. scratch)
- Address (and its dimensionality)
- Data (0 for loads, 1 for stores, 2 for atomics)
- Whether we want block access
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2022-12-25 02:00:46 -08:00
|
|
|
case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
|
|
|
|
|
return arg != MEMORY_LOGICAL_BINDING &&
|
|
|
|
|
arg != MEMORY_LOGICAL_ADDRESS &&
|
|
|
|
|
arg != MEMORY_LOGICAL_DATA0 &&
|
|
|
|
|
arg != MEMORY_LOGICAL_DATA1;
|
|
|
|
|
|
2024-09-05 17:37:25 -07:00
|
|
|
case SHADER_OPCODE_QUAD_SWAP:
|
2024-11-21 10:00:04 -08:00
|
|
|
case SHADER_OPCODE_INCLUSIVE_SCAN:
|
|
|
|
|
case SHADER_OPCODE_EXCLUSIVE_SCAN:
|
|
|
|
|
case SHADER_OPCODE_VOTE_ANY:
|
|
|
|
|
case SHADER_OPCODE_VOTE_ALL:
|
|
|
|
|
case SHADER_OPCODE_REDUCE:
|
|
|
|
|
return arg != 0;
|
2024-09-05 17:37:25 -07:00
|
|
|
|
2019-01-16 18:30:08 -08:00
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2018-11-09 14:13:37 -08:00
|
|
|
bool
|
|
|
|
|
fs_inst::is_payload(unsigned arg) const
|
|
|
|
|
{
|
|
|
|
|
switch (opcode) {
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
|
|
|
|
case SHADER_OPCODE_INTERLOCK:
|
|
|
|
|
case SHADER_OPCODE_MEMORY_FENCE:
|
|
|
|
|
case SHADER_OPCODE_BARRIER:
|
|
|
|
|
return arg == 0;
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_SEND:
|
|
|
|
|
return arg == 2 || arg == 3;
|
|
|
|
|
|
|
|
|
|
default:
|
2023-10-09 08:23:53 -07:00
|
|
|
return false;
|
2018-11-09 14:13:37 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2012-11-09 11:48:20 -08:00
|
|
|
bool
|
2021-04-05 13:19:39 -07:00
|
|
|
fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
|
2012-11-09 11:48:20 -08:00
|
|
|
{
|
2014-06-23 21:57:31 -07:00
|
|
|
if (is_send_from_grf())
|
2012-11-09 11:48:20 -08:00
|
|
|
return false;
|
|
|
|
|
|
2024-06-05 13:23:39 -07:00
|
|
|
/* From TGL PRM Vol 2a Pg. 1053 and Pg. 1069 MAD and MUL Instructions:
|
2018-12-07 14:13:53 -08:00
|
|
|
*
|
|
|
|
|
* "When multiplying a DW and any lower precision integer, source modifier
|
|
|
|
|
* is not supported."
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
|
2018-12-07 14:13:53 -08:00
|
|
|
opcode == BRW_OPCODE_MAD)) {
|
|
|
|
|
const brw_reg_type exec_type = get_exec_type(this);
|
2024-04-21 00:57:59 -07:00
|
|
|
const unsigned min_brw_type_size_bytes = opcode == BRW_OPCODE_MAD ?
|
|
|
|
|
MIN2(brw_type_size_bytes(src[1].type), brw_type_size_bytes(src[2].type)) :
|
|
|
|
|
MIN2(brw_type_size_bytes(src[0].type), brw_type_size_bytes(src[1].type));
|
2018-12-07 14:13:53 -08:00
|
|
|
|
2024-04-20 23:19:43 -07:00
|
|
|
if (brw_type_is_int(exec_type) &&
|
2024-04-21 00:57:59 -07:00
|
|
|
brw_type_size_bytes(exec_type) >= 4 &&
|
|
|
|
|
brw_type_size_bytes(exec_type) != min_brw_type_size_bytes)
|
2018-12-07 14:13:53 -08:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-20 21:12:17 -08:00
|
|
|
switch (opcode) {
|
|
|
|
|
case BRW_OPCODE_ADDC:
|
|
|
|
|
case BRW_OPCODE_BFE:
|
|
|
|
|
case BRW_OPCODE_BFI1:
|
|
|
|
|
case BRW_OPCODE_BFI2:
|
|
|
|
|
case BRW_OPCODE_BFREV:
|
|
|
|
|
case BRW_OPCODE_CBIT:
|
|
|
|
|
case BRW_OPCODE_FBH:
|
|
|
|
|
case BRW_OPCODE_FBL:
|
|
|
|
|
case BRW_OPCODE_ROL:
|
|
|
|
|
case BRW_OPCODE_ROR:
|
|
|
|
|
case BRW_OPCODE_SUBB:
|
|
|
|
|
case BRW_OPCODE_DP4A:
|
|
|
|
|
case BRW_OPCODE_DPAS:
|
|
|
|
|
case SHADER_OPCODE_BROADCAST:
|
|
|
|
|
case SHADER_OPCODE_CLUSTER_BROADCAST:
|
|
|
|
|
case SHADER_OPCODE_MOV_INDIRECT:
|
|
|
|
|
case SHADER_OPCODE_SHUFFLE:
|
|
|
|
|
case SHADER_OPCODE_INT_QUOTIENT:
|
|
|
|
|
case SHADER_OPCODE_INT_REMAINDER:
|
2024-07-15 15:09:12 -07:00
|
|
|
case SHADER_OPCODE_REDUCE:
|
2024-07-16 14:06:12 -07:00
|
|
|
case SHADER_OPCODE_INCLUSIVE_SCAN:
|
|
|
|
|
case SHADER_OPCODE_EXCLUSIVE_SCAN:
|
2024-09-04 10:07:52 -07:00
|
|
|
case SHADER_OPCODE_VOTE_ANY:
|
|
|
|
|
case SHADER_OPCODE_VOTE_ALL:
|
|
|
|
|
case SHADER_OPCODE_VOTE_EQUAL:
|
2024-09-05 09:23:11 -07:00
|
|
|
case SHADER_OPCODE_BALLOT:
|
2024-09-05 17:37:25 -07:00
|
|
|
case SHADER_OPCODE_QUAD_SWAP:
|
2024-11-29 15:31:05 -08:00
|
|
|
case SHADER_OPCODE_READ_FROM_LIVE_CHANNEL:
|
|
|
|
|
case SHADER_OPCODE_READ_FROM_CHANNEL:
|
2013-09-19 19:48:22 -07:00
|
|
|
return false;
|
2024-02-20 21:12:17 -08:00
|
|
|
default:
|
|
|
|
|
return true;
|
|
|
|
|
}
|
2012-11-09 11:48:20 -08:00
|
|
|
}
|
|
|
|
|
|
2018-10-08 12:22:35 -05:00
|
|
|
bool
|
2024-02-20 21:12:17 -08:00
|
|
|
fs_inst::can_do_cmod() const
|
2018-10-08 12:22:35 -05:00
|
|
|
{
|
2024-02-20 21:12:17 -08:00
|
|
|
switch (opcode) {
|
|
|
|
|
case BRW_OPCODE_ADD:
|
|
|
|
|
case BRW_OPCODE_ADD3:
|
|
|
|
|
case BRW_OPCODE_ADDC:
|
|
|
|
|
case BRW_OPCODE_AND:
|
|
|
|
|
case BRW_OPCODE_ASR:
|
|
|
|
|
case BRW_OPCODE_AVG:
|
|
|
|
|
case BRW_OPCODE_CMP:
|
|
|
|
|
case BRW_OPCODE_CMPN:
|
|
|
|
|
case BRW_OPCODE_DP2:
|
|
|
|
|
case BRW_OPCODE_DP3:
|
|
|
|
|
case BRW_OPCODE_DP4:
|
|
|
|
|
case BRW_OPCODE_DPH:
|
|
|
|
|
case BRW_OPCODE_FRC:
|
|
|
|
|
case BRW_OPCODE_LINE:
|
|
|
|
|
case BRW_OPCODE_LRP:
|
|
|
|
|
case BRW_OPCODE_LZD:
|
|
|
|
|
case BRW_OPCODE_MAC:
|
|
|
|
|
case BRW_OPCODE_MACH:
|
|
|
|
|
case BRW_OPCODE_MAD:
|
|
|
|
|
case BRW_OPCODE_MOV:
|
|
|
|
|
case BRW_OPCODE_MUL:
|
|
|
|
|
case BRW_OPCODE_NOT:
|
|
|
|
|
case BRW_OPCODE_OR:
|
|
|
|
|
case BRW_OPCODE_PLN:
|
|
|
|
|
case BRW_OPCODE_RNDD:
|
|
|
|
|
case BRW_OPCODE_RNDE:
|
|
|
|
|
case BRW_OPCODE_RNDU:
|
|
|
|
|
case BRW_OPCODE_RNDZ:
|
|
|
|
|
case BRW_OPCODE_SHL:
|
|
|
|
|
case BRW_OPCODE_SHR:
|
|
|
|
|
case BRW_OPCODE_SUBB:
|
|
|
|
|
case BRW_OPCODE_XOR:
|
|
|
|
|
break;
|
|
|
|
|
default:
|
2018-10-08 12:22:35 -05:00
|
|
|
return false;
|
2024-02-20 21:12:17 -08:00
|
|
|
}
|
2018-10-08 12:22:35 -05:00
|
|
|
|
|
|
|
|
/* The accumulator result appears to get used for the conditional modifier
|
|
|
|
|
* generation. When negating a UD value, there is a 33rd bit generated for
|
|
|
|
|
* the sign in the accumulator value, so now you can't check, for example,
|
|
|
|
|
* equality with a 32-bit value. See piglit fs-op-neg-uvec4.
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned i = 0; i < sources; i++) {
|
2024-04-20 23:19:43 -07:00
|
|
|
if (brw_type_is_uint(src[i].type) && src[i].negate)
|
2018-10-08 12:22:35 -05:00
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2024-11-20 08:12:52 -08:00
|
|
|
if (dst.file == ARF && dst.nr == BRW_ARF_SCALAR && src[0].file == IMM)
|
|
|
|
|
return false;
|
|
|
|
|
|
2018-10-08 12:22:35 -05:00
|
|
|
return true;
|
|
|
|
|
}
|
|
|
|
|
|
2015-10-14 02:12:09 -07:00
|
|
|
bool
|
|
|
|
|
fs_inst::can_change_types() const
|
|
|
|
|
{
|
|
|
|
|
return dst.type == src[0].type &&
|
2022-06-22 16:17:21 -07:00
|
|
|
!src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
|
2015-10-14 02:12:09 -07:00
|
|
|
(opcode == BRW_OPCODE_MOV ||
|
2024-01-24 16:19:39 -08:00
|
|
|
(opcode == SHADER_OPCODE_LOAD_PAYLOAD && sources == 1) ||
|
2015-10-14 02:12:09 -07:00
|
|
|
(opcode == BRW_OPCODE_SEL &&
|
|
|
|
|
dst.type == src[1].type &&
|
|
|
|
|
predicate != BRW_PREDICATE_NONE &&
|
2022-06-22 16:17:21 -07:00
|
|
|
!src[1].abs && !src[1].negate && src[1].file != ATTR));
|
2015-10-14 02:12:09 -07:00
|
|
|
}
|
|
|
|
|
|
2011-03-13 13:43:05 -07:00
|
|
|
void
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
fs_visitor::vfail(const char *format, va_list va)
|
2011-03-13 13:43:05 -07:00
|
|
|
{
|
2011-05-16 15:10:26 -07:00
|
|
|
char *msg;
|
2011-03-13 13:43:05 -07:00
|
|
|
|
2011-05-16 15:10:26 -07:00
|
|
|
if (failed)
|
|
|
|
|
return;
|
2011-03-13 13:43:05 -07:00
|
|
|
|
2011-05-16 15:10:26 -07:00
|
|
|
failed = true;
|
|
|
|
|
|
|
|
|
|
msg = ralloc_vasprintf(mem_ctx, format, va);
|
2020-07-02 13:37:10 +02:00
|
|
|
msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
|
2023-09-24 21:38:47 -07:00
|
|
|
dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
|
2011-05-16 15:10:26 -07:00
|
|
|
|
|
|
|
|
this->fail_msg = msg;
|
|
|
|
|
|
2021-03-23 11:31:51 -07:00
|
|
|
if (unlikely(debug_enabled)) {
|
2011-06-10 15:26:02 -03:00
|
|
|
fprintf(stderr, "%s", msg);
|
2011-03-13 13:43:05 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
void
|
|
|
|
|
fs_visitor::fail(const char *format, ...)
|
|
|
|
|
{
|
|
|
|
|
va_list va;
|
|
|
|
|
|
|
|
|
|
va_start(va, format);
|
|
|
|
|
vfail(format, va);
|
|
|
|
|
va_end(va);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
|
2016-05-18 14:39:52 -07:00
|
|
|
* Mark this program as impossible to compile with dispatch width greater
|
|
|
|
|
* than n.
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
*
|
|
|
|
|
* During the SIMD8 compile (which happens first), we can detect and flag
|
2016-05-18 14:39:52 -07:00
|
|
|
* things that are unsupported in SIMD16+ mode, so the compiler can skip the
|
|
|
|
|
* SIMD16+ compile altogether.
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
*
|
2016-05-18 14:39:52 -07:00
|
|
|
* During a compile of dispatch width greater than n (if one happens anyway),
|
|
|
|
|
* this just calls fail().
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
*/
|
|
|
|
|
void
|
2016-05-18 14:39:52 -07:00
|
|
|
fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
{
|
2016-05-18 14:39:52 -07:00
|
|
|
if (dispatch_width > n) {
|
2015-06-22 16:30:04 -07:00
|
|
|
fail("%s", msg);
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
} else {
|
2020-10-30 17:41:02 +02:00
|
|
|
max_dispatch_width = MIN2(max_dispatch_width, n);
|
2021-07-29 14:27:57 -07:00
|
|
|
brw_shader_perf_log(compiler, log_data,
|
2021-10-03 15:58:36 +03:00
|
|
|
"Shader dispatch width limited to SIMD%d: %s\n",
|
2021-07-29 14:27:57 -07:00
|
|
|
n, msg);
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2012-06-04 08:59:00 -07:00
|
|
|
/**
|
|
|
|
|
* Returns true if the instruction has a flag that means it won't
|
|
|
|
|
* update an entire destination register.
|
|
|
|
|
*
|
|
|
|
|
* For example, dead code elimination and live variable analysis want to know
|
|
|
|
|
* when a write to a variable screens off any preceding values that were in
|
|
|
|
|
* it.
|
|
|
|
|
*/
|
|
|
|
|
bool
|
2019-04-24 12:38:28 +02:00
|
|
|
fs_inst::is_partial_write() const
|
2012-06-04 08:59:00 -07:00
|
|
|
{
|
2023-03-14 18:22:50 +02:00
|
|
|
if (this->predicate && !this->predicate_trivial &&
|
|
|
|
|
this->opcode != BRW_OPCODE_SEL)
|
2023-03-10 16:11:56 +02:00
|
|
|
return true;
|
|
|
|
|
|
intel/brw: Use size_written for NoMask instructions in is_partial_write
The intention of inst->is_partial_write() is that it should return true
when any REG_SIZE (32B) chunk of inst's destination is written but not
fully overwritten. This can be used to tell whether inst combines new
data with existing data, or screens off any previous writes, so the old
values are no longer required.
The existing (exec_size * brw_type_size_bytes(this->dst.type) < 32)
check doesn't work in a number of cases. For example, LSC block loads
have exec_size == 1 and force_writemask_all set, but may write multiple
full registers of data. (Currently, we only see them with exec_size 1
after logical-send-lowering, so our SHADER_OPCODE_SEND special case
was covering those.) We had also special cased UNDEF.
Instead, we can simply check:
1. Predication
2. !inst->dst.contiguous()
3. inst->dst.offset % REG_SIZE != 0
4. inst->size_written % REG_SIZE != 0
We had the first three already, but #4 is new. If either #3 or #4
are true, then that implies there is a REG_SIZE chunk of the destination
which is written, but not entirely written, so it's a partial write.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2024-08-26 15:34:21 -07:00
|
|
|
if (!this->dst.is_contiguous())
|
2023-03-10 16:11:56 +02:00
|
|
|
return true;
|
|
|
|
|
|
intel/brw: Use size_written for NoMask instructions in is_partial_write
The intention of inst->is_partial_write() is that it should return true
when any REG_SIZE (32B) chunk of inst's destination is written but not
fully overwritten. This can be used to tell whether inst combines new
data with existing data, or screens off any previous writes, so the old
values are no longer required.
The existing (exec_size * brw_type_size_bytes(this->dst.type) < 32)
check doesn't work in a number of cases. For example, LSC block loads
have exec_size == 1 and force_writemask_all set, but may write multiple
full registers of data. (Currently, we only see them with exec_size 1
after logical-send-lowering, so our SHADER_OPCODE_SEND special case
was covering those.) We had also special cased UNDEF.
Instead, we can simply check:
1. Predication
2. !inst->dst.contiguous()
3. inst->dst.offset % REG_SIZE != 0
4. inst->size_written % REG_SIZE != 0
We had the first three already, but #4 is new. If either #3 or #4
are true, then that implies there is a REG_SIZE chunk of the destination
which is written, but not entirely written, so it's a partial write.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2024-08-26 15:34:21 -07:00
|
|
|
if (this->dst.offset % REG_SIZE != 0)
|
|
|
|
|
return true;
|
2023-07-23 18:20:23 +03:00
|
|
|
|
intel/brw: Use size_written for NoMask instructions in is_partial_write
The intention of inst->is_partial_write() is that it should return true
when any REG_SIZE (32B) chunk of inst's destination is written but not
fully overwritten. This can be used to tell whether inst combines new
data with existing data, or screens off any previous writes, so the old
values are no longer required.
The existing (exec_size * brw_type_size_bytes(this->dst.type) < 32)
check doesn't work in a number of cases. For example, LSC block loads
have exec_size == 1 and force_writemask_all set, but may write multiple
full registers of data. (Currently, we only see them with exec_size 1
after logical-send-lowering, so our SHADER_OPCODE_SEND special case
was covering those.) We had also special cased UNDEF.
Instead, we can simply check:
1. Predication
2. !inst->dst.contiguous()
3. inst->dst.offset % REG_SIZE != 0
4. inst->size_written % REG_SIZE != 0
We had the first three already, but #4 is new. If either #3 or #4
are true, then that implies there is a REG_SIZE chunk of the destination
which is written, but not entirely written, so it's a partial write.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2024-08-26 15:34:21 -07:00
|
|
|
return this->size_written % REG_SIZE != 0;
|
2012-06-04 08:59:00 -07:00
|
|
|
}
|
|
|
|
|
|
2015-07-21 17:28:39 +03:00
|
|
|
unsigned
|
|
|
|
|
fs_inst::components_read(unsigned i) const
|
|
|
|
|
{
|
2016-08-12 18:33:58 -07:00
|
|
|
/* Return zero if the source is not present. */
|
|
|
|
|
if (src[i].file == BAD_FILE)
|
|
|
|
|
return 0;
|
|
|
|
|
|
2015-07-21 17:28:39 +03:00
|
|
|
switch (opcode) {
|
2024-04-11 01:10:51 -07:00
|
|
|
case BRW_OPCODE_PLN:
|
|
|
|
|
return i == 0 ? 1 : 2;
|
2015-07-21 17:28:39 +03:00
|
|
|
|
|
|
|
|
case FS_OPCODE_PIXEL_X:
|
|
|
|
|
case FS_OPCODE_PIXEL_Y:
|
2020-10-29 15:10:59 +02:00
|
|
|
assert(i < 2);
|
|
|
|
|
if (i == 0)
|
|
|
|
|
return 2;
|
|
|
|
|
else
|
|
|
|
|
return 1;
|
2015-07-21 17:28:39 +03:00
|
|
|
|
2015-07-27 16:14:36 +03:00
|
|
|
case FS_OPCODE_FB_WRITE_LOGICAL:
|
2015-10-20 14:29:37 -07:00
|
|
|
assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
|
2015-07-27 16:14:36 +03:00
|
|
|
/* First/second FB write color. */
|
|
|
|
|
if (i < 2)
|
2015-10-24 14:55:57 -07:00
|
|
|
return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
|
2015-07-27 16:14:36 +03:00
|
|
|
else
|
|
|
|
|
return 1;
|
|
|
|
|
|
i965/fs: Define logical texture sampling opcodes.
Each logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects the arguments
separately as individual sources, like:
tex_logical dst, coordinates, shadow_c, lod, lod2,
sample_index, mcs, sampler, offset,
num_coordinate_components, num_grad_components
This patch defines the opcodes and usual instruction boilerplate,
including a placeholder lowering function provided mostly as
documentation for their source registers.
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-07-21 18:42:27 +03:00
|
|
|
case SHADER_OPCODE_TEX_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXL_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXS_LOGICAL:
|
2018-10-31 09:52:33 -05:00
|
|
|
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
|
i965/fs: Define logical texture sampling opcodes.
Each logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects the arguments
separately as individual sources, like:
tex_logical dst, coordinates, shadow_c, lod, lod2,
sample_index, mcs, sampler, offset,
num_coordinate_components, num_grad_components
This patch defines the opcodes and usual instruction boilerplate,
including a placeholder lowering function provided mostly as
documentation for their source registers.
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-07-21 18:42:27 +03:00
|
|
|
case FS_OPCODE_TXB_LOGICAL:
|
2015-09-08 15:52:09 +01:00
|
|
|
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
|
2020-07-07 23:54:00 -07:00
|
|
|
case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
|
i965/fs: Define logical texture sampling opcodes.
Each logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects the arguments
separately as individual sources, like:
tex_logical dst, coordinates, shadow_c, lod, lod2,
sample_index, mcs, sampler, offset,
num_coordinate_components, num_grad_components
This patch defines the opcodes and usual instruction boilerplate,
including a placeholder lowering function provided mostly as
documentation for their source registers.
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-07-21 18:42:27 +03:00
|
|
|
case SHADER_OPCODE_TXF_MCS_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_LOD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
|
2023-02-16 20:30:30 -08:00
|
|
|
case SHADER_OPCODE_TG4_BIAS_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
|
2023-03-05 15:27:08 -08:00
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
|
2016-05-20 00:37:37 -07:00
|
|
|
case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
|
2016-02-05 18:39:13 -08:00
|
|
|
assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
|
2023-05-23 13:11:02 +03:00
|
|
|
src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
|
|
|
|
|
src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
|
i965/fs: Define logical texture sampling opcodes.
Each logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects the arguments
separately as individual sources, like:
tex_logical dst, coordinates, shadow_c, lod, lod2,
sample_index, mcs, sampler, offset,
num_coordinate_components, num_grad_components
This patch defines the opcodes and usual instruction boilerplate,
including a placeholder lowering function provided mostly as
documentation for their source registers.
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-07-21 18:42:27 +03:00
|
|
|
/* Texture coordinates. */
|
2016-02-05 18:39:13 -08:00
|
|
|
if (i == TEX_LOGICAL_SRC_COORDINATE)
|
|
|
|
|
return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
|
i965/fs: Define logical texture sampling opcodes.
Each logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects the arguments
separately as individual sources, like:
tex_logical dst, coordinates, shadow_c, lod, lod2,
sample_index, mcs, sampler, offset,
num_coordinate_components, num_grad_components
This patch defines the opcodes and usual instruction boilerplate,
including a placeholder lowering function provided mostly as
documentation for their source registers.
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-07-21 18:42:27 +03:00
|
|
|
/* Texture derivatives. */
|
2016-02-05 18:39:13 -08:00
|
|
|
else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
|
|
|
|
|
opcode == SHADER_OPCODE_TXD_LOGICAL)
|
|
|
|
|
return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
|
i965/fs: Define logical texture sampling opcodes.
Each logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects the arguments
separately as individual sources, like:
tex_logical dst, coordinates, shadow_c, lod, lod2,
sample_index, mcs, sampler, offset,
num_coordinate_components, num_grad_components
This patch defines the opcodes and usual instruction boilerplate,
including a placeholder lowering function provided mostly as
documentation for their source registers.
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-07-21 18:42:27 +03:00
|
|
|
/* Texture offset. */
|
2016-11-28 18:13:02 -08:00
|
|
|
else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
|
i965/fs: Define logical texture sampling opcodes.
Each logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects the arguments
separately as individual sources, like:
tex_logical dst, coordinates, shadow_c, lod, lod2,
sample_index, mcs, sampler, offset,
num_coordinate_components, num_grad_components
This patch defines the opcodes and usual instruction boilerplate,
including a placeholder lowering function provided mostly as
documentation for their source registers.
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-07-21 18:42:27 +03:00
|
|
|
return 2;
|
2015-09-08 15:52:09 +01:00
|
|
|
/* MCS */
|
2020-07-07 23:54:00 -07:00
|
|
|
else if (i == TEX_LOGICAL_SRC_MCS) {
|
|
|
|
|
if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
|
|
|
|
|
return 2;
|
|
|
|
|
else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
|
|
|
|
|
return 4;
|
|
|
|
|
else
|
|
|
|
|
return 1;
|
|
|
|
|
} else
|
i965/fs: Define logical texture sampling opcodes.
Each logical variant is largely equivalent to the original opcode but
instead of taking a single payload source it expects the arguments
separately as individual sources, like:
tex_logical dst, coordinates, shadow_c, lod, lod2,
sample_index, mcs, sampler, offset,
num_coordinate_components, num_grad_components
This patch defines the opcodes and usual instruction boilerplate,
including a placeholder lowering function provided mostly as
documentation for their source registers.
Reviewed-by: Jason Ekstrand <jason.ekstrand@intel.com>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-07-21 18:42:27 +03:00
|
|
|
return 1;
|
|
|
|
|
|
intel/brw: Introduce new MEMORY_*_LOGICAL opcodes
This is a new unified set of opcodes for memory access loosely patterned
after the new LSC-style data port messages introduced on Alchemist GPUs.
Rather than creating an opcode for every type of memory access, it has
only three opcodes: load, store, and atomic. It has various sources to
indicate the rest:
- Binding type (raw pointer, pointer to surface state, or BT index)
- Address size (A64, A32, A16)
- Data size (bit size, number of components)
- Opcode (atomic opcode, or LOAD/STORE vs. LOAD_CMASK/STORE_CMASK)
- Mode (typed vs. untyped vs. shared-local vs. scratch)
- Address (and its dimensionality)
- Data (0 for loads, 1 for stores, 2 for atomics)
- Whether we want block access
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Reviewed-by: Rohan Garg <rohan.garg@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30828>
2022-12-25 02:00:46 -08:00
|
|
|
case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
|
|
|
|
|
if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA0)
|
|
|
|
|
return 0;
|
|
|
|
|
/* fallthrough */
|
|
|
|
|
case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
|
|
|
|
|
if (i == MEMORY_LOGICAL_DATA1)
|
|
|
|
|
return 0;
|
|
|
|
|
/* fallthrough */
|
|
|
|
|
case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
|
|
|
|
|
if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
|
|
|
|
|
return src[MEMORY_LOGICAL_COMPONENTS].ud;
|
|
|
|
|
else if (i == MEMORY_LOGICAL_ADDRESS)
|
|
|
|
|
return src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
|
|
|
|
|
else
|
|
|
|
|
return 1;
|
|
|
|
|
|
2016-04-25 18:06:13 -07:00
|
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
|
|
|
|
|
return (i == 0 ? 2 : 1);
|
2015-07-21 18:45:32 +03:00
|
|
|
|
2022-07-12 15:32:01 -07:00
|
|
|
case SHADER_OPCODE_URB_WRITE_LOGICAL:
|
2022-09-28 16:38:35 -07:00
|
|
|
assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);
|
|
|
|
|
|
2022-07-12 15:32:01 -07:00
|
|
|
if (i == URB_LOGICAL_SRC_DATA)
|
2022-09-28 16:38:35 -07:00
|
|
|
return src[URB_LOGICAL_SRC_COMPONENTS].ud;
|
2022-07-12 15:32:01 -07:00
|
|
|
else
|
|
|
|
|
return 1;
|
|
|
|
|
|
2023-09-20 12:42:24 -07:00
|
|
|
case BRW_OPCODE_DPAS:
|
|
|
|
|
unreachable("Do not use components_read() for DPAS.");
|
|
|
|
|
|
2015-07-21 17:28:39 +03:00
|
|
|
default:
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-09-07 17:00:58 -07:00
|
|
|
unsigned
|
2024-06-19 10:50:51 -07:00
|
|
|
fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const
|
i965/fs: Convert gen7 to using GRFs for texture messages.
Looking at Lightsmark's shaders, the way we used MRFs (or in gen7's
case, GRFs) was bad in a couple of ways. One was that it prevented
compute-to-MRF for the common case of a texcoord that gets used
exactly once, but where the texcoord setup all gets emitted before the
texture calls (such as when it's a bare fragment shader input, which
gets interpolated before processing main()). Another was that it
introduced a bunch of dependencies that constrained scheduling, and
forced waits for texture operations to be done before they are
required. For example, we can now move the compute-to-MRF
interpolation for the second texture send down after the first send.
The downside is that this generally prevents
remove_duplicate_mrf_writes() from doing anything, whereas previously
it avoided work for the case of sampling from the same texcoord twice.
However, I suspect that most of the win that originally justified that
code was in avoiding the WAR stall on the first send, which this patch
also avoids, rather than the small cost of the extra instruction. We
see instruction count regressions in shaders in unigine, yofrankie,
savage2, hon, and gstreamer.
Improves GLB2.7 performance by 0.633628% +/- 0.491809% (n=121/125, avg of
~66fps, outliers below 61 dropped).
Improves openarena performance by 1.01092% +/- 0.66897% (n=425).
No significant difference on Lightsmark (n=44).
v2: Squash in the fix for register unspilling for send-from-GRF, fixing a
segfault in lightsmark.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Acked-by: Matt Turner <mattst88@gmail.com>
2013-10-09 17:17:59 -07:00
|
|
|
{
|
2015-06-18 11:53:08 -07:00
|
|
|
switch (opcode) {
|
2018-10-29 15:06:14 -05:00
|
|
|
case SHADER_OPCODE_SEND:
|
|
|
|
|
if (arg == 2) {
|
|
|
|
|
return mlen * REG_SIZE;
|
|
|
|
|
} else if (arg == 3) {
|
|
|
|
|
return ex_mlen * REG_SIZE;
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
2018-04-19 20:48:42 -07:00
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
2015-06-18 11:53:08 -07:00
|
|
|
if (arg == 0)
|
2016-09-07 17:00:07 -07:00
|
|
|
return mlen * REG_SIZE;
|
2015-06-18 11:53:08 -07:00
|
|
|
break;
|
|
|
|
|
|
2024-04-11 01:10:51 -07:00
|
|
|
case BRW_OPCODE_PLN:
|
|
|
|
|
if (arg == 0)
|
2016-09-07 13:02:55 -07:00
|
|
|
return 16;
|
2015-06-18 17:48:27 -07:00
|
|
|
break;
|
|
|
|
|
|
2015-06-30 15:51:13 -07:00
|
|
|
case SHADER_OPCODE_LOAD_PAYLOAD:
|
|
|
|
|
if (arg < this->header_size)
|
2024-04-20 17:08:02 -07:00
|
|
|
return retype(src[arg], BRW_TYPE_UD).component_size(8);
|
2015-06-30 15:51:13 -07:00
|
|
|
break;
|
|
|
|
|
|
2015-09-15 14:01:17 -07:00
|
|
|
case SHADER_OPCODE_BARRIER:
|
2016-09-07 17:00:07 -07:00
|
|
|
return REG_SIZE;
|
2015-07-16 15:04:43 -07:00
|
|
|
|
2015-11-07 18:58:34 -08:00
|
|
|
case SHADER_OPCODE_MOV_INDIRECT:
|
|
|
|
|
if (arg == 0) {
|
|
|
|
|
assert(src[2].file == IMM);
|
2016-09-07 14:36:32 -07:00
|
|
|
return src[2].ud;
|
2015-11-07 18:58:34 -08:00
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
|
2024-04-12 16:23:49 -07:00
|
|
|
case BRW_OPCODE_DPAS: {
|
|
|
|
|
/* This is a little bit sketchy. There's no way to get at devinfo from
|
|
|
|
|
* here, so the regular reg_unit() cannot be used. However, on
|
|
|
|
|
* reg_unit() == 1 platforms, DPAS exec_size must be 8, and on known
|
|
|
|
|
* reg_unit() == 2 platforms, DPAS exec_size must be 16. This is not a
|
|
|
|
|
* coincidence, so this isn't so bad.
|
|
|
|
|
*/
|
|
|
|
|
const unsigned reg_unit = this->exec_size / 8;
|
|
|
|
|
|
2023-09-20 12:42:24 -07:00
|
|
|
switch (arg) {
|
|
|
|
|
case 0:
|
2024-04-20 17:08:02 -07:00
|
|
|
if (src[0].type == BRW_TYPE_HF) {
|
2024-04-12 16:23:49 -07:00
|
|
|
return rcount * reg_unit * REG_SIZE / 2;
|
2023-09-20 12:42:24 -07:00
|
|
|
} else {
|
2024-04-12 16:23:49 -07:00
|
|
|
return rcount * reg_unit * REG_SIZE;
|
2023-09-20 12:42:24 -07:00
|
|
|
}
|
|
|
|
|
case 1:
|
2024-04-12 16:23:49 -07:00
|
|
|
return sdepth * reg_unit * REG_SIZE;
|
2023-09-20 12:42:24 -07:00
|
|
|
case 2:
|
|
|
|
|
/* This is simpler than the formula described in the Bspec, but it
|
2024-04-12 16:23:49 -07:00
|
|
|
* covers all of the cases that we support. Each inner sdepth
|
|
|
|
|
* iteration of the DPAS consumes a single dword for int8, uint8, or
|
|
|
|
|
* float16 types. These are the one source types currently
|
|
|
|
|
* supportable through Vulkan. This is independent of reg_unit.
|
2023-09-20 12:42:24 -07:00
|
|
|
*/
|
2024-04-12 16:23:49 -07:00
|
|
|
return rcount * sdepth * 4;
|
2023-09-20 12:42:24 -07:00
|
|
|
default:
|
|
|
|
|
unreachable("Invalid source number.");
|
|
|
|
|
}
|
|
|
|
|
break;
|
2024-04-12 16:23:49 -07:00
|
|
|
}
|
2023-09-20 12:42:24 -07:00
|
|
|
|
2023-10-09 08:23:53 -07:00
|
|
|
default:
|
|
|
|
|
break;
|
2014-08-18 14:27:55 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
switch (src[arg].file) {
|
|
|
|
|
case UNIFORM:
|
|
|
|
|
case IMM:
|
2024-04-21 00:57:59 -07:00
|
|
|
return components_read(arg) * brw_type_size_bytes(src[arg].type);
|
2016-08-12 18:33:58 -07:00
|
|
|
case BAD_FILE:
|
2024-12-10 10:49:08 +02:00
|
|
|
case ADDRESS:
|
2015-10-26 17:52:57 -07:00
|
|
|
case ARF:
|
|
|
|
|
case FIXED_GRF:
|
2015-10-26 17:09:25 -07:00
|
|
|
case VGRF:
|
2015-08-05 16:29:30 +03:00
|
|
|
case ATTR:
|
brw: Basic infrastructure to store convergent values as scalars
In SIMD16 and SIMD32, storing convergent values in full 16- or
32-channel registers is wasteful. It wastes register space, and in most
cases on SIMD32, it wastes instructions. Our register allocator is not
clever enough to handle scalar allocations. It's fundamental unit of
allocation is SIMD8. Start treating convergent values as SIMD8.
Add a tracking bit in brw_reg to specify that a register represents a
convergent, scalar value. This has two implications:
1. All channels of the SIMD8 register must contain the same value. In
general, this means that writes to the register must be
force_writemask_all and exec_size = 8;
2. Reads of this register can (and should) use <0,1,0> stride. SIMD8
instructions that have restrictions on source stride can us <8,8,1>.
Values that are vectors (e.g., results of load_uniform or texture
operations) will be stored as multiple SIMD8 hardware registers.
v2: brw_fs_opt_copy_propagation_defs fix from Ken. Fix for Xe2.
v3: Eliminte offset_to_scalar(). Remove mention of vec4 backend in
brw_reg.h. Both suggested by Caio. The offset_to_scalar() change
necessitates some trickery in the fs_builder offset() function, but I
think this is an improvement overall. There is also some rework in
find_value_for_offset to account for the possibility that is_scalar
sources in LOAD_PAYLOAD might be <8;8,1> or <0;1,0>.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/29884>
2024-02-09 17:12:11 -08:00
|
|
|
/* Regardless of exec_size, values marked as scalar are SIMD8. */
|
|
|
|
|
return components_read(arg) *
|
|
|
|
|
src[arg].component_size(src[arg].is_scalar ? 8 * reg_unit(devinfo) : exec_size);
|
i965/fs: Convert gen7 to using GRFs for texture messages.
Looking at Lightsmark's shaders, the way we used MRFs (or in gen7's
case, GRFs) was bad in a couple of ways. One was that it prevented
compute-to-MRF for the common case of a texcoord that gets used
exactly once, but where the texcoord setup all gets emitted before the
texture calls (such as when it's a bare fragment shader input, which
gets interpolated before processing main()). Another was that it
introduced a bunch of dependencies that constrained scheduling, and
forced waits for texture operations to be done before they are
required. For example, we can now move the compute-to-MRF
interpolation for the second texture send down after the first send.
The downside is that this generally prevents
remove_duplicate_mrf_writes() from doing anything, whereas previously
it avoided work for the case of sampling from the same texcoord twice.
However, I suspect that most of the win that originally justified that
code was in avoiding the WAR stall on the first send, which this patch
also avoids, rather than the small cost of the extra instruction. We
see instruction count regressions in shaders in unigine, yofrankie,
savage2, hon, and gstreamer.
Improves GLB2.7 performance by 0.633628% +/- 0.491809% (n=121/125, avg of
~66fps, outliers below 61 dropped).
Improves openarena performance by 1.01092% +/- 0.66897% (n=425).
No significant difference on Lightsmark (n=44).
v2: Squash in the fix for register unspilling for send-from-GRF, fixing a
segfault in lightsmark.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Acked-by: Matt Turner <mattst88@gmail.com>
2013-10-09 17:17:59 -07:00
|
|
|
}
|
2015-10-26 06:58:56 -07:00
|
|
|
return 0;
|
i965/fs: Convert gen7 to using GRFs for texture messages.
Looking at Lightsmark's shaders, the way we used MRFs (or in gen7's
case, GRFs) was bad in a couple of ways. One was that it prevented
compute-to-MRF for the common case of a texcoord that gets used
exactly once, but where the texcoord setup all gets emitted before the
texture calls (such as when it's a bare fragment shader input, which
gets interpolated before processing main()). Another was that it
introduced a bunch of dependencies that constrained scheduling, and
forced waits for texture operations to be done before they are
required. For example, we can now move the compute-to-MRF
interpolation for the second texture send down after the first send.
The downside is that this generally prevents
remove_duplicate_mrf_writes() from doing anything, whereas previously
it avoided work for the case of sampling from the same texcoord twice.
However, I suspect that most of the win that originally justified that
code was in avoiding the WAR stall on the first send, which this patch
also avoids, rather than the small cost of the extra instruction. We
see instruction count regressions in shaders in unigine, yofrankie,
savage2, hon, and gstreamer.
Improves GLB2.7 performance by 0.633628% +/- 0.491809% (n=121/125, avg of
~66fps, outliers below 61 dropped).
Improves openarena performance by 1.01092% +/- 0.66897% (n=425).
No significant difference on Lightsmark (n=44).
v2: Squash in the fix for register unspilling for send-from-GRF, fixing a
segfault in lightsmark.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Acked-by: Matt Turner <mattst88@gmail.com>
2013-10-09 17:17:59 -07:00
|
|
|
}
|
|
|
|
|
|
2016-05-18 21:54:35 -07:00
|
|
|
namespace {
|
2019-09-24 17:06:12 -05:00
|
|
|
unsigned
|
2022-07-22 17:11:52 -07:00
|
|
|
predicate_width(const intel_device_info *devinfo, brw_predicate predicate)
|
2019-09-24 17:06:12 -05:00
|
|
|
{
|
2022-07-22 17:11:52 -07:00
|
|
|
if (devinfo->ver >= 20) {
|
|
|
|
|
return 1;
|
|
|
|
|
} else {
|
|
|
|
|
switch (predicate) {
|
|
|
|
|
case BRW_PREDICATE_NONE: return 1;
|
|
|
|
|
case BRW_PREDICATE_NORMAL: return 1;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY2H: return 2;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL2H: return 2;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY4H: return 4;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL4H: return 4;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY8H: return 8;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL8H: return 8;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY16H: return 16;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL16H: return 16;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY32H: return 32;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL32H: return 32;
|
|
|
|
|
default: unreachable("Unsupported predicate");
|
|
|
|
|
}
|
2019-09-24 17:06:12 -05:00
|
|
|
}
|
|
|
|
|
}
|
2016-05-18 21:54:35 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
unsigned
|
2021-04-05 13:19:39 -07:00
|
|
|
fs_inst::flags_read(const intel_device_info *devinfo) const
|
2013-10-20 11:32:01 -07:00
|
|
|
{
|
2022-07-22 17:11:52 -07:00
|
|
|
if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
|
|
|
|
|
predicate == BRW_PREDICATE_ALIGN1_ALLV)) {
|
2016-05-18 21:54:35 -07:00
|
|
|
/* The vertical predication modes combine corresponding bits from
|
2024-02-17 22:15:44 -08:00
|
|
|
* f0.0 and f1.0 on Gfx7+.
|
2016-05-18 21:54:35 -07:00
|
|
|
*/
|
2024-02-17 22:15:44 -08:00
|
|
|
const unsigned shift = 4;
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(this, 1) << shift | brw_fs_flag_mask(this, 1);
|
2016-05-18 21:54:35 -07:00
|
|
|
} else if (predicate) {
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(this, predicate_width(devinfo, predicate));
|
2016-05-18 21:54:35 -07:00
|
|
|
} else {
|
2017-06-22 16:42:34 -07:00
|
|
|
unsigned mask = 0;
|
|
|
|
|
for (int i = 0; i < sources; i++) {
|
2024-06-19 10:50:51 -07:00
|
|
|
mask |= brw_fs_flag_mask(src[i], size_read(devinfo, i));
|
2017-06-22 16:42:34 -07:00
|
|
|
}
|
|
|
|
|
return mask;
|
2016-05-18 21:54:35 -07:00
|
|
|
}
|
2013-10-20 11:32:01 -07:00
|
|
|
}
|
|
|
|
|
|
2016-05-18 21:54:35 -07:00
|
|
|
unsigned
|
intel/fs: sel.cond writes the flags on Gfx4 and Gfx5
On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
using a separte cmpn and sel instruction. This lowering occurs in
fs_vistor::lower_minmax which is called very, very late... a long, long
time after the first calls to opt_cmod_propagation. As a result,
conditional modifiers can be incorrectly propagated across sel.cond on
those platforms.
No tests were affected by this change, and I find that quite shocking.
After just changing flags_written(), all of the atan tests started
failing on ILK. That required the change in cmod_propagatin (and the
addition of the prop_across_into_sel_gfx5 unit test).
Shader-db results for ILK and GM45 are below. I looked at a couple
before and after shaders... and every case that I looked at had
experienced incorrect cmod propagation. This affected a LOT of apps!
Euro Truck Simulator 2, The Talos Principle, Serious Sam 3, Sanctum 2,
Gang Beasts, and on and on... :(
I discovered this bug while working on a couple new optimization
passes. One of the passes attempts to remove condition modifiers that
are never used. The pass made no progress except on ILK and GM45.
After investigating a couple of the affected shaders, I noticed that
the code in those shaders looked wrong... investigation led to this
cause.
v2: Trivial changes in the unit tests.
v3: Fix type in comment in unit tests. Noticed by Jason and Priit.
v4: Tweak handling of BRW_OPCODE_SEL special case. Suggested by Jason.
Fixes: df1aec763eb ("i965/fs: Define methods to calculate the flag subset read or written by an fs_inst.")
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Tested-by: Dave Airlie <airlied@redhat.com>
Iron Lake
total instructions in shared programs: 8180493 -> 8181781 (0.02%)
instructions in affected programs: 541796 -> 543084 (0.24%)
helped: 28
HURT: 1158
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.35% max: 0.86% x̄: 0.53% x̃: 0.50%
HURT stats (abs) min: 1 max: 3 x̄: 1.14 x̃: 1
HURT stats (rel) min: 0.12% max: 4.00% x̄: 0.37% x̃: 0.23%
95% mean confidence interval for instructions value: 1.06 1.11
95% mean confidence interval for instructions %-change: 0.31% 0.38%
Instructions are HURT.
total cycles in shared programs: 239420470 -> 239421690 (<.01%)
cycles in affected programs: 2925992 -> 2927212 (0.04%)
helped: 49
HURT: 157
helped stats (abs) min: 2 max: 284 x̄: 62.69 x̃: 70
helped stats (rel) min: 0.04% max: 6.20% x̄: 1.68% x̃: 1.96%
HURT stats (abs) min: 2 max: 48 x̄: 27.34 x̃: 24
HURT stats (rel) min: 0.02% max: 2.91% x̄: 0.31% x̃: 0.20%
95% mean confidence interval for cycles value: -0.80 12.64
95% mean confidence interval for cycles %-change: -0.31% <.01%
Inconclusive result (value mean confidence interval includes 0).
GM45
total instructions in shared programs: 4985517 -> 4986207 (0.01%)
instructions in affected programs: 306935 -> 307625 (0.22%)
helped: 14
HURT: 625
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.35% max: 0.82% x̄: 0.52% x̃: 0.49%
HURT stats (abs) min: 1 max: 3 x̄: 1.13 x̃: 1
HURT stats (rel) min: 0.12% max: 3.90% x̄: 0.34% x̃: 0.22%
95% mean confidence interval for instructions value: 1.04 1.12
95% mean confidence interval for instructions %-change: 0.29% 0.36%
Instructions are HURT.
total cycles in shared programs: 153827268 -> 153828052 (<.01%)
cycles in affected programs: 1669290 -> 1670074 (0.05%)
helped: 24
HURT: 84
helped stats (abs) min: 2 max: 232 x̄: 64.33 x̃: 67
helped stats (rel) min: 0.04% max: 4.62% x̄: 1.60% x̃: 1.94%
HURT stats (abs) min: 2 max: 48 x̄: 27.71 x̃: 24
HURT stats (rel) min: 0.02% max: 2.66% x̄: 0.34% x̃: 0.14%
95% mean confidence interval for cycles value: -1.94 16.46
95% mean confidence interval for cycles %-change: -0.29% 0.11%
Inconclusive result (value mean confidence interval includes 0).
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12191>
2021-08-02 21:33:17 -07:00
|
|
|
fs_inst::flags_written(const intel_device_info *devinfo) const
|
2013-10-20 11:32:01 -07:00
|
|
|
{
|
2024-02-19 19:41:48 -08:00
|
|
|
if (conditional_mod && (opcode != BRW_OPCODE_SEL &&
|
|
|
|
|
opcode != BRW_OPCODE_CSEL &&
|
|
|
|
|
opcode != BRW_OPCODE_IF &&
|
|
|
|
|
opcode != BRW_OPCODE_WHILE)) {
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(this, 1);
|
2025-01-17 01:53:44 -08:00
|
|
|
} else if (opcode == FS_OPCODE_LOAD_LIVE_CHANNELS ||
|
|
|
|
|
opcode == SHADER_OPCODE_BALLOT ||
|
|
|
|
|
opcode == SHADER_OPCODE_VOTE_ANY ||
|
|
|
|
|
opcode == SHADER_OPCODE_VOTE_ALL ||
|
|
|
|
|
opcode == SHADER_OPCODE_VOTE_EQUAL) {
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(this, 32);
|
2016-05-18 21:54:35 -07:00
|
|
|
} else {
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(dst, size_written);
|
2016-05-18 21:54:35 -07:00
|
|
|
}
|
2013-10-20 11:32:01 -07:00
|
|
|
}
|
|
|
|
|
|
2023-05-23 13:11:02 +03:00
|
|
|
bool
|
|
|
|
|
fs_inst::has_sampler_residency() const
|
|
|
|
|
{
|
|
|
|
|
switch (opcode) {
|
|
|
|
|
case SHADER_OPCODE_TEX_LOGICAL:
|
|
|
|
|
case FS_OPCODE_TXB_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXL_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXS_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_LOGICAL:
|
2023-02-16 20:30:30 -08:00
|
|
|
case SHADER_OPCODE_TG4_BIAS_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
|
2023-03-05 15:27:08 -08:00
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
|
2023-05-23 13:11:02 +03:00
|
|
|
assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
|
|
|
|
|
return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
intel/brw: Copy prop from raw integer moves with mismatched types
The specific pattern from the unit test was observed in ray tracing
trampoline shaders.
v2: Refactor the is_raw_move tests out to a utility function. Suggested
by Ken.
v3: Fix a regression caused by being too picky about source
modifiers. This was introduced somewhere between when I did initial
shader-db runs an v2.
v4: Fix typo in comment. Noticed by Caio.
shader-db:
All Intel platforms had similar results. (Meteor Lake shown)
total instructions in shared programs: 19734086 -> 19733997 (<.01%)
instructions in affected programs: 135388 -> 135299 (-0.07%)
helped: 76 / HURT: 2
total cycles in shared programs: 916290451 -> 916264968 (<.01%)
cycles in affected programs: 41046002 -> 41020519 (-0.06%)
helped: 32 / HURT: 29
fossil-db:
Meteor Lake, DG2, and Skylake had similar results. (Meteor Lake shown)
Totals:
Instrs: 151531355 -> 151513669 (-0.01%); split: -0.01%, +0.00%
Cycle count: 17209372399 -> 17208178205 (-0.01%); split: -0.01%, +0.00%
Max live registers: 32016490 -> 32016493 (+0.00%)
Totals from 17361 (2.75% of 630198) affected shaders:
Instrs: 2642048 -> 2624362 (-0.67%); split: -0.67%, +0.00%
Cycle count: 79803066 -> 78608872 (-1.50%); split: -1.75%, +0.25%
Max live registers: 421668 -> 421671 (+0.00%)
Tiger Lake and Ice Lake had similar results. (Tiger Lake shown)
Totals:
Instrs: 149995644 -> 149977326 (-0.01%); split: -0.01%, +0.00%
Cycle count: 15567293770 -> 15566524840 (-0.00%); split: -0.02%, +0.01%
Spill count: 61241 -> 61238 (-0.00%)
Fill count: 107304 -> 107301 (-0.00%)
Max live registers: 31993109 -> 31993112 (+0.00%)
Totals from 17813 (2.83% of 629912) affected shaders:
Instrs: 3738236 -> 3719918 (-0.49%); split: -0.49%, +0.00%
Cycle count: 4251157049 -> 4250388119 (-0.02%); split: -0.06%, +0.04%
Spill count: 28268 -> 28265 (-0.01%)
Fill count: 50377 -> 50374 (-0.01%)
Max live registers: 470648 -> 470651 (+0.00%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30251>
2024-07-16 16:04:38 -07:00
|
|
|
/* \sa inst_is_raw_move in brw_eu_validate. */
|
|
|
|
|
bool
|
|
|
|
|
fs_inst::is_raw_move() const
|
|
|
|
|
{
|
|
|
|
|
if (opcode != BRW_OPCODE_MOV)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (src[0].file == IMM) {
|
|
|
|
|
if (brw_type_is_vector_imm(src[0].type))
|
|
|
|
|
return false;
|
|
|
|
|
} else if (src[0].negate || src[0].abs) {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (saturate)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
return src[0].type == dst.type ||
|
|
|
|
|
(brw_type_is_int(src[0].type) &&
|
|
|
|
|
brw_type_is_int(dst.type) &&
|
|
|
|
|
brw_type_size_bits(src[0].type) == brw_type_size_bits(dst.type));
|
|
|
|
|
}
|
|
|
|
|
|
2024-03-13 11:01:16 +02:00
|
|
|
bool
|
|
|
|
|
fs_inst::uses_address_register_implicitly() const
|
|
|
|
|
{
|
|
|
|
|
switch (opcode) {
|
|
|
|
|
case SHADER_OPCODE_BROADCAST:
|
|
|
|
|
case SHADER_OPCODE_SHUFFLE:
|
|
|
|
|
case SHADER_OPCODE_MOV_INDIRECT:
|
|
|
|
|
return true;
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2013-11-12 15:33:27 -08:00
|
|
|
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
|
2011-03-23 12:50:53 -07:00
|
|
|
* This brings in those uniform definitions
|
|
|
|
|
*/
|
|
|
|
|
void
|
2011-07-25 18:13:04 -07:00
|
|
|
fs_visitor::import_uniforms(fs_visitor *v)
|
2011-03-23 12:50:53 -07:00
|
|
|
{
|
2014-03-11 14:35:27 -07:00
|
|
|
this->uniforms = v->uniforms;
|
2011-03-23 12:50:53 -07:00
|
|
|
}
|
|
|
|
|
|
2024-11-18 11:33:35 +02:00
|
|
|
enum intel_barycentric_mode
|
2024-04-18 09:54:11 +03:00
|
|
|
brw_barycentric_mode(const struct brw_wm_prog_key *key,
|
|
|
|
|
nir_intrinsic_instr *intr)
|
2016-07-11 15:00:37 -07:00
|
|
|
{
|
2022-07-06 13:01:24 -07:00
|
|
|
const glsl_interp_mode mode =
|
|
|
|
|
(enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
|
|
|
|
|
|
2016-07-11 15:00:37 -07:00
|
|
|
/* Barycentric modes don't make sense for flat inputs. */
|
2016-07-07 02:02:38 -07:00
|
|
|
assert(mode != INTERP_MODE_FLAT);
|
2016-07-11 15:00:37 -07:00
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
unsigned bary;
|
2022-07-06 13:01:24 -07:00
|
|
|
switch (intr->intrinsic) {
|
2016-07-12 03:57:25 -07:00
|
|
|
case nir_intrinsic_load_barycentric_pixel:
|
|
|
|
|
case nir_intrinsic_load_barycentric_at_offset:
|
2024-04-18 09:54:11 +03:00
|
|
|
/* When per sample interpolation is dynamic, assume sample
|
|
|
|
|
* interpolation. We'll dynamically remap things so that the FS thread
|
|
|
|
|
* payload is not affected.
|
|
|
|
|
*/
|
2024-11-18 10:58:46 +02:00
|
|
|
bary = key->persample_interp == INTEL_SOMETIMES ?
|
2024-11-18 11:33:35 +02:00
|
|
|
INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE :
|
|
|
|
|
INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL;
|
2016-07-12 03:57:25 -07:00
|
|
|
break;
|
|
|
|
|
case nir_intrinsic_load_barycentric_centroid:
|
2024-11-18 11:33:35 +02:00
|
|
|
bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID;
|
2016-07-12 03:57:25 -07:00
|
|
|
break;
|
|
|
|
|
case nir_intrinsic_load_barycentric_sample:
|
|
|
|
|
case nir_intrinsic_load_barycentric_at_sample:
|
2024-11-18 11:33:35 +02:00
|
|
|
bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE;
|
2016-07-12 03:57:25 -07:00
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
unreachable("invalid intrinsic");
|
2016-07-11 15:00:37 -07:00
|
|
|
}
|
|
|
|
|
|
2016-07-07 02:02:38 -07:00
|
|
|
if (mode == INTERP_MODE_NOPERSPECTIVE)
|
2016-07-11 15:00:37 -07:00
|
|
|
bary += 3;
|
|
|
|
|
|
2024-11-18 11:33:35 +02:00
|
|
|
return (enum intel_barycentric_mode) bary;
|
2016-07-11 15:00:37 -07:00
|
|
|
}
|
|
|
|
|
|
intel/compiler: Use an existing URB write to end TCS threads when viable
VS, TCS, TES, and GS threads must end with a URB write message with the
EOT (end of thread) bit set. For VS and TES, we shadow output variables
with temporaries and perform all stores at the end of the shader, giving
us an existing message to do the EOT.
In tessellation control shaders, we don't defer output stores until the
end of the thread like we do for vertex or evaluation shaders. We just
process store_output and store_per_vertex_output intrinsics where they
occur, which may be in control flow. So we can't guarantee that there's
a URB write being at the end of the shader.
Traditionally, we've just emitted a separate URB write to finish TCS
threads, doing a writemasked write to an single patch header DWord.
On Broadwell, we need to set a "TR DS Cache Disable" bit, so this is
a convenient spot to do so. But on other platforms, there's no such
field, and this write is purely wasteful.
Insetad of emitting a separate write, we can just look for an existing
URB write at the end of the program and tag that with EOT, if possible.
We already had code to do this for geometry shaders, so just lift it
into a helper function and reuse it.
No changes in shader-db.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17944>
2022-08-03 20:54:52 -07:00
|
|
|
/**
|
|
|
|
|
* Walk backwards from the end of the program looking for a URB write that
|
|
|
|
|
* isn't in control flow, and mark it with EOT.
|
|
|
|
|
*
|
|
|
|
|
* Return true if successful or false if a separate EOT write is needed.
|
|
|
|
|
*/
|
|
|
|
|
bool
|
|
|
|
|
fs_visitor::mark_last_urb_write_with_eot()
|
|
|
|
|
{
|
|
|
|
|
foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
|
|
|
|
|
if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
|
|
|
|
|
prev->eot = true;
|
|
|
|
|
|
|
|
|
|
/* Delete now dead instructions. */
|
|
|
|
|
foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
|
|
|
|
|
if (dead == prev)
|
|
|
|
|
break;
|
|
|
|
|
dead->remove();
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
} else if (prev->is_control_flow() || prev->has_side_effects()) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-03 12:15:21 -07:00
|
|
|
static unsigned
|
|
|
|
|
round_components_to_whole_registers(const intel_device_info *devinfo,
|
|
|
|
|
unsigned c)
|
|
|
|
|
{
|
|
|
|
|
return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo);
|
|
|
|
|
}
|
|
|
|
|
|
2010-08-26 16:39:41 -07:00
|
|
|
void
|
|
|
|
|
fs_visitor::assign_curb_setup()
|
|
|
|
|
{
|
2022-08-03 12:15:21 -07:00
|
|
|
unsigned uniform_push_length =
|
|
|
|
|
round_components_to_whole_registers(devinfo, prog_data->nr_params);
|
2016-11-29 02:47:15 -08:00
|
|
|
|
|
|
|
|
unsigned ubo_push_length = 0;
|
2016-11-29 05:20:20 -08:00
|
|
|
unsigned ubo_push_start[4];
|
2016-11-29 02:47:15 -08:00
|
|
|
for (int i = 0; i < 4; i++) {
|
2016-11-29 05:20:20 -08:00
|
|
|
ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
|
2024-02-19 23:07:04 -08:00
|
|
|
ubo_push_length += prog_data->ubo_ranges[i].length;
|
2022-08-03 12:15:21 -07:00
|
|
|
|
|
|
|
|
assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0);
|
|
|
|
|
assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0);
|
2016-11-29 02:47:15 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
prog_data->curb_read_length = uniform_push_length + ubo_push_length;
|
2024-07-01 14:45:38 -07:00
|
|
|
if (stage == MESA_SHADER_FRAGMENT &&
|
|
|
|
|
((struct brw_wm_prog_key *)key)->null_push_constant_tbimr_workaround)
|
|
|
|
|
prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);
|
2014-02-19 15:27:01 +01:00
|
|
|
|
2020-04-03 20:20:53 -05:00
|
|
|
uint64_t used = 0;
|
2021-10-04 13:58:07 +03:00
|
|
|
bool is_compute = gl_shader_stage_is_compute(stage);
|
2020-04-03 20:20:53 -05:00
|
|
|
|
2024-09-30 08:45:21 +03:00
|
|
|
if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
|
2022-07-15 13:08:23 +03:00
|
|
|
assert(devinfo->has_lsc);
|
2023-11-21 09:47:18 -08:00
|
|
|
fs_builder ubld = fs_builder(this, 1).exec_all().at(
|
2020-05-04 16:17:58 -05:00
|
|
|
cfg->first_block(), cfg->first_block()->start());
|
|
|
|
|
|
2022-07-15 13:08:23 +03:00
|
|
|
/* The base offset for our push data is passed in as R0.0[31:6]. We have
|
|
|
|
|
* to mask off the bottom 6 bits.
|
2020-05-04 16:17:58 -05:00
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg base_addr =
|
2024-04-12 17:43:22 -07:00
|
|
|
ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
|
|
|
|
|
brw_imm_ud(INTEL_MASK(31, 6)));
|
2020-05-04 16:17:58 -05:00
|
|
|
|
2021-03-29 15:46:12 -07:00
|
|
|
/* On Gfx12-HP we load constants at the start of the program using A32
|
2020-05-04 16:17:58 -05:00
|
|
|
* stateless messages.
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned i = 0; i < uniform_push_length;) {
|
2022-07-15 13:08:23 +03:00
|
|
|
/* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
|
|
|
|
|
unsigned num_regs = MIN2(uniform_push_length - i, 8);
|
2020-05-04 16:17:58 -05:00
|
|
|
assert(num_regs > 0);
|
|
|
|
|
num_regs = 1 << util_logbase2(num_regs);
|
|
|
|
|
|
2024-01-26 12:25:41 -08:00
|
|
|
/* This pass occurs after all of the optimization passes, so don't
|
|
|
|
|
* emit an 'ADD addr, base_addr, 0' instruction.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg addr = i == 0 ? base_addr :
|
2024-04-12 17:43:22 -07:00
|
|
|
ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
|
2020-05-04 16:17:58 -05:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg srcs[4] = {
|
2020-05-04 16:17:58 -05:00
|
|
|
brw_imm_ud(0), /* desc */
|
|
|
|
|
brw_imm_ud(0), /* ex_desc */
|
2022-07-15 13:08:23 +03:00
|
|
|
addr, /* payload */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg(), /* payload2 */
|
2020-05-04 16:17:58 -05:00
|
|
|
};
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
|
2024-04-20 17:08:02 -07:00
|
|
|
BRW_TYPE_UD);
|
2022-07-15 13:08:23 +03:00
|
|
|
fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
|
|
|
|
|
|
|
|
|
|
send->sfid = GFX12_SFID_UGM;
|
brw: move final send lowering up into the IR
Because we do emit the final send message form in code generation, a
lot of emissions look like this :
add(8) vgrf0, u0, 0x100
mov(1) a0.1, vgrf0 # emitted by the generator
send(8) ..., a0.1
By moving address register manipulation in the IR, we can get this
down to :
add(1) a0.1, u0, 0x100
send(8) ..., a0.1
This reduce register pressure around some send messages by 1 vgrf.
All lost shaders in the below results are fragment SIMD32, due to the
throughput estimator. If turned off, we loose no SIMD32 shaders with
this change.
DG2 results:
Assassin's Creed Valhalla:
Totals from 2044 (96.87% of 2110) affected shaders:
Instrs: 852879 -> 832044 (-2.44%); split: -2.45%, +0.00%
Subgroup size: 23832 -> 23824 (-0.03%)
Cycle count: 53345742 -> 52144277 (-2.25%); split: -5.08%, +2.82%
Spill count: 729 -> 554 (-24.01%); split: -28.40%, +4.39%
Fill count: 2005 -> 1256 (-37.36%)
Scratch Memory Size: 25600 -> 19456 (-24.00%); split: -32.00%, +8.00%
Max live registers: 116765 -> 115058 (-1.46%)
Max dispatch width: 19152 -> 18872 (-1.46%); split: +0.21%, -1.67%
Cyberpunk 2077:
Totals from 1181 (93.43% of 1264) affected shaders:
Instrs: 667192 -> 663615 (-0.54%); split: -0.55%, +0.01%
Subgroup size: 13016 -> 13032 (+0.12%)
Cycle count: 17383539 -> 17986073 (+3.47%); split: -0.93%, +4.39%
Spill count: 12 -> 8 (-33.33%)
Fill count: 9 -> 6 (-33.33%)
Dota2:
Totals from 173 (11.59% of 1493) affected shaders:
Cycle count: 274403 -> 280817 (+2.34%); split: -0.01%, +2.34%
Max live registers: 5787 -> 5779 (-0.14%)
Max dispatch width: 1344 -> 1152 (-14.29%)
Hitman3:
Totals from 5072 (95.39% of 5317) affected shaders:
Instrs: 2879952 -> 2841804 (-1.32%); split: -1.32%, +0.00%
Cycle count: 153208505 -> 165860401 (+8.26%); split: -2.22%, +10.48%
Spill count: 3942 -> 3200 (-18.82%)
Fill count: 10158 -> 8846 (-12.92%)
Scratch Memory Size: 257024 -> 223232 (-13.15%)
Max live registers: 328467 -> 324631 (-1.17%)
Max dispatch width: 43928 -> 42768 (-2.64%); split: +0.09%, -2.73%
Fortnite:
Totals from 360 (4.82% of 7472) affected shaders:
Instrs: 778068 -> 777925 (-0.02%)
Subgroup size: 3128 -> 3136 (+0.26%)
Cycle count: 38684183 -> 38734579 (+0.13%); split: -0.06%, +0.19%
Max live registers: 50689 -> 50658 (-0.06%)
Hogwarts Legacy:
Totals from 1376 (84.00% of 1638) affected shaders:
Instrs: 758810 -> 749727 (-1.20%); split: -1.23%, +0.03%
Cycle count: 27778983 -> 28805469 (+3.70%); split: -1.42%, +5.12%
Spill count: 2475 -> 2299 (-7.11%); split: -7.47%, +0.36%
Fill count: 2677 -> 2445 (-8.67%); split: -9.90%, +1.23%
Scratch Memory Size: 99328 -> 89088 (-10.31%)
Max live registers: 84969 -> 84671 (-0.35%); split: -0.58%, +0.23%
Max dispatch width: 11848 -> 11920 (+0.61%)
Metro Exodus:
Totals from 92 (0.21% of 43072) affected shaders:
Instrs: 262995 -> 262968 (-0.01%)
Cycle count: 13818007 -> 13851266 (+0.24%); split: -0.01%, +0.25%
Max live registers: 11152 -> 11140 (-0.11%)
Red Dead Redemption 2 :
Totals from 451 (7.71% of 5847) affected shaders:
Instrs: 754178 -> 753811 (-0.05%); split: -0.05%, +0.00%
Cycle count: 3484078523 -> 3484111965 (+0.00%); split: -0.00%, +0.00%
Max live registers: 42294 -> 42185 (-0.26%)
Spiderman Remastered:
Totals from 6820 (98.02% of 6958) affected shaders:
Instrs: 6921500 -> 6747933 (-2.51%); split: -4.16%, +1.65%
Cycle count: 234400692460 -> 236846720707 (+1.04%); split: -0.20%, +1.25%
Spill count: 72971 -> 72622 (-0.48%); split: -8.08%, +7.61%
Fill count: 212921 -> 198483 (-6.78%); split: -12.37%, +5.58%
Scratch Memory Size: 3491840 -> 3410944 (-2.32%); split: -12.05%, +9.74%
Max live registers: 493149 -> 487458 (-1.15%)
Max dispatch width: 56936 -> 56856 (-0.14%); split: +0.06%, -0.20%
Strange Brigade:
Totals from 3769 (91.21% of 4132) affected shaders:
Instrs: 1354476 -> 1321474 (-2.44%)
Cycle count: 25351530 -> 25339190 (-0.05%); split: -1.64%, +1.59%
Max live registers: 199057 -> 193656 (-2.71%)
Max dispatch width: 30272 -> 30240 (-0.11%)
Witcher 3:
Totals from 25 (2.40% of 1041) affected shaders:
Instrs: 24621 -> 24606 (-0.06%)
Cycle count: 2218793 -> 2217503 (-0.06%); split: -0.11%, +0.05%
Max live registers: 1963 -> 1955 (-0.41%)
LNL results:
Assassin's Creed Valhalla:
Totals from 1928 (98.02% of 1967) affected shaders:
Instrs: 856107 -> 835756 (-2.38%); split: -2.48%, +0.11%
Subgroup size: 41264 -> 41280 (+0.04%)
Cycle count: 64606590 -> 62371700 (-3.46%); split: -5.57%, +2.11%
Spill count: 915 -> 669 (-26.89%); split: -32.79%, +5.90%
Fill count: 2414 -> 1617 (-33.02%); split: -36.62%, +3.60%
Scratch Memory Size: 62464 -> 44032 (-29.51%); split: -36.07%, +6.56%
Max live registers: 205483 -> 202192 (-1.60%)
Cyberpunk 2077:
Totals from 1177 (96.40% of 1221) affected shaders:
Instrs: 682237 -> 678931 (-0.48%); split: -0.51%, +0.03%
Subgroup size: 24912 -> 24944 (+0.13%)
Cycle count: 24355928 -> 25089292 (+3.01%); split: -0.80%, +3.81%
Spill count: 8 -> 3 (-62.50%)
Fill count: 6 -> 3 (-50.00%)
Max live registers: 126922 -> 125472 (-1.14%)
Dota2:
Totals from 428 (32.47% of 1318) affected shaders:
Instrs: 89355 -> 89740 (+0.43%)
Cycle count: 1152412 -> 1152706 (+0.03%); split: -0.52%, +0.55%
Max live registers: 32863 -> 32847 (-0.05%)
Fortnite:
Totals from 5354 (81.72% of 6552) affected shaders:
Instrs: 4135059 -> 4239015 (+2.51%); split: -0.01%, +2.53%
Cycle count: 132557506 -> 132427302 (-0.10%); split: -0.75%, +0.65%
Spill count: 7144 -> 7234 (+1.26%); split: -0.46%, +1.72%
Fill count: 12086 -> 12403 (+2.62%); split: -0.73%, +3.35%
Scratch Memory Size: 600064 -> 604160 (+0.68%); split: -1.02%, +1.71%
Hitman3:
Totals from 4912 (97.09% of 5059) affected shaders:
Instrs: 2952124 -> 2916824 (-1.20%); split: -1.20%, +0.00%
Cycle count: 179985656 -> 189175250 (+5.11%); split: -2.44%, +7.55%
Spill count: 3739 -> 3136 (-16.13%)
Fill count: 10657 -> 9564 (-10.26%)
Scratch Memory Size: 373760 -> 318464 (-14.79%)
Max live registers: 597566 -> 589460 (-1.36%)
Hogwarts Legacy:
Totals from 1471 (96.33% of 1527) affected shaders:
Instrs: 748749 -> 766214 (+2.33%); split: -0.71%, +3.05%
Cycle count: 33301528 -> 34426308 (+3.38%); split: -1.30%, +4.68%
Spill count: 3278 -> 3070 (-6.35%); split: -8.30%, +1.95%
Fill count: 4553 -> 4097 (-10.02%); split: -10.85%, +0.83%
Scratch Memory Size: 251904 -> 217088 (-13.82%)
Max live registers: 168911 -> 168106 (-0.48%); split: -0.59%, +0.12%
Metro Exodus:
Totals from 18356 (49.81% of 36854) affected shaders:
Instrs: 7559386 -> 7621591 (+0.82%); split: -0.01%, +0.83%
Cycle count: 195240612 -> 196455186 (+0.62%); split: -1.22%, +1.84%
Spill count: 595 -> 546 (-8.24%)
Fill count: 1604 -> 1408 (-12.22%)
Max live registers: 2086937 -> 2086933 (-0.00%)
Red Dead Redemption 2:
Totals from 4171 (79.31% of 5259) affected shaders:
Instrs: 2619392 -> 2719587 (+3.83%); split: -0.00%, +3.83%
Subgroup size: 86416 -> 86432 (+0.02%)
Cycle count: 8542836160 -> 8531976886 (-0.13%); split: -0.65%, +0.53%
Fill count: 12949 -> 12970 (+0.16%); split: -0.43%, +0.59%
Scratch Memory Size: 401408 -> 385024 (-4.08%)
Spiderman Remastered:
Totals from 6639 (98.94% of 6710) affected shaders:
Instrs: 6877980 -> 6800592 (-1.13%); split: -3.11%, +1.98%
Cycle count: 282183352210 -> 282100051824 (-0.03%); split: -0.62%, +0.59%
Spill count: 63147 -> 64218 (+1.70%); split: -7.12%, +8.82%
Fill count: 184931 -> 175591 (-5.05%); split: -10.81%, +5.76%
Scratch Memory Size: 5318656 -> 5970944 (+12.26%); split: -5.91%, +18.17%
Max live registers: 918240 -> 906604 (-1.27%)
Strange Brigade:
Totals from 3675 (92.24% of 3984) affected shaders:
Instrs: 1462231 -> 1429345 (-2.25%); split: -2.25%, +0.00%
Cycle count: 37404050 -> 37345292 (-0.16%); split: -1.25%, +1.09%
Max live registers: 361849 -> 351265 (-2.92%)
Witcher 3:
Totals from 13 (46.43% of 28) affected shaders:
Instrs: 593 -> 660 (+11.30%)
Cycle count: 28302 -> 28714 (+1.46%)
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28199>
2024-02-29 20:51:50 +02:00
|
|
|
uint32_t desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
|
|
|
|
|
LSC_ADDR_SURFTYPE_FLAT,
|
|
|
|
|
LSC_ADDR_SIZE_A32,
|
|
|
|
|
LSC_DATA_SIZE_D32,
|
|
|
|
|
num_regs * 8 /* num_channels */,
|
|
|
|
|
true /* transpose */,
|
|
|
|
|
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
|
2022-07-15 13:08:23 +03:00
|
|
|
send->header_size = 0;
|
2022-09-28 16:17:02 -07:00
|
|
|
send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
|
2022-07-15 13:08:23 +03:00
|
|
|
send->size_written =
|
2022-09-28 16:17:02 -07:00
|
|
|
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
|
2020-05-04 16:17:58 -05:00
|
|
|
send->send_is_volatile = true;
|
|
|
|
|
|
brw: move final send lowering up into the IR
Because we do emit the final send message form in code generation, a
lot of emissions look like this :
add(8) vgrf0, u0, 0x100
mov(1) a0.1, vgrf0 # emitted by the generator
send(8) ..., a0.1
By moving address register manipulation in the IR, we can get this
down to :
add(1) a0.1, u0, 0x100
send(8) ..., a0.1
This reduce register pressure around some send messages by 1 vgrf.
All lost shaders in the below results are fragment SIMD32, due to the
throughput estimator. If turned off, we loose no SIMD32 shaders with
this change.
DG2 results:
Assassin's Creed Valhalla:
Totals from 2044 (96.87% of 2110) affected shaders:
Instrs: 852879 -> 832044 (-2.44%); split: -2.45%, +0.00%
Subgroup size: 23832 -> 23824 (-0.03%)
Cycle count: 53345742 -> 52144277 (-2.25%); split: -5.08%, +2.82%
Spill count: 729 -> 554 (-24.01%); split: -28.40%, +4.39%
Fill count: 2005 -> 1256 (-37.36%)
Scratch Memory Size: 25600 -> 19456 (-24.00%); split: -32.00%, +8.00%
Max live registers: 116765 -> 115058 (-1.46%)
Max dispatch width: 19152 -> 18872 (-1.46%); split: +0.21%, -1.67%
Cyberpunk 2077:
Totals from 1181 (93.43% of 1264) affected shaders:
Instrs: 667192 -> 663615 (-0.54%); split: -0.55%, +0.01%
Subgroup size: 13016 -> 13032 (+0.12%)
Cycle count: 17383539 -> 17986073 (+3.47%); split: -0.93%, +4.39%
Spill count: 12 -> 8 (-33.33%)
Fill count: 9 -> 6 (-33.33%)
Dota2:
Totals from 173 (11.59% of 1493) affected shaders:
Cycle count: 274403 -> 280817 (+2.34%); split: -0.01%, +2.34%
Max live registers: 5787 -> 5779 (-0.14%)
Max dispatch width: 1344 -> 1152 (-14.29%)
Hitman3:
Totals from 5072 (95.39% of 5317) affected shaders:
Instrs: 2879952 -> 2841804 (-1.32%); split: -1.32%, +0.00%
Cycle count: 153208505 -> 165860401 (+8.26%); split: -2.22%, +10.48%
Spill count: 3942 -> 3200 (-18.82%)
Fill count: 10158 -> 8846 (-12.92%)
Scratch Memory Size: 257024 -> 223232 (-13.15%)
Max live registers: 328467 -> 324631 (-1.17%)
Max dispatch width: 43928 -> 42768 (-2.64%); split: +0.09%, -2.73%
Fortnite:
Totals from 360 (4.82% of 7472) affected shaders:
Instrs: 778068 -> 777925 (-0.02%)
Subgroup size: 3128 -> 3136 (+0.26%)
Cycle count: 38684183 -> 38734579 (+0.13%); split: -0.06%, +0.19%
Max live registers: 50689 -> 50658 (-0.06%)
Hogwarts Legacy:
Totals from 1376 (84.00% of 1638) affected shaders:
Instrs: 758810 -> 749727 (-1.20%); split: -1.23%, +0.03%
Cycle count: 27778983 -> 28805469 (+3.70%); split: -1.42%, +5.12%
Spill count: 2475 -> 2299 (-7.11%); split: -7.47%, +0.36%
Fill count: 2677 -> 2445 (-8.67%); split: -9.90%, +1.23%
Scratch Memory Size: 99328 -> 89088 (-10.31%)
Max live registers: 84969 -> 84671 (-0.35%); split: -0.58%, +0.23%
Max dispatch width: 11848 -> 11920 (+0.61%)
Metro Exodus:
Totals from 92 (0.21% of 43072) affected shaders:
Instrs: 262995 -> 262968 (-0.01%)
Cycle count: 13818007 -> 13851266 (+0.24%); split: -0.01%, +0.25%
Max live registers: 11152 -> 11140 (-0.11%)
Red Dead Redemption 2 :
Totals from 451 (7.71% of 5847) affected shaders:
Instrs: 754178 -> 753811 (-0.05%); split: -0.05%, +0.00%
Cycle count: 3484078523 -> 3484111965 (+0.00%); split: -0.00%, +0.00%
Max live registers: 42294 -> 42185 (-0.26%)
Spiderman Remastered:
Totals from 6820 (98.02% of 6958) affected shaders:
Instrs: 6921500 -> 6747933 (-2.51%); split: -4.16%, +1.65%
Cycle count: 234400692460 -> 236846720707 (+1.04%); split: -0.20%, +1.25%
Spill count: 72971 -> 72622 (-0.48%); split: -8.08%, +7.61%
Fill count: 212921 -> 198483 (-6.78%); split: -12.37%, +5.58%
Scratch Memory Size: 3491840 -> 3410944 (-2.32%); split: -12.05%, +9.74%
Max live registers: 493149 -> 487458 (-1.15%)
Max dispatch width: 56936 -> 56856 (-0.14%); split: +0.06%, -0.20%
Strange Brigade:
Totals from 3769 (91.21% of 4132) affected shaders:
Instrs: 1354476 -> 1321474 (-2.44%)
Cycle count: 25351530 -> 25339190 (-0.05%); split: -1.64%, +1.59%
Max live registers: 199057 -> 193656 (-2.71%)
Max dispatch width: 30272 -> 30240 (-0.11%)
Witcher 3:
Totals from 25 (2.40% of 1041) affected shaders:
Instrs: 24621 -> 24606 (-0.06%)
Cycle count: 2218793 -> 2217503 (-0.06%); split: -0.11%, +0.05%
Max live registers: 1963 -> 1955 (-0.41%)
LNL results:
Assassin's Creed Valhalla:
Totals from 1928 (98.02% of 1967) affected shaders:
Instrs: 856107 -> 835756 (-2.38%); split: -2.48%, +0.11%
Subgroup size: 41264 -> 41280 (+0.04%)
Cycle count: 64606590 -> 62371700 (-3.46%); split: -5.57%, +2.11%
Spill count: 915 -> 669 (-26.89%); split: -32.79%, +5.90%
Fill count: 2414 -> 1617 (-33.02%); split: -36.62%, +3.60%
Scratch Memory Size: 62464 -> 44032 (-29.51%); split: -36.07%, +6.56%
Max live registers: 205483 -> 202192 (-1.60%)
Cyberpunk 2077:
Totals from 1177 (96.40% of 1221) affected shaders:
Instrs: 682237 -> 678931 (-0.48%); split: -0.51%, +0.03%
Subgroup size: 24912 -> 24944 (+0.13%)
Cycle count: 24355928 -> 25089292 (+3.01%); split: -0.80%, +3.81%
Spill count: 8 -> 3 (-62.50%)
Fill count: 6 -> 3 (-50.00%)
Max live registers: 126922 -> 125472 (-1.14%)
Dota2:
Totals from 428 (32.47% of 1318) affected shaders:
Instrs: 89355 -> 89740 (+0.43%)
Cycle count: 1152412 -> 1152706 (+0.03%); split: -0.52%, +0.55%
Max live registers: 32863 -> 32847 (-0.05%)
Fortnite:
Totals from 5354 (81.72% of 6552) affected shaders:
Instrs: 4135059 -> 4239015 (+2.51%); split: -0.01%, +2.53%
Cycle count: 132557506 -> 132427302 (-0.10%); split: -0.75%, +0.65%
Spill count: 7144 -> 7234 (+1.26%); split: -0.46%, +1.72%
Fill count: 12086 -> 12403 (+2.62%); split: -0.73%, +3.35%
Scratch Memory Size: 600064 -> 604160 (+0.68%); split: -1.02%, +1.71%
Hitman3:
Totals from 4912 (97.09% of 5059) affected shaders:
Instrs: 2952124 -> 2916824 (-1.20%); split: -1.20%, +0.00%
Cycle count: 179985656 -> 189175250 (+5.11%); split: -2.44%, +7.55%
Spill count: 3739 -> 3136 (-16.13%)
Fill count: 10657 -> 9564 (-10.26%)
Scratch Memory Size: 373760 -> 318464 (-14.79%)
Max live registers: 597566 -> 589460 (-1.36%)
Hogwarts Legacy:
Totals from 1471 (96.33% of 1527) affected shaders:
Instrs: 748749 -> 766214 (+2.33%); split: -0.71%, +3.05%
Cycle count: 33301528 -> 34426308 (+3.38%); split: -1.30%, +4.68%
Spill count: 3278 -> 3070 (-6.35%); split: -8.30%, +1.95%
Fill count: 4553 -> 4097 (-10.02%); split: -10.85%, +0.83%
Scratch Memory Size: 251904 -> 217088 (-13.82%)
Max live registers: 168911 -> 168106 (-0.48%); split: -0.59%, +0.12%
Metro Exodus:
Totals from 18356 (49.81% of 36854) affected shaders:
Instrs: 7559386 -> 7621591 (+0.82%); split: -0.01%, +0.83%
Cycle count: 195240612 -> 196455186 (+0.62%); split: -1.22%, +1.84%
Spill count: 595 -> 546 (-8.24%)
Fill count: 1604 -> 1408 (-12.22%)
Max live registers: 2086937 -> 2086933 (-0.00%)
Red Dead Redemption 2:
Totals from 4171 (79.31% of 5259) affected shaders:
Instrs: 2619392 -> 2719587 (+3.83%); split: -0.00%, +3.83%
Subgroup size: 86416 -> 86432 (+0.02%)
Cycle count: 8542836160 -> 8531976886 (-0.13%); split: -0.65%, +0.53%
Fill count: 12949 -> 12970 (+0.16%); split: -0.43%, +0.59%
Scratch Memory Size: 401408 -> 385024 (-4.08%)
Spiderman Remastered:
Totals from 6639 (98.94% of 6710) affected shaders:
Instrs: 6877980 -> 6800592 (-1.13%); split: -3.11%, +1.98%
Cycle count: 282183352210 -> 282100051824 (-0.03%); split: -0.62%, +0.59%
Spill count: 63147 -> 64218 (+1.70%); split: -7.12%, +8.82%
Fill count: 184931 -> 175591 (-5.05%); split: -10.81%, +5.76%
Scratch Memory Size: 5318656 -> 5970944 (+12.26%); split: -5.91%, +18.17%
Max live registers: 918240 -> 906604 (-1.27%)
Strange Brigade:
Totals from 3675 (92.24% of 3984) affected shaders:
Instrs: 1462231 -> 1429345 (-2.25%); split: -2.25%, +0.00%
Cycle count: 37404050 -> 37345292 (-0.16%); split: -1.25%, +1.09%
Max live registers: 361849 -> 351265 (-2.92%)
Witcher 3:
Totals from 13 (46.43% of 28) affected shaders:
Instrs: 593 -> 660 (+11.30%)
Cycle count: 28302 -> 28714 (+1.46%)
Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/28199>
2024-02-29 20:51:50 +02:00
|
|
|
send->src[0] = brw_imm_ud(desc |
|
|
|
|
|
brw_message_desc(devinfo,
|
|
|
|
|
send->mlen,
|
|
|
|
|
send->size_written / REG_SIZE,
|
|
|
|
|
send->header_size));
|
|
|
|
|
|
2020-05-04 16:17:58 -05:00
|
|
|
i += num_regs;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
|
|
|
|
}
|
|
|
|
|
|
2010-08-26 16:39:41 -07:00
|
|
|
/* Map the offsets in the UNIFORM file to fixed HW regs. */
|
2014-09-01 13:35:04 -07:00
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2014-03-17 10:39:43 -07:00
|
|
|
for (unsigned int i = 0; i < inst->sources; i++) {
|
2010-08-26 16:39:41 -07:00
|
|
|
if (inst->src[i].file == UNIFORM) {
|
2016-09-01 12:42:20 -07:00
|
|
|
int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
|
2014-03-11 14:35:27 -07:00
|
|
|
int constant_nr;
|
2016-11-29 05:20:20 -08:00
|
|
|
if (inst->src[i].nr >= UBO_START) {
|
|
|
|
|
/* constant_nr is in 32-bit units, the rest are in bytes */
|
|
|
|
|
constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
|
|
|
|
|
inst->src[i].offset / 4;
|
|
|
|
|
} else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
|
2025-01-01 23:52:33 -08:00
|
|
|
constant_nr = uniform_nr;
|
2014-03-11 14:35:27 -07:00
|
|
|
} else {
|
|
|
|
|
/* Section 5.11 of the OpenGL 4.1 spec says:
|
|
|
|
|
* "Out-of-bounds reads return undefined values, which include
|
|
|
|
|
* values from other variables of the active program or zero."
|
|
|
|
|
* Just return the first push constant.
|
|
|
|
|
*/
|
|
|
|
|
constant_nr = 0;
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-03 20:20:53 -05:00
|
|
|
assert(constant_nr / 8 < 64);
|
|
|
|
|
used |= BITFIELD64_BIT(constant_nr / 8);
|
|
|
|
|
|
2022-08-19 12:40:20 -07:00
|
|
|
struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
|
2010-08-27 14:15:42 -07:00
|
|
|
constant_nr / 8,
|
|
|
|
|
constant_nr % 8);
|
2015-10-24 15:29:03 -07:00
|
|
|
brw_reg.abs = inst->src[i].abs;
|
|
|
|
|
brw_reg.negate = inst->src[i].negate;
|
2010-08-26 16:39:41 -07:00
|
|
|
|
2024-02-01 15:02:37 -08:00
|
|
|
/* The combination of is_scalar for load_uniform, copy prop, and
|
|
|
|
|
* lower_btd_logical_send can generate a MOV from a UNIFORM with
|
|
|
|
|
* exec size 2 and stride of 1.
|
|
|
|
|
*/
|
|
|
|
|
assert(inst->src[i].stride == 0 || inst->exec_size == 2);
|
2015-10-24 15:29:03 -07:00
|
|
|
inst->src[i] = byte_offset(
|
2013-12-08 04:57:08 +01:00
|
|
|
retype(brw_reg, inst->src[i].type),
|
2016-09-01 15:11:21 -07:00
|
|
|
inst->src[i].offset % 4);
|
2010-08-26 16:39:41 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2014-10-03 19:05:32 -07:00
|
|
|
|
2024-02-19 23:07:04 -08:00
|
|
|
uint64_t want_zero = used & prog_data->zero_push_reg;
|
2020-04-03 20:20:53 -05:00
|
|
|
if (want_zero) {
|
2023-11-21 09:47:18 -08:00
|
|
|
fs_builder ubld = fs_builder(this, 8).exec_all().at(
|
2020-04-03 20:20:53 -05:00
|
|
|
cfg->first_block(), cfg->first_block()->start());
|
|
|
|
|
|
|
|
|
|
/* push_reg_mask_param is in 32-bit units */
|
2024-02-19 23:07:04 -08:00
|
|
|
unsigned mask_param = prog_data->push_reg_mask_param;
|
2022-08-19 12:40:20 -07:00
|
|
|
struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
|
|
|
|
|
mask_param % 8);
|
2020-04-03 20:20:53 -05:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg b32;
|
2020-04-03 20:20:53 -05:00
|
|
|
for (unsigned i = 0; i < 64; i++) {
|
|
|
|
|
if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
|
2020-04-03 20:20:53 -05:00
|
|
|
ubld.SHL(horiz_offset(shifted, 8),
|
2024-04-20 17:08:02 -07:00
|
|
|
byte_offset(retype(mask, BRW_TYPE_W), i / 8),
|
2020-04-03 20:20:53 -05:00
|
|
|
brw_imm_v(0x01234567));
|
|
|
|
|
ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
|
|
|
|
|
|
|
|
|
|
fs_builder ubld16 = ubld.group(16, 0);
|
2024-04-20 17:08:02 -07:00
|
|
|
b32 = ubld16.vgrf(BRW_TYPE_D);
|
2020-04-03 20:20:53 -05:00
|
|
|
ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (want_zero & BITFIELD64_BIT(i)) {
|
|
|
|
|
assert(i < prog_data->curb_read_length);
|
|
|
|
|
struct brw_reg push_reg =
|
2024-04-20 17:30:23 -07:00
|
|
|
retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);
|
2020-04-03 20:20:53 -05:00
|
|
|
|
|
|
|
|
ubld.AND(push_reg, push_reg, component(b32, i % 16));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
|
|
|
|
}
|
|
|
|
|
|
2014-10-03 19:05:32 -07:00
|
|
|
/* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
|
2022-08-19 12:40:20 -07:00
|
|
|
this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
|
2010-08-26 16:39:41 -07:00
|
|
|
}
|
|
|
|
|
|
2018-12-11 18:45:43 +01:00
|
|
|
/*
|
|
|
|
|
* Build up an array of indices into the urb_setup array that
|
|
|
|
|
* references the active entries of the urb_setup array.
|
|
|
|
|
* Used to accelerate walking the active entries of the urb_setup array
|
|
|
|
|
* on each upload.
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
|
|
|
|
|
{
|
2021-10-29 12:56:22 -07:00
|
|
|
/* TODO(mesh): Review usage of this in the context of Mesh, we may want to
|
|
|
|
|
* skip per-primitive attributes here.
|
|
|
|
|
*/
|
|
|
|
|
|
2018-12-11 18:45:43 +01:00
|
|
|
/* Make sure uint8_t is sufficient */
|
|
|
|
|
STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
|
|
|
|
|
uint8_t index = 0;
|
|
|
|
|
for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
|
|
|
|
|
if (wm_prog_data->urb_setup[attr] >= 0) {
|
|
|
|
|
wm_prog_data->urb_setup_attribs[index++] = attr;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
wm_prog_data->urb_setup_attribs_count = index;
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
void
|
|
|
|
|
fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
|
|
|
|
|
{
|
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
|
|
|
|
if (inst->src[i].file == ATTR) {
|
2022-09-12 16:49:11 -07:00
|
|
|
assert(inst->src[i].nr == 0);
|
2022-08-19 12:40:20 -07:00
|
|
|
int grf = payload().num_regs +
|
2015-03-11 23:14:31 -07:00
|
|
|
prog_data->curb_read_length +
|
2016-09-01 12:42:20 -07:00
|
|
|
inst->src[i].offset / REG_SIZE;
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2016-03-23 12:20:05 +01:00
|
|
|
/* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
|
|
|
|
|
*
|
|
|
|
|
* VertStride must be used to cross GRF register boundaries. This
|
|
|
|
|
* rule implies that elements within a 'Width' cannot cross GRF
|
|
|
|
|
* boundaries.
|
|
|
|
|
*
|
|
|
|
|
* So, for registers that are large enough, we have to split the exec
|
|
|
|
|
* size in two and trust the compression state to sort it out.
|
|
|
|
|
*/
|
|
|
|
|
unsigned total_size = inst->exec_size *
|
|
|
|
|
inst->src[i].stride *
|
2024-04-21 00:57:59 -07:00
|
|
|
brw_type_size_bytes(inst->src[i].type);
|
2016-03-23 12:20:05 +01:00
|
|
|
|
|
|
|
|
assert(total_size <= 2 * REG_SIZE);
|
|
|
|
|
const unsigned exec_size =
|
|
|
|
|
(total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
|
|
|
|
|
|
|
|
|
|
unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
|
2015-10-24 15:29:03 -07:00
|
|
|
struct brw_reg reg =
|
2015-03-11 23:14:31 -07:00
|
|
|
stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
|
2016-09-01 15:11:21 -07:00
|
|
|
inst->src[i].offset % REG_SIZE),
|
2016-03-23 12:20:05 +01:00
|
|
|
exec_size * inst->src[i].stride,
|
2015-11-11 22:37:53 -08:00
|
|
|
width, inst->src[i].stride);
|
2015-10-24 15:29:03 -07:00
|
|
|
reg.abs = inst->src[i].abs;
|
|
|
|
|
reg.negate = inst->src[i].negate;
|
|
|
|
|
|
|
|
|
|
inst->src[i] = reg;
|
2015-03-11 23:14:31 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-30 00:47:32 -07:00
|
|
|
int
|
|
|
|
|
brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
|
|
|
|
|
const brw_stage_prog_data *prog_data)
|
2017-09-29 12:22:48 -07:00
|
|
|
{
|
|
|
|
|
if (prog_data->nr_params == 0)
|
|
|
|
|
return -1;
|
|
|
|
|
|
2021-03-29 13:43:47 -07:00
|
|
|
if (devinfo->verx10 >= 125)
|
2020-06-16 23:06:25 -05:00
|
|
|
return -1;
|
|
|
|
|
|
2017-09-29 12:22:48 -07:00
|
|
|
/* The local thread id is always the last parameter in the list */
|
|
|
|
|
uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
|
2017-08-24 11:40:31 -07:00
|
|
|
if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
|
2017-09-29 12:22:48 -07:00
|
|
|
return prog_data->nr_params - 1;
|
|
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
2020-01-23 12:50:50 -08:00
|
|
|
/**
|
|
|
|
|
* Get the mask of SIMD channels enabled during dispatch and not yet disabled
|
|
|
|
|
* by discard. Due to the layout of the sample mask in the fragment shader
|
|
|
|
|
* thread payload, \p bld is required to have a dispatch_width() not greater
|
|
|
|
|
* than 16 for fragment shaders.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg
|
2022-06-27 12:24:58 -07:00
|
|
|
brw_sample_mask_reg(const fs_builder &bld)
|
2020-01-23 12:50:50 -08:00
|
|
|
{
|
2023-12-05 17:16:34 -08:00
|
|
|
const fs_visitor &s = *bld.shader;
|
2020-01-23 12:50:50 -08:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
if (s.stage != MESA_SHADER_FRAGMENT) {
|
2020-01-23 12:50:50 -08:00
|
|
|
return brw_imm_ud(0xffffffff);
|
2022-08-16 17:40:31 -07:00
|
|
|
} else if (s.devinfo->ver >= 20 ||
|
|
|
|
|
brw_wm_prog_data(s.prog_data)->uses_kill) {
|
2023-12-05 17:16:34 -08:00
|
|
|
return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
|
2020-01-23 12:50:50 -08:00
|
|
|
} else {
|
2024-02-17 22:43:47 -08:00
|
|
|
assert(bld.dispatch_width() <= 16);
|
2022-06-11 17:36:09 -07:00
|
|
|
assert(s.devinfo->ver < 20);
|
2020-01-23 12:50:50 -08:00
|
|
|
return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
|
2024-04-20 17:08:02 -07:00
|
|
|
BRW_TYPE_UW);
|
2020-01-23 12:50:50 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-08-25 23:59:25 -07:00
|
|
|
uint32_t
|
|
|
|
|
brw_fb_write_msg_control(const fs_inst *inst,
|
|
|
|
|
const struct brw_wm_prog_data *prog_data)
|
|
|
|
|
{
|
|
|
|
|
uint32_t mctl;
|
|
|
|
|
|
2024-02-19 19:41:48 -08:00
|
|
|
if (prog_data->dual_src_blend) {
|
2022-07-22 17:33:12 -07:00
|
|
|
assert(inst->exec_size < 32);
|
2019-08-25 23:59:25 -07:00
|
|
|
|
|
|
|
|
if (inst->group % 16 == 0)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
|
|
|
|
|
else if (inst->group % 16 == 8)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
|
|
|
|
|
else
|
|
|
|
|
unreachable("Invalid dual-source FB write instruction group");
|
|
|
|
|
} else {
|
|
|
|
|
assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
|
|
|
|
|
|
|
|
|
|
if (inst->exec_size == 16)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
|
|
|
|
|
else if (inst->exec_size == 8)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
|
2022-07-22 17:33:12 -07:00
|
|
|
else if (inst->exec_size == 32)
|
|
|
|
|
mctl = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
|
2019-08-25 23:59:25 -07:00
|
|
|
else
|
|
|
|
|
unreachable("Invalid FB write execution size");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return mctl;
|
|
|
|
|
}
|
|
|
|
|
|
2022-06-27 12:24:58 -07:00
|
|
|
/**
|
|
|
|
|
* Predicate the specified instruction on the sample mask.
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
|
|
|
|
|
{
|
|
|
|
|
assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
|
|
|
|
|
bld.group() == inst->group &&
|
|
|
|
|
bld.dispatch_width() == inst->exec_size);
|
2015-07-13 17:59:34 +03:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
const fs_visitor &s = *bld.shader;
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg sample_mask = brw_sample_mask_reg(bld);
|
2023-12-05 17:16:34 -08:00
|
|
|
const unsigned subreg = sample_mask_flag_subreg(s);
|
2015-07-13 17:59:34 +03:00
|
|
|
|
2022-08-16 17:40:31 -07:00
|
|
|
if (s.devinfo->ver >= 20 || brw_wm_prog_data(s.prog_data)->uses_kill) {
|
2022-06-27 12:24:58 -07:00
|
|
|
assert(sample_mask.file == ARF &&
|
|
|
|
|
sample_mask.nr == brw_flag_subreg(subreg).nr &&
|
|
|
|
|
sample_mask.subnr == brw_flag_subreg(
|
|
|
|
|
subreg + inst->group / 16).subnr);
|
|
|
|
|
} else {
|
|
|
|
|
bld.group(1, 0).exec_all()
|
|
|
|
|
.MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
|
2015-07-13 17:59:34 +03:00
|
|
|
}
|
|
|
|
|
|
2022-06-27 12:24:58 -07:00
|
|
|
if (inst->predicate) {
|
|
|
|
|
assert(inst->predicate == BRW_PREDICATE_NORMAL);
|
|
|
|
|
assert(!inst->predicate_inverse);
|
|
|
|
|
assert(inst->flag_subreg == 0);
|
2022-07-22 17:11:52 -07:00
|
|
|
assert(s.devinfo->ver < 20);
|
2022-06-27 12:24:58 -07:00
|
|
|
/* Combine the sample mask with the existing predicate by using a
|
|
|
|
|
* vertical predication mode.
|
2015-11-16 17:23:01 -08:00
|
|
|
*/
|
2022-06-27 12:24:58 -07:00
|
|
|
inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
|
2015-07-13 17:59:34 +03:00
|
|
|
} else {
|
2022-06-27 12:24:58 -07:00
|
|
|
inst->flag_subreg = subreg;
|
|
|
|
|
inst->predicate = BRW_PREDICATE_NORMAL;
|
|
|
|
|
inst->predicate_inverse = false;
|
2015-07-13 17:59:34 +03:00
|
|
|
}
|
2015-07-27 16:14:36 +03:00
|
|
|
}
|
|
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
brw::register_pressure::register_pressure(const fs_visitor *v)
|
2013-08-04 23:27:14 -07:00
|
|
|
{
|
2016-03-13 16:35:49 -07:00
|
|
|
const fs_live_variables &live = v->live_analysis.require();
|
2016-03-13 16:37:03 -07:00
|
|
|
const unsigned num_instructions = v->cfg->num_blocks ?
|
|
|
|
|
v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
|
2013-08-04 23:27:14 -07:00
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
regs_live_at_ip = new unsigned[num_instructions]();
|
2013-08-04 23:27:14 -07:00
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
for (unsigned reg = 0; reg < v->alloc.count; reg++) {
|
2016-03-13 16:25:57 -07:00
|
|
|
for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
|
2016-03-13 16:35:49 -07:00
|
|
|
regs_live_at_ip[ip] += v->alloc.sizes[reg];
|
2013-08-04 23:27:14 -07:00
|
|
|
}
|
2023-08-15 01:15:17 -07:00
|
|
|
|
|
|
|
|
const unsigned payload_count = v->first_non_payload_grf;
|
|
|
|
|
|
|
|
|
|
int *payload_last_use_ip = new int[payload_count];
|
intel/brw: Only force g0's liveness to be the whole program if spilling
We don't actually need to extend g0's live range to the EOT message
generally - most messages that end a shader are headerless. The main
implicit use of g0 is for constructing scratch headers. With the last
two patches, we now consider scratch access that may exist in the IR
and already extend the liveness appropriately.
There is one remaining problem: spilling. The register allocator will
create new scratch messages when spilling a register, which need to
create scratch headers, which need g0. So, every new spill or fill
might extend the live range of g0, which would create new interference,
altering the graph. This can be problematic.
However, when compiling SIMD16 or SIMD32 fragment shaders, we don't
allow spilling anyway. So, why not use allow g0? Also, when trying
various scheduling modes, we first try allocation without spilling.
If it works, great, if not, we try a (hopefully) less aggressive
schedule, and only allow spilling on the lowest-pressure schedule.
So, even for regular SIMD8 shaders, we can potentially gain the use
of g0 on the first few tries at scheduling+allocation.
Once we try to allocate with spilling, we go back to reserving g0
for the entire program, so that we can construct scratch headers at
any point. We could possibly do better here, but this is simple and
reliable with some benefit.
Thanks to Ian Romanick for suggesting I try this approach.
fossil-db on Alchemist shows some more spill/fill improvements:
Totals:
Instrs: 149062395 -> 149053010 (-0.01%); split: -0.01%, +0.00%
Cycles: 12609496913 -> 12611652181 (+0.02%); split: -0.45%, +0.47%
Spill count: 52891 -> 52471 (-0.79%)
Fill count: 101599 -> 100818 (-0.77%)
Scratch Memory Size: 3292160 -> 3197952 (-2.86%)
Totals from 416541 (66.59% of 625484) affected shaders:
Instrs: 124058587 -> 124049202 (-0.01%); split: -0.01%, +0.01%
Cycles: 3567164271 -> 3569319539 (+0.06%); split: -1.61%, +1.67%
Spill count: 420 -> 0 (-inf%)
Fill count: 781 -> 0 (-inf%)
Scratch Memory Size: 94208 -> 0 (-inf%)
Witcher 3 shows a 33% reduction in scratch memory size, for example.
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30319>
2024-07-22 17:22:47 -07:00
|
|
|
v->calculate_payload_ranges(true, payload_count, payload_last_use_ip);
|
2023-08-15 01:15:17 -07:00
|
|
|
|
|
|
|
|
for (unsigned reg = 0; reg < payload_count; reg++) {
|
|
|
|
|
for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
|
|
|
|
|
++regs_live_at_ip[ip];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
delete[] payload_last_use_ip;
|
2013-08-04 23:27:14 -07:00
|
|
|
}
|
|
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
brw::register_pressure::~register_pressure()
|
|
|
|
|
{
|
|
|
|
|
delete[] regs_live_at_ip;
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-12 18:50:24 -08:00
|
|
|
void
|
|
|
|
|
fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
|
|
|
|
|
{
|
2016-03-13 16:25:57 -07:00
|
|
|
live_analysis.invalidate(c);
|
2016-03-13 16:35:49 -07:00
|
|
|
regpressure_analysis.invalidate(c);
|
2024-12-29 16:09:03 -08:00
|
|
|
performance_analysis.invalidate(c);
|
2024-02-19 22:25:16 -08:00
|
|
|
idom_analysis.invalidate(c);
|
2023-11-16 01:16:45 -08:00
|
|
|
def_analysis.invalidate(c);
|
2016-03-12 18:50:24 -08:00
|
|
|
}
|
|
|
|
|
|
2023-08-06 15:46:12 +03:00
|
|
|
void
|
2023-08-14 16:59:17 -07:00
|
|
|
fs_visitor::debug_optimizer(const nir_shader *nir,
|
|
|
|
|
const char *pass_name,
|
2023-08-06 15:46:12 +03:00
|
|
|
int iteration, int pass_num) const
|
|
|
|
|
{
|
2023-08-14 16:59:17 -07:00
|
|
|
if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
|
2023-08-06 15:46:12 +03:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
char *filename;
|
2023-08-07 17:06:49 +03:00
|
|
|
int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
|
|
|
|
|
debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
|
2023-09-24 21:38:47 -07:00
|
|
|
_mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
|
2023-08-06 15:46:12 +03:00
|
|
|
iteration, pass_num, pass_name);
|
|
|
|
|
if (ret == -1)
|
|
|
|
|
return;
|
2024-12-07 09:53:31 -08:00
|
|
|
|
|
|
|
|
FILE *file = stderr;
|
|
|
|
|
if (__normal_user()) {
|
|
|
|
|
file = fopen(filename, "w");
|
|
|
|
|
if (!file)
|
|
|
|
|
file = stderr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
brw_print_instructions(*this, file);
|
|
|
|
|
|
|
|
|
|
if (file != stderr)
|
|
|
|
|
fclose(file);
|
|
|
|
|
|
2023-08-06 15:46:12 +03:00
|
|
|
free(filename);
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
static uint32_t
|
|
|
|
|
brw_compute_max_register_pressure(fs_visitor &s)
|
2023-02-03 17:02:28 +01:00
|
|
|
{
|
2024-07-12 16:55:33 -07:00
|
|
|
const register_pressure &rp = s.regpressure_analysis.require();
|
2023-02-03 17:02:28 +01:00
|
|
|
uint32_t ip = 0, max_pressure = 0;
|
2024-07-12 16:55:33 -07:00
|
|
|
foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
|
2023-02-03 17:02:28 +01:00
|
|
|
max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
|
|
|
|
|
ip++;
|
|
|
|
|
}
|
|
|
|
|
return max_pressure;
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-23 02:19:06 -07:00
|
|
|
static fs_inst **
|
|
|
|
|
save_instruction_order(const struct cfg_t *cfg)
|
|
|
|
|
{
|
|
|
|
|
/* Before we schedule anything, stash off the instruction order as an array
|
|
|
|
|
* of fs_inst *. This way, we can reset it between scheduling passes to
|
|
|
|
|
* prevent dependencies between the different scheduling modes.
|
|
|
|
|
*/
|
|
|
|
|
int num_insts = cfg->last_block()->end_ip + 1;
|
|
|
|
|
fs_inst **inst_arr = new fs_inst * [num_insts];
|
|
|
|
|
|
|
|
|
|
int ip = 0;
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
|
|
|
|
assert(ip >= block->start_ip && ip <= block->end_ip);
|
|
|
|
|
inst_arr[ip++] = inst;
|
|
|
|
|
}
|
|
|
|
|
assert(ip == num_insts);
|
|
|
|
|
|
|
|
|
|
return inst_arr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr)
|
|
|
|
|
{
|
2024-01-03 16:31:23 +10:00
|
|
|
ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
|
2023-08-23 02:19:06 -07:00
|
|
|
|
|
|
|
|
int ip = 0;
|
|
|
|
|
foreach_block (block, cfg) {
|
|
|
|
|
block->instructions.make_empty();
|
|
|
|
|
|
|
|
|
|
assert(ip == block->start_ip);
|
|
|
|
|
for (; ip <= block->end_ip; ip++)
|
|
|
|
|
block->instructions.push_tail(inst_arr[ip]);
|
|
|
|
|
}
|
|
|
|
|
assert(ip == num_insts);
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-28 13:59:35 -08:00
|
|
|
/* Per-thread scratch space is a power-of-two multiple of 1KB. */
|
|
|
|
|
static inline unsigned
|
|
|
|
|
brw_get_scratch_size(int size)
|
|
|
|
|
{
|
|
|
|
|
return MAX2(1024, util_next_power_of_two(size));
|
|
|
|
|
}
|
|
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
void
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_allocate_registers(fs_visitor &s, bool allow_spilling)
|
2014-11-13 16:28:19 -08:00
|
|
|
{
|
2024-07-12 16:55:33 -07:00
|
|
|
const struct intel_device_info *devinfo = s.devinfo;
|
|
|
|
|
const nir_shader *nir = s.nir;
|
2019-05-09 14:44:16 -05:00
|
|
|
bool allocated;
|
2014-11-13 16:28:19 -08:00
|
|
|
|
2014-12-19 12:55:13 -08:00
|
|
|
static const enum instruction_scheduler_mode pre_modes[] = {
|
2014-11-13 16:28:19 -08:00
|
|
|
SCHEDULE_PRE,
|
|
|
|
|
SCHEDULE_PRE_NON_LIFO,
|
2021-11-09 22:55:49 -06:00
|
|
|
SCHEDULE_NONE,
|
2014-11-13 16:28:19 -08:00
|
|
|
SCHEDULE_PRE_LIFO,
|
|
|
|
|
};
|
|
|
|
|
|
2016-10-17 14:12:28 -07:00
|
|
|
static const char *scheduler_mode_name[] = {
|
2023-08-14 19:35:32 -07:00
|
|
|
[SCHEDULE_PRE] = "top-down",
|
|
|
|
|
[SCHEDULE_PRE_NON_LIFO] = "non-lifo",
|
|
|
|
|
[SCHEDULE_PRE_LIFO] = "lifo",
|
|
|
|
|
[SCHEDULE_POST] = "post",
|
|
|
|
|
[SCHEDULE_NONE] = "none",
|
2016-10-17 14:12:28 -07:00
|
|
|
};
|
|
|
|
|
|
intel/fs: Pick the lowest register pressure schedule when spilling
We try various pre-RA scheduler modes and see if any of them allow
us to register allocate without spilling. If all of them spill,
however, we left it on the last mode: LIFO. This is unfortunately
sometimes significantly worse than other modes (such as "none").
This patch makes us instead select the pre-RA scheduling mode that
gives the lowest register pressure estimate, if none of them manage
to avoid spilling. The hope is that this scheduling will spill the
least out of all of them.
fossil-db stats (on Alchemist) speak for themselves:
Totals:
Instrs: 197297092 -> 195326552 (-1.00%); split: -1.02%, +0.03%
Cycles: 14291286956 -> 14303502596 (+0.09%); split: -0.55%, +0.64%
Spill count: 190886 -> 129204 (-32.31%); split: -33.01%, +0.70%
Fill count: 361408 -> 225038 (-37.73%); split: -39.17%, +1.43%
Scratch Memory Size: 12935168 -> 10868736 (-15.98%); split: -16.08%, +0.10%
Totals from 1791 (0.27% of 668386) affected shaders:
Instrs: 7628929 -> 5658389 (-25.83%); split: -26.50%, +0.67%
Cycles: 719326691 -> 731542331 (+1.70%); split: -10.95%, +12.65%
Spill count: 110627 -> 48945 (-55.76%); split: -56.96%, +1.20%
Fill count: 221560 -> 85190 (-61.55%); split: -63.89%, +2.34%
Scratch Memory Size: 4471808 -> 2405376 (-46.21%); split: -46.51%, +0.30%
Improves performance when using XeSS in Cyberpunk 2077 by 90% on A770.
Improves performance of Borderlands 3 by 1.54% on A770.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24707>
2023-08-14 19:32:25 -07:00
|
|
|
uint32_t best_register_pressure = UINT32_MAX;
|
|
|
|
|
enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
|
|
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
brw_opt_compact_virtual_grfs(s);
|
2023-03-17 09:42:31 +02:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
if (s.needs_register_pressure)
|
|
|
|
|
s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);
|
2023-02-03 17:02:28 +01:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
s.debug_optimizer(nir, "pre_register_allocate", 90, 90);
|
2023-08-06 15:46:12 +03:00
|
|
|
|
2021-10-13 11:21:41 +02:00
|
|
|
bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
|
2016-05-16 14:30:25 -07:00
|
|
|
|
2021-11-09 19:03:19 -06:00
|
|
|
/* Before we schedule anything, stash off the instruction order as an array
|
|
|
|
|
* of fs_inst *. This way, we can reset it between scheduling passes to
|
|
|
|
|
* prevent dependencies between the different scheduling modes.
|
|
|
|
|
*/
|
2024-07-12 16:55:33 -07:00
|
|
|
fs_inst **orig_order = save_instruction_order(s.cfg);
|
intel/fs: Pick the lowest register pressure schedule when spilling
We try various pre-RA scheduler modes and see if any of them allow
us to register allocate without spilling. If all of them spill,
however, we left it on the last mode: LIFO. This is unfortunately
sometimes significantly worse than other modes (such as "none").
This patch makes us instead select the pre-RA scheduling mode that
gives the lowest register pressure estimate, if none of them manage
to avoid spilling. The hope is that this scheduling will spill the
least out of all of them.
fossil-db stats (on Alchemist) speak for themselves:
Totals:
Instrs: 197297092 -> 195326552 (-1.00%); split: -1.02%, +0.03%
Cycles: 14291286956 -> 14303502596 (+0.09%); split: -0.55%, +0.64%
Spill count: 190886 -> 129204 (-32.31%); split: -33.01%, +0.70%
Fill count: 361408 -> 225038 (-37.73%); split: -39.17%, +1.43%
Scratch Memory Size: 12935168 -> 10868736 (-15.98%); split: -16.08%, +0.10%
Totals from 1791 (0.27% of 668386) affected shaders:
Instrs: 7628929 -> 5658389 (-25.83%); split: -26.50%, +0.67%
Cycles: 719326691 -> 731542331 (+1.70%); split: -10.95%, +12.65%
Spill count: 110627 -> 48945 (-55.76%); split: -56.96%, +1.20%
Fill count: 221560 -> 85190 (-61.55%); split: -63.89%, +2.34%
Scratch Memory Size: 4471808 -> 2405376 (-46.21%); split: -46.51%, +0.30%
Improves performance when using XeSS in Cyberpunk 2077 by 90% on A770.
Improves performance of Borderlands 3 by 1.54% on A770.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24707>
2023-08-14 19:32:25 -07:00
|
|
|
fs_inst **best_pressure_order = NULL;
|
2021-11-09 19:03:19 -06:00
|
|
|
|
2023-10-20 10:32:54 -07:00
|
|
|
void *scheduler_ctx = ralloc_context(NULL);
|
2024-07-12 16:55:33 -07:00
|
|
|
instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);
|
2023-10-20 10:32:54 -07:00
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
/* Try each scheduling heuristic to see if it can successfully register
|
|
|
|
|
* allocate without spilling. They should be ordered by decreasing
|
|
|
|
|
* performance but increasing likelihood of allocating.
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
|
2023-08-14 19:35:32 -07:00
|
|
|
enum instruction_scheduler_mode sched_mode = pre_modes[i];
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_schedule_instructions_pre_ra(s, sched, sched_mode);
|
|
|
|
|
s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
|
2014-11-13 16:28:19 -08:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);
|
2023-08-15 01:15:26 -07:00
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
if (0) {
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_assign_regs_trivial(s);
|
2019-05-09 14:44:16 -05:00
|
|
|
allocated = true;
|
|
|
|
|
break;
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
2019-05-09 14:44:16 -05:00
|
|
|
|
|
|
|
|
/* We should only spill registers on the last scheduling. */
|
2024-07-12 16:55:33 -07:00
|
|
|
assert(!s.spilled_any_registers);
|
2019-05-09 14:44:16 -05:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
allocated = brw_assign_regs(s, false, spill_all);
|
2019-05-09 14:44:16 -05:00
|
|
|
if (allocated)
|
2014-11-13 16:28:19 -08:00
|
|
|
break;
|
intel/fs: Pick the lowest register pressure schedule when spilling
We try various pre-RA scheduler modes and see if any of them allow
us to register allocate without spilling. If all of them spill,
however, we left it on the last mode: LIFO. This is unfortunately
sometimes significantly worse than other modes (such as "none").
This patch makes us instead select the pre-RA scheduling mode that
gives the lowest register pressure estimate, if none of them manage
to avoid spilling. The hope is that this scheduling will spill the
least out of all of them.
fossil-db stats (on Alchemist) speak for themselves:
Totals:
Instrs: 197297092 -> 195326552 (-1.00%); split: -1.02%, +0.03%
Cycles: 14291286956 -> 14303502596 (+0.09%); split: -0.55%, +0.64%
Spill count: 190886 -> 129204 (-32.31%); split: -33.01%, +0.70%
Fill count: 361408 -> 225038 (-37.73%); split: -39.17%, +1.43%
Scratch Memory Size: 12935168 -> 10868736 (-15.98%); split: -16.08%, +0.10%
Totals from 1791 (0.27% of 668386) affected shaders:
Instrs: 7628929 -> 5658389 (-25.83%); split: -26.50%, +0.67%
Cycles: 719326691 -> 731542331 (+1.70%); split: -10.95%, +12.65%
Spill count: 110627 -> 48945 (-55.76%); split: -56.96%, +1.20%
Fill count: 221560 -> 85190 (-61.55%); split: -63.89%, +2.34%
Scratch Memory Size: 4471808 -> 2405376 (-46.21%); split: -46.51%, +0.30%
Improves performance when using XeSS in Cyberpunk 2077 by 90% on A770.
Improves performance of Borderlands 3 by 1.54% on A770.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24707>
2023-08-14 19:32:25 -07:00
|
|
|
|
|
|
|
|
/* Save the maximum register pressure */
|
2024-07-12 16:55:33 -07:00
|
|
|
uint32_t this_pressure = brw_compute_max_register_pressure(s);
|
intel/fs: Pick the lowest register pressure schedule when spilling
We try various pre-RA scheduler modes and see if any of them allow
us to register allocate without spilling. If all of them spill,
however, we left it on the last mode: LIFO. This is unfortunately
sometimes significantly worse than other modes (such as "none").
This patch makes us instead select the pre-RA scheduling mode that
gives the lowest register pressure estimate, if none of them manage
to avoid spilling. The hope is that this scheduling will spill the
least out of all of them.
fossil-db stats (on Alchemist) speak for themselves:
Totals:
Instrs: 197297092 -> 195326552 (-1.00%); split: -1.02%, +0.03%
Cycles: 14291286956 -> 14303502596 (+0.09%); split: -0.55%, +0.64%
Spill count: 190886 -> 129204 (-32.31%); split: -33.01%, +0.70%
Fill count: 361408 -> 225038 (-37.73%); split: -39.17%, +1.43%
Scratch Memory Size: 12935168 -> 10868736 (-15.98%); split: -16.08%, +0.10%
Totals from 1791 (0.27% of 668386) affected shaders:
Instrs: 7628929 -> 5658389 (-25.83%); split: -26.50%, +0.67%
Cycles: 719326691 -> 731542331 (+1.70%); split: -10.95%, +12.65%
Spill count: 110627 -> 48945 (-55.76%); split: -56.96%, +1.20%
Fill count: 221560 -> 85190 (-61.55%); split: -63.89%, +2.34%
Scratch Memory Size: 4471808 -> 2405376 (-46.21%); split: -46.51%, +0.30%
Improves performance when using XeSS in Cyberpunk 2077 by 90% on A770.
Improves performance of Borderlands 3 by 1.54% on A770.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24707>
2023-08-14 19:32:25 -07:00
|
|
|
|
|
|
|
|
if (0) {
|
|
|
|
|
fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
|
|
|
|
|
scheduler_mode_name[sched_mode], this_pressure);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this_pressure < best_register_pressure) {
|
|
|
|
|
best_register_pressure = this_pressure;
|
|
|
|
|
best_sched = sched_mode;
|
|
|
|
|
delete[] best_pressure_order;
|
2024-07-12 16:55:33 -07:00
|
|
|
best_pressure_order = save_instruction_order(s.cfg);
|
intel/fs: Pick the lowest register pressure schedule when spilling
We try various pre-RA scheduler modes and see if any of them allow
us to register allocate without spilling. If all of them spill,
however, we left it on the last mode: LIFO. This is unfortunately
sometimes significantly worse than other modes (such as "none").
This patch makes us instead select the pre-RA scheduling mode that
gives the lowest register pressure estimate, if none of them manage
to avoid spilling. The hope is that this scheduling will spill the
least out of all of them.
fossil-db stats (on Alchemist) speak for themselves:
Totals:
Instrs: 197297092 -> 195326552 (-1.00%); split: -1.02%, +0.03%
Cycles: 14291286956 -> 14303502596 (+0.09%); split: -0.55%, +0.64%
Spill count: 190886 -> 129204 (-32.31%); split: -33.01%, +0.70%
Fill count: 361408 -> 225038 (-37.73%); split: -39.17%, +1.43%
Scratch Memory Size: 12935168 -> 10868736 (-15.98%); split: -16.08%, +0.10%
Totals from 1791 (0.27% of 668386) affected shaders:
Instrs: 7628929 -> 5658389 (-25.83%); split: -26.50%, +0.67%
Cycles: 719326691 -> 731542331 (+1.70%); split: -10.95%, +12.65%
Spill count: 110627 -> 48945 (-55.76%); split: -56.96%, +1.20%
Fill count: 221560 -> 85190 (-61.55%); split: -63.89%, +2.34%
Scratch Memory Size: 4471808 -> 2405376 (-46.21%); split: -46.51%, +0.30%
Improves performance when using XeSS in Cyberpunk 2077 by 90% on A770.
Improves performance of Borderlands 3 by 1.54% on A770.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24707>
2023-08-14 19:32:25 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Reset back to the original order before trying the next mode */
|
2024-07-12 16:55:33 -07:00
|
|
|
restore_instruction_order(s.cfg, orig_order);
|
|
|
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
intel/fs: Pick the lowest register pressure schedule when spilling
We try various pre-RA scheduler modes and see if any of them allow
us to register allocate without spilling. If all of them spill,
however, we left it on the last mode: LIFO. This is unfortunately
sometimes significantly worse than other modes (such as "none").
This patch makes us instead select the pre-RA scheduling mode that
gives the lowest register pressure estimate, if none of them manage
to avoid spilling. The hope is that this scheduling will spill the
least out of all of them.
fossil-db stats (on Alchemist) speak for themselves:
Totals:
Instrs: 197297092 -> 195326552 (-1.00%); split: -1.02%, +0.03%
Cycles: 14291286956 -> 14303502596 (+0.09%); split: -0.55%, +0.64%
Spill count: 190886 -> 129204 (-32.31%); split: -33.01%, +0.70%
Fill count: 361408 -> 225038 (-37.73%); split: -39.17%, +1.43%
Scratch Memory Size: 12935168 -> 10868736 (-15.98%); split: -16.08%, +0.10%
Totals from 1791 (0.27% of 668386) affected shaders:
Instrs: 7628929 -> 5658389 (-25.83%); split: -26.50%, +0.67%
Cycles: 719326691 -> 731542331 (+1.70%); split: -10.95%, +12.65%
Spill count: 110627 -> 48945 (-55.76%); split: -56.96%, +1.20%
Fill count: 221560 -> 85190 (-61.55%); split: -63.89%, +2.34%
Scratch Memory Size: 4471808 -> 2405376 (-46.21%); split: -46.51%, +0.30%
Improves performance when using XeSS in Cyberpunk 2077 by 90% on A770.
Improves performance of Borderlands 3 by 1.54% on A770.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24707>
2023-08-14 19:32:25 -07:00
|
|
|
}
|
|
|
|
|
|
2023-10-20 10:32:54 -07:00
|
|
|
ralloc_free(scheduler_ctx);
|
|
|
|
|
|
intel/fs: Pick the lowest register pressure schedule when spilling
We try various pre-RA scheduler modes and see if any of them allow
us to register allocate without spilling. If all of them spill,
however, we left it on the last mode: LIFO. This is unfortunately
sometimes significantly worse than other modes (such as "none").
This patch makes us instead select the pre-RA scheduling mode that
gives the lowest register pressure estimate, if none of them manage
to avoid spilling. The hope is that this scheduling will spill the
least out of all of them.
fossil-db stats (on Alchemist) speak for themselves:
Totals:
Instrs: 197297092 -> 195326552 (-1.00%); split: -1.02%, +0.03%
Cycles: 14291286956 -> 14303502596 (+0.09%); split: -0.55%, +0.64%
Spill count: 190886 -> 129204 (-32.31%); split: -33.01%, +0.70%
Fill count: 361408 -> 225038 (-37.73%); split: -39.17%, +1.43%
Scratch Memory Size: 12935168 -> 10868736 (-15.98%); split: -16.08%, +0.10%
Totals from 1791 (0.27% of 668386) affected shaders:
Instrs: 7628929 -> 5658389 (-25.83%); split: -26.50%, +0.67%
Cycles: 719326691 -> 731542331 (+1.70%); split: -10.95%, +12.65%
Spill count: 110627 -> 48945 (-55.76%); split: -56.96%, +1.20%
Fill count: 221560 -> 85190 (-61.55%); split: -63.89%, +2.34%
Scratch Memory Size: 4471808 -> 2405376 (-46.21%); split: -46.51%, +0.30%
Improves performance when using XeSS in Cyberpunk 2077 by 90% on A770.
Improves performance of Borderlands 3 by 1.54% on A770.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24707>
2023-08-14 19:32:25 -07:00
|
|
|
if (!allocated) {
|
|
|
|
|
if (0) {
|
|
|
|
|
fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
|
|
|
|
|
scheduler_mode_name[best_sched]);
|
|
|
|
|
}
|
2024-07-12 16:55:33 -07:00
|
|
|
restore_instruction_order(s.cfg, best_pressure_order);
|
|
|
|
|
s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
|
intel/fs: Pick the lowest register pressure schedule when spilling
We try various pre-RA scheduler modes and see if any of them allow
us to register allocate without spilling. If all of them spill,
however, we left it on the last mode: LIFO. This is unfortunately
sometimes significantly worse than other modes (such as "none").
This patch makes us instead select the pre-RA scheduling mode that
gives the lowest register pressure estimate, if none of them manage
to avoid spilling. The hope is that this scheduling will spill the
least out of all of them.
fossil-db stats (on Alchemist) speak for themselves:
Totals:
Instrs: 197297092 -> 195326552 (-1.00%); split: -1.02%, +0.03%
Cycles: 14291286956 -> 14303502596 (+0.09%); split: -0.55%, +0.64%
Spill count: 190886 -> 129204 (-32.31%); split: -33.01%, +0.70%
Fill count: 361408 -> 225038 (-37.73%); split: -39.17%, +1.43%
Scratch Memory Size: 12935168 -> 10868736 (-15.98%); split: -16.08%, +0.10%
Totals from 1791 (0.27% of 668386) affected shaders:
Instrs: 7628929 -> 5658389 (-25.83%); split: -26.50%, +0.67%
Cycles: 719326691 -> 731542331 (+1.70%); split: -10.95%, +12.65%
Spill count: 110627 -> 48945 (-55.76%); split: -56.96%, +1.20%
Fill count: 221560 -> 85190 (-61.55%); split: -63.89%, +2.34%
Scratch Memory Size: 4471808 -> 2405376 (-46.21%); split: -46.51%, +0.30%
Improves performance when using XeSS in Cyberpunk 2077 by 90% on A770.
Improves performance of Borderlands 3 by 1.54% on A770.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24707>
2023-08-14 19:32:25 -07:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
allocated = brw_assign_regs(s, allow_spilling, spill_all);
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
|
|
|
|
|
2023-08-23 02:19:06 -07:00
|
|
|
delete[] orig_order;
|
intel/fs: Pick the lowest register pressure schedule when spilling
We try various pre-RA scheduler modes and see if any of them allow
us to register allocate without spilling. If all of them spill,
however, we left it on the last mode: LIFO. This is unfortunately
sometimes significantly worse than other modes (such as "none").
This patch makes us instead select the pre-RA scheduling mode that
gives the lowest register pressure estimate, if none of them manage
to avoid spilling. The hope is that this scheduling will spill the
least out of all of them.
fossil-db stats (on Alchemist) speak for themselves:
Totals:
Instrs: 197297092 -> 195326552 (-1.00%); split: -1.02%, +0.03%
Cycles: 14291286956 -> 14303502596 (+0.09%); split: -0.55%, +0.64%
Spill count: 190886 -> 129204 (-32.31%); split: -33.01%, +0.70%
Fill count: 361408 -> 225038 (-37.73%); split: -39.17%, +1.43%
Scratch Memory Size: 12935168 -> 10868736 (-15.98%); split: -16.08%, +0.10%
Totals from 1791 (0.27% of 668386) affected shaders:
Instrs: 7628929 -> 5658389 (-25.83%); split: -26.50%, +0.67%
Cycles: 719326691 -> 731542331 (+1.70%); split: -10.95%, +12.65%
Spill count: 110627 -> 48945 (-55.76%); split: -56.96%, +1.20%
Fill count: 221560 -> 85190 (-61.55%); split: -63.89%, +2.34%
Scratch Memory Size: 4471808 -> 2405376 (-46.21%); split: -46.51%, +0.30%
Improves performance when using XeSS in Cyberpunk 2077 by 90% on A770.
Improves performance of Borderlands 3 by 1.54% on A770.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24707>
2023-08-14 19:32:25 -07:00
|
|
|
delete[] best_pressure_order;
|
2023-08-23 02:19:06 -07:00
|
|
|
|
2019-05-09 14:44:16 -05:00
|
|
|
if (!allocated) {
|
2024-07-12 16:55:33 -07:00
|
|
|
s.fail("Failure to register allocate. Reduce number of "
|
2020-05-19 14:37:44 -07:00
|
|
|
"live scalar values to avoid this.");
|
2024-07-12 16:55:33 -07:00
|
|
|
} else if (s.spilled_any_registers) {
|
|
|
|
|
brw_shader_perf_log(s.compiler, s.log_data,
|
2021-07-29 14:27:57 -07:00
|
|
|
"%s shader triggered register spilling. "
|
|
|
|
|
"Try reducing the number of live scalar "
|
|
|
|
|
"values to improve performance.\n",
|
2024-07-12 16:55:33 -07:00
|
|
|
_mesa_shader_stage_to_string(s.stage));
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
if (s.failed)
|
2014-11-13 16:28:19 -08:00
|
|
|
return;
|
|
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
int pass_num = 0;
|
|
|
|
|
|
|
|
|
|
s.debug_optimizer(nir, "post_ra_alloc", 96, pass_num++);
|
2024-03-14 16:28:56 +02:00
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
brw_opt_bank_conflicts(s);
|
2017-06-15 15:23:57 -07:00
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
s.debug_optimizer(nir, "bank_conflict", 96, pass_num++);
|
2024-03-14 16:28:56 +02:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_schedule_instructions_post_ra(s);
|
2014-11-13 16:28:19 -08:00
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, pass_num++);
|
2024-03-14 16:28:56 +02:00
|
|
|
|
2024-04-04 16:03:34 -07:00
|
|
|
/* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
|
|
|
|
|
* of part of assign_regs since both bank conflicts optimization and post
|
|
|
|
|
* RA scheduling take advantage of distinguishing references to registers
|
|
|
|
|
* that were allocated from references that were already fixed.
|
|
|
|
|
*
|
|
|
|
|
* TODO: Change the passes above, then move this lowering to be part of
|
|
|
|
|
* assign_regs.
|
|
|
|
|
*/
|
2024-12-06 11:37:57 -08:00
|
|
|
brw_lower_vgrfs_to_fixed_grfs(s);
|
2024-04-04 16:03:34 -07:00
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, pass_num++);
|
2016-06-09 18:13:26 -07:00
|
|
|
|
2024-08-27 10:16:11 -07:00
|
|
|
brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_REGALLOC);
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
if (s.last_scratch > 0) {
|
2016-06-09 18:13:26 -07:00
|
|
|
/* We currently only support up to 2MB of scratch space. If we
|
|
|
|
|
* need to support more eventually, the documentation suggests
|
|
|
|
|
* that we could allocate a larger buffer, and partition it out
|
|
|
|
|
* ourselves. We'd just have to undo the hardware's address
|
|
|
|
|
* calculation by subtracting (FFTID * Per Thread Scratch Space)
|
|
|
|
|
* and then add FFTID * (Larger Per Thread Scratch Space).
|
|
|
|
|
*
|
|
|
|
|
* See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
|
|
|
|
|
* Thread Group Tracking > Local Memory/Scratch Space.
|
|
|
|
|
*/
|
2024-07-12 16:55:33 -07:00
|
|
|
if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
|
2024-07-19 10:55:59 -07:00
|
|
|
/* Take the max of any previously compiled variant of the shader. In the
|
|
|
|
|
* case of bindless shaders with return parts, this will also take the
|
|
|
|
|
* max of all parts.
|
|
|
|
|
*/
|
2024-07-12 16:55:33 -07:00
|
|
|
s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
|
|
|
|
|
s.prog_data->total_scratch);
|
2024-07-19 10:55:59 -07:00
|
|
|
} else {
|
2024-07-12 16:55:33 -07:00
|
|
|
s.fail("Scratch space required is larger than supported");
|
2024-07-19 10:55:59 -07:00
|
|
|
}
|
2016-06-09 16:56:31 -07:00
|
|
|
}
|
2018-11-09 14:13:37 -08:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
if (s.failed)
|
2024-07-19 10:55:59 -07:00
|
|
|
return;
|
|
|
|
|
|
2024-12-06 11:37:57 -08:00
|
|
|
brw_lower_scoreboard(s);
|
2024-11-20 16:18:40 -08:00
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
s.debug_optimizer(nir, "scoreboard", 96, pass_num++);
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
|
|
|
|
|
2020-03-20 21:02:06 -07:00
|
|
|
unsigned
|
|
|
|
|
brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
|
|
|
|
|
unsigned threads)
|
|
|
|
|
{
|
|
|
|
|
assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
|
|
|
|
|
assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
|
|
|
|
|
return cs_prog_data->push.per_thread.size * threads +
|
|
|
|
|
cs_prog_data->push.cross_thread.size;
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-01 16:02:50 -08:00
|
|
|
struct intel_cs_dispatch_info
|
2021-04-28 10:54:53 -07:00
|
|
|
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
|
|
|
|
|
const struct brw_cs_prog_data *prog_data,
|
|
|
|
|
const unsigned *override_local_size)
|
|
|
|
|
{
|
2024-02-01 16:02:50 -08:00
|
|
|
struct intel_cs_dispatch_info info = {};
|
2021-04-28 10:54:53 -07:00
|
|
|
|
|
|
|
|
const unsigned *sizes =
|
|
|
|
|
override_local_size ? override_local_size :
|
|
|
|
|
prog_data->local_size;
|
|
|
|
|
|
2022-11-08 01:24:36 -08:00
|
|
|
const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
|
2021-10-11 07:49:40 -07:00
|
|
|
assert(simd >= 0 && simd < 3);
|
|
|
|
|
|
2021-04-28 10:54:53 -07:00
|
|
|
info.group_size = sizes[0] * sizes[1] * sizes[2];
|
2021-10-11 07:49:40 -07:00
|
|
|
info.simd_size = 8u << simd;
|
2021-04-28 10:54:53 -07:00
|
|
|
info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
|
|
|
|
|
|
|
|
|
|
const uint32_t remainder = info.group_size & (info.simd_size - 1);
|
|
|
|
|
if (remainder > 0)
|
|
|
|
|
info.right_mask = ~0u >> (32 - remainder);
|
|
|
|
|
else
|
|
|
|
|
info.right_mask = ~0u >> (32 - info.simd_size);
|
|
|
|
|
|
|
|
|
|
return info;
|
|
|
|
|
}
|
|
|
|
|
|
2024-08-27 10:16:11 -07:00
|
|
|
void
|
|
|
|
|
brw_shader_phase_update(fs_visitor &s, enum brw_shader_phase phase)
|
|
|
|
|
{
|
|
|
|
|
assert(phase == s.phase + 1);
|
|
|
|
|
s.phase = phase;
|
2024-12-29 17:39:39 -08:00
|
|
|
brw_validate(s);
|
2024-08-27 10:16:11 -07:00
|
|
|
}
|
|
|
|
|
|
2023-06-20 14:42:02 -07:00
|
|
|
bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
|
|
|
|
|
{
|
|
|
|
|
return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
|
2023-06-21 07:51:00 -07:00
|
|
|
}
|
2023-11-21 07:49:02 -08:00
|
|
|
|
|
|
|
|
namespace brw {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg
|
2023-11-21 07:49:02 -08:00
|
|
|
fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
|
2022-08-03 16:47:52 -07:00
|
|
|
brw_reg_type type, unsigned n)
|
2023-11-21 07:49:02 -08:00
|
|
|
{
|
|
|
|
|
if (!regs[0])
|
2024-06-18 23:42:59 -07:00
|
|
|
return brw_reg();
|
2023-11-21 07:49:02 -08:00
|
|
|
|
|
|
|
|
if (bld.dispatch_width() > 16) {
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg tmp = bld.vgrf(type, n);
|
2023-11-21 07:49:02 -08:00
|
|
|
const brw::fs_builder hbld = bld.exec_all().group(16, 0);
|
|
|
|
|
const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg *const components = new brw_reg[m * n];
|
2023-11-21 07:49:02 -08:00
|
|
|
|
2022-08-03 16:47:52 -07:00
|
|
|
for (unsigned c = 0; c < n; c++) {
|
|
|
|
|
for (unsigned g = 0; g < m; g++)
|
|
|
|
|
components[c * m + g] =
|
|
|
|
|
offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c);
|
|
|
|
|
}
|
2023-11-21 07:49:02 -08:00
|
|
|
|
2022-08-03 16:47:52 -07:00
|
|
|
hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
|
2023-11-21 07:49:02 -08:00
|
|
|
|
2022-08-03 16:47:52 -07:00
|
|
|
delete[] components;
|
2023-11-21 07:49:02 -08:00
|
|
|
return tmp;
|
|
|
|
|
|
|
|
|
|
} else {
|
2024-06-18 23:42:59 -07:00
|
|
|
return brw_reg(retype(brw_vec8_grf(regs[0], 0), type));
|
2023-11-21 07:49:02 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg
|
2023-11-21 07:49:02 -08:00
|
|
|
fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
|
|
|
|
|
{
|
|
|
|
|
if (!regs[0])
|
2024-06-18 23:42:59 -07:00
|
|
|
return brw_reg();
|
2023-12-01 21:51:19 -08:00
|
|
|
else if (bld.shader->devinfo->ver >= 20)
|
2024-04-20 17:08:02 -07:00
|
|
|
return fetch_payload_reg(bld, regs, BRW_TYPE_F, 2);
|
2023-11-21 07:49:02 -08:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg tmp = bld.vgrf(BRW_TYPE_F, 2);
|
2023-11-21 07:49:02 -08:00
|
|
|
const brw::fs_builder hbld = bld.exec_all().group(8, 0);
|
|
|
|
|
const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg *const components = new brw_reg[2 * m];
|
2023-11-21 07:49:02 -08:00
|
|
|
|
|
|
|
|
for (unsigned c = 0; c < 2; c++) {
|
|
|
|
|
for (unsigned g = 0; g < m; g++)
|
|
|
|
|
components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
|
|
|
|
|
hbld, c + 2 * (g % 2));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
|
|
|
|
|
|
|
|
|
|
delete[] components;
|
|
|
|
|
return tmp;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
check_dynamic_msaa_flag(const fs_builder &bld,
|
|
|
|
|
const struct brw_wm_prog_data *wm_prog_data,
|
2024-02-01 13:17:42 -08:00
|
|
|
enum intel_msaa_flags flag)
|
2023-11-21 07:49:02 -08:00
|
|
|
{
|
|
|
|
|
fs_inst *inst = bld.AND(bld.null_reg_ud(),
|
|
|
|
|
dynamic_msaa_flags(wm_prog_data),
|
|
|
|
|
brw_imm_ud(flag));
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NZ;
|
|
|
|
|
}
|
|
|
|
|
}
|