/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_builder.h"
#include "brw_fs_live_variables.h"
#include "brw_nir.h"
#include "brw_cfg.h"
#include "brw_private.h"
#include "intel_nir.h"
#include "shader_enums.h"
#include "dev/intel_debug.h"
#include "dev/intel_wa.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_math.h"

using namespace brw;

static void
initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources);

void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
              const brw_reg *src, unsigned sources)
{
   memset((void*)this, 0, sizeof(*this));

   initialize_sources(this, src, sources);

   for (unsigned i = 0; i < sources; i++)
      this->src[i] = src[i];

   this->opcode = opcode;
   this->dst = dst;
   this->exec_size = exec_size;

   assert(dst.file != IMM && dst.file != UNIFORM);

   assert(this->exec_size != 0);

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case VGRF:
   case ARF:
   case FIXED_GRF:
   case ATTR:
      this->size_written = dst.component_size(exec_size);
      break;
   case BAD_FILE:
      this->size_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   }

   this->writes_accumulator = false;
}
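
/* Worked example for illustration: a SIMD16 MOV to a 32-bit VGRF destination
 * with stride 1 gets size_written = dst.component_size(16) = 16 * 1 * 4 = 64
 * bytes, i.e. two 32-byte registers.
 */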

fs_inst::fs_inst()
{
   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   init(opcode, exec_size, reg_undef, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst)
{
   init(opcode, exec_size, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
                 const brw_reg &src0)
{
   const brw_reg src[1] = { src0 };
   init(opcode, exec_size, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
                 const brw_reg &src0, const brw_reg &src1)
{
   const brw_reg src[2] = { src0, src1 };
   init(opcode, exec_size, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const brw_reg &dst,
                 const brw_reg &src0, const brw_reg &src1, const brw_reg &src2)
{
   const brw_reg src[3] = { src0, src1, src2 };
   init(opcode, exec_size, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const brw_reg &dst,
                 const brw_reg src[], unsigned sources)
{
   init(opcode, exec_width, dst, src, sources);
}

fs_inst::fs_inst(const fs_inst &that)
{
   memcpy((void*)this, &that, sizeof(that));
   initialize_sources(this, that.src, that.sources);
}

fs_inst::~fs_inst()
{
   if (this->src != this->builtin_src)
      delete[] this->src;
}

static void
initialize_sources(fs_inst *inst, const brw_reg src[], uint8_t num_sources)
{
   if (num_sources > ARRAY_SIZE(inst->builtin_src))
      inst->src = new brw_reg[num_sources];
   else
      inst->src = inst->builtin_src;

   for (unsigned i = 0; i < num_sources; i++)
      inst->src[i] = src[i];

   inst->sources = num_sources;
}

void
fs_inst::resize_sources(uint8_t num_sources)
{
   if (this->sources == num_sources)
      return;

   brw_reg *old_src = this->src;
   brw_reg *new_src;

   const unsigned builtin_size = ARRAY_SIZE(this->builtin_src);

   if (old_src == this->builtin_src) {
      if (num_sources > builtin_size) {
         new_src = new brw_reg[num_sources];
         for (unsigned i = 0; i < this->sources; i++)
            new_src[i] = old_src[i];
      } else {
         new_src = old_src;
      }
   } else {
      if (num_sources <= builtin_size) {
         new_src = this->builtin_src;
         assert(this->sources > num_sources);
         for (unsigned i = 0; i < num_sources; i++)
            new_src[i] = old_src[i];
      } else if (num_sources < this->sources) {
         new_src = old_src;
      } else {
         new_src = new brw_reg[num_sources];
         for (unsigned i = 0; i < this->sources; i++)
            new_src[i] = old_src[i];
      }

      if (old_src != new_src)
         delete[] old_src;
   }

   this->sources = num_sources;
   this->src = new_src;
}
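
/* Note for illustration: sources are stored inline in builtin_src whenever
 * they fit, so only instructions with more sources than the inline array
 * (e.g. a wide LOAD_PAYLOAD) pay for a heap allocation, which
 * resize_sources() and the destructor release again.
 */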

bool
fs_inst::is_send_from_grf() const
{
   switch (opcode) {
   case SHADER_OPCODE_SEND:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_INTERLOCK:
   case SHADER_OPCODE_MEMORY_FENCE:
   case SHADER_OPCODE_BARRIER:
      return true;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return src[1].file == VGRF;
   default:
      return false;
   }
}

bool
fs_inst::is_control_source(unsigned arg) const
{
   switch (opcode) {
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return arg == 0;

   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_QUAD_SWIZZLE:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return arg == 1;

   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
      return arg == 1 || arg == 2;

   case SHADER_OPCODE_SEND:
      return arg == 0 || arg == 1;

   case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
   case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
   case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
      return arg != MEMORY_LOGICAL_BINDING &&
             arg != MEMORY_LOGICAL_ADDRESS &&
             arg != MEMORY_LOGICAL_DATA0 &&
             arg != MEMORY_LOGICAL_DATA1;

   case SHADER_OPCODE_QUAD_SWAP:
   case SHADER_OPCODE_INCLUSIVE_SCAN:
   case SHADER_OPCODE_EXCLUSIVE_SCAN:
   case SHADER_OPCODE_VOTE_ANY:
   case SHADER_OPCODE_VOTE_ALL:
   case SHADER_OPCODE_REDUCE:
      return arg != 0;

   default:
      return false;
   }
}

bool
fs_inst::is_payload(unsigned arg) const
{
   switch (opcode) {
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case SHADER_OPCODE_INTERLOCK:
   case SHADER_OPCODE_MEMORY_FENCE:
   case SHADER_OPCODE_BARRIER:
      return arg == 0;

   case SHADER_OPCODE_SEND:
      return arg == 2 || arg == 3;

   default:
      return false;
   }
}

bool
fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
{
   if (is_send_from_grf())
      return false;

   /* From TGL PRM Vol 2a Pg. 1053 and Pg. 1069 MAD and MUL Instructions:
    *
    * "When multiplying a DW and any lower precision integer, source modifier
    *  is not supported."
    */
   if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
                              opcode == BRW_OPCODE_MAD)) {
      const brw_reg_type exec_type = get_exec_type(this);
      const unsigned min_brw_type_size_bytes = opcode == BRW_OPCODE_MAD ?
         MIN2(brw_type_size_bytes(src[1].type), brw_type_size_bytes(src[2].type)) :
         MIN2(brw_type_size_bytes(src[0].type), brw_type_size_bytes(src[1].type));

      if (brw_type_is_int(exec_type) &&
          brw_type_size_bytes(exec_type) >= 4 &&
          brw_type_size_bytes(exec_type) != min_brw_type_size_bytes)
         return false;
   }

   switch (opcode) {
   case BRW_OPCODE_ADDC:
   case BRW_OPCODE_BFE:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_BFI2:
   case BRW_OPCODE_BFREV:
   case BRW_OPCODE_CBIT:
   case BRW_OPCODE_FBH:
   case BRW_OPCODE_FBL:
   case BRW_OPCODE_ROL:
   case BRW_OPCODE_ROR:
   case BRW_OPCODE_SUBB:
   case BRW_OPCODE_DP4A:
   case BRW_OPCODE_DPAS:
   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_REDUCE:
   case SHADER_OPCODE_INCLUSIVE_SCAN:
   case SHADER_OPCODE_EXCLUSIVE_SCAN:
   case SHADER_OPCODE_VOTE_ANY:
   case SHADER_OPCODE_VOTE_ALL:
   case SHADER_OPCODE_VOTE_EQUAL:
   case SHADER_OPCODE_BALLOT:
   case SHADER_OPCODE_QUAD_SWAP:
   case SHADER_OPCODE_READ_FROM_LIVE_CHANNEL:
   case SHADER_OPCODE_READ_FROM_CHANNEL:
      return false;
   default:
      return true;
   }
}
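
/* Example for illustration: on Gfx12+ a mul(8) with a :d destination, a :d
 * source and a :w source has a 4-byte integer execution type but a 2-byte
 * minimum source size, so the check above disallows source modifiers there.
 */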

bool
fs_inst::can_do_cmod() const
{
   switch (opcode) {
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_ADD3:
   case BRW_OPCODE_ADDC:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_AVG:
   case BRW_OPCODE_CMP:
   case BRW_OPCODE_CMPN:
   case BRW_OPCODE_DP2:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DPH:
   case BRW_OPCODE_FRC:
   case BRW_OPCODE_LINE:
   case BRW_OPCODE_LRP:
   case BRW_OPCODE_LZD:
   case BRW_OPCODE_MAC:
   case BRW_OPCODE_MACH:
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_MUL:
   case BRW_OPCODE_NOT:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_PLN:
   case BRW_OPCODE_RNDD:
   case BRW_OPCODE_RNDE:
   case BRW_OPCODE_RNDU:
   case BRW_OPCODE_RNDZ:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SUBB:
   case BRW_OPCODE_XOR:
      break;
   default:
      return false;
   }

   /* The accumulator result appears to get used for the conditional modifier
    * generation. When negating a UD value, there is a 33rd bit generated for
    * the sign in the accumulator value, so now you can't check, for example,
    * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
    */
   for (unsigned i = 0; i < sources; i++) {
      if (brw_type_is_uint(src[i].type) && src[i].negate)
         return false;
   }

   if (dst.file == ARF && dst.nr == BRW_ARF_SCALAR && src[0].file == IMM)
      return false;

   return true;
}

bool
fs_inst::can_change_types() const
{
   return dst.type == src[0].type &&
          !src[0].abs && !src[0].negate && !saturate && src[0].file != ATTR &&
          (opcode == BRW_OPCODE_MOV ||
           (opcode == SHADER_OPCODE_LOAD_PAYLOAD && sources == 1) ||
           (opcode == BRW_OPCODE_SEL &&
            dst.type == src[1].type &&
            predicate != BRW_PREDICATE_NONE &&
            !src[1].abs && !src[1].negate && src[1].file != ATTR));
}

bool
brw_reg::equals(const brw_reg &r) const
{
   return brw_regs_equal(this, &r);
}

bool
brw_reg::negative_equals(const brw_reg &r) const
{
   return brw_regs_negative_equal(this, &r);
}

bool
brw_reg::is_contiguous() const
{
   switch (file) {
   case ARF:
   case FIXED_GRF:
      return hstride == BRW_HORIZONTAL_STRIDE_1 &&
             vstride == width + hstride;
   case VGRF:
   case ATTR:
      return stride == 1;
   case UNIFORM:
   case IMM:
   case BAD_FILE:
      return true;
   }

   unreachable("Invalid register file");
}

unsigned
brw_reg::component_size(unsigned width) const
{
   if (file == ARF || file == FIXED_GRF) {
      const unsigned w = MIN2(width, 1u << this->width);
      const unsigned h = width >> this->width;
      const unsigned vs = vstride ? 1 << (vstride - 1) : 0;
      const unsigned hs = hstride ? 1 << (hstride - 1) : 0;
      assert(w > 0);
      /* Note this rounds up to next horizontal stride to be consistent with
       * the VGRF case below.
       */
      return ((MAX2(1, h) - 1) * vs + MAX2(w * hs, 1)) * brw_type_size_bytes(type);
   } else {
      return MAX2(width * stride, 1) * brw_type_size_bytes(type);
   }
}
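
/* Worked example for illustration: reading a <8;8,1>:F FIXED_GRF region at
 * width 16 gives w = 8, h = 2, vs = 8, hs = 1, so component_size(16) =
 * ((2 - 1) * 8 + 8 * 1) * 4 = 64 bytes, i.e. two full registers.
 */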

void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
                         dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);

   this->fail_msg = msg;

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "%s", msg);
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}

/**
 * Mark this program as impossible to compile with dispatch width greater
 * than n.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16+ mode, so the compiler can skip the
 * SIMD16+ compile altogether.
 *
 * During a compile of dispatch width greater than n (if one happens anyway),
 * this just calls fail().
 */
void
fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
{
   if (dispatch_width > n) {
      fail("%s", msg);
   } else {
      max_dispatch_width = MIN2(max_dispatch_width, n);
      brw_shader_perf_log(compiler, log_data,
                          "Shader dispatch width limited to SIMD%d: %s\n",
                          n, msg);
   }
}
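
/* Usage note for illustration: a SIMD8 compile that hits a SIMD16-hostile
 * case can call limit_dispatch_width(8, "reason") so the wider variants are
 * never attempted, while the same call made during a SIMD16 compile simply
 * fails that compile.
 */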

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write() const
{
   if (this->predicate && !this->predicate_trivial &&
       this->opcode != BRW_OPCODE_SEL)
      return true;

   if (!this->dst.is_contiguous())
      return true;

   if (this->dst.offset % REG_SIZE != 0)
      return true;

   return this->size_written % REG_SIZE != 0;
}
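
/* Example for illustration: an unpredicated SIMD8 MOV to a contiguous 16-bit
 * destination at offset 0 writes 8 * 2 = 16 bytes, and 16 % REG_SIZE != 0,
 * so it counts as a partial write; the equivalent SIMD16 MOV to a 32-bit
 * destination writes a full 64 bytes and does not.
 */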

unsigned
fs_inst::components_read(unsigned i) const
{
   /* Return zero if the source is not present. */
   if (src[i].file == BAD_FILE)
      return 0;

   switch (opcode) {
   case BRW_OPCODE_PLN:
      return i == 0 ? 1 : 2;

   case FS_OPCODE_PIXEL_X:
   case FS_OPCODE_PIXEL_Y:
      assert(i < 2);
      if (i == 0)
         return 2;
      else
         return 1;

   case FS_OPCODE_FB_WRITE_LOGICAL:
      assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
      /* First/second FB write color. */
      if (i < 2)
         return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
      else
         return 1;

   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXD_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_TG4_BIAS_LOGICAL:
   case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
      assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
             src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
             src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
      /* Texture coordinates. */
      if (i == TEX_LOGICAL_SRC_COORDINATE)
         return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
      /* Texture derivatives. */
      else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
               opcode == SHADER_OPCODE_TXD_LOGICAL)
         return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
      /* Texture offset. */
      else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
         return 2;
      /* MCS */
      else if (i == TEX_LOGICAL_SRC_MCS) {
         if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
            return 2;
         else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
            return 4;
         else
            return 1;
      } else
         return 1;

   case SHADER_OPCODE_MEMORY_LOAD_LOGICAL:
      if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
         return 0;
      /* fallthrough */
   case SHADER_OPCODE_MEMORY_STORE_LOGICAL:
      if (i == MEMORY_LOGICAL_DATA1)
         return 0;
      /* fallthrough */
   case SHADER_OPCODE_MEMORY_ATOMIC_LOGICAL:
      if (i == MEMORY_LOGICAL_DATA0 || i == MEMORY_LOGICAL_DATA1)
         return src[MEMORY_LOGICAL_COMPONENTS].ud;
      else if (i == MEMORY_LOGICAL_ADDRESS)
         return src[MEMORY_LOGICAL_COORD_COMPONENTS].ud;
      else
         return 1;

   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return (i == 0 ? 2 : 1);

   case SHADER_OPCODE_URB_WRITE_LOGICAL:
      assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);

      if (i == URB_LOGICAL_SRC_DATA)
         return src[URB_LOGICAL_SRC_COMPONENTS].ud;
      else
         return 1;

   case BRW_OPCODE_DPAS:
      unreachable("Do not use components_read() for DPAS.");

   default:
      return 1;
   }
}
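
/* Example for illustration: for SHADER_OPCODE_TXD_LOGICAL sampling a 2D
 * surface with explicit derivatives, this returns 2 for
 * TEX_LOGICAL_SRC_COORDINATE and 2 for each of TEX_LOGICAL_SRC_LOD and
 * TEX_LOGICAL_SRC_LOD2, taken from the COORD_COMPONENTS and GRAD_COMPONENTS
 * immediates.
 */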

unsigned
fs_inst::size_read(const struct intel_device_info *devinfo, int arg) const
{
   switch (opcode) {
   case SHADER_OPCODE_SEND:
      if (arg == 2) {
         return mlen * REG_SIZE;
      } else if (arg == 3) {
         return ex_mlen * REG_SIZE;
      }
      break;

   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      if (arg == 0)
         return mlen * REG_SIZE;
      break;

   case BRW_OPCODE_PLN:
      if (arg == 0)
         return 16;
      break;

   case SHADER_OPCODE_LOAD_PAYLOAD:
      if (arg < this->header_size)
         return retype(src[arg], BRW_TYPE_UD).component_size(8);
      break;

   case SHADER_OPCODE_BARRIER:
      return REG_SIZE;

   case SHADER_OPCODE_MOV_INDIRECT:
      if (arg == 0) {
         assert(src[2].file == IMM);
         return src[2].ud;
      }
      break;

   case BRW_OPCODE_DPAS: {
      /* This is a little bit sketchy. There's no way to get at devinfo from
       * here, so the regular reg_unit() cannot be used. However, on
       * reg_unit() == 1 platforms, DPAS exec_size must be 8, and on known
       * reg_unit() == 2 platforms, DPAS exec_size must be 16. This is not a
       * coincidence, so this isn't so bad.
       */
      const unsigned reg_unit = this->exec_size / 8;

      switch (arg) {
      case 0:
         if (src[0].type == BRW_TYPE_HF) {
            return rcount * reg_unit * REG_SIZE / 2;
         } else {
            return rcount * reg_unit * REG_SIZE;
         }
      case 1:
         return sdepth * reg_unit * REG_SIZE;
      case 2:
         /* This is simpler than the formula described in the Bspec, but it
          * covers all of the cases that we support. Each inner sdepth
          * iteration of the DPAS consumes a single dword for int8, uint8, or
          * float16 types. These are the one source types currently
          * supportable through Vulkan. This is independent of reg_unit.
          */
         return rcount * sdepth * 4;
      default:
         unreachable("Invalid source number.");
      }
      break;
   }

   default:
      break;
   }

   switch (src[arg].file) {
   case UNIFORM:
   case IMM:
      return components_read(arg) * brw_type_size_bytes(src[arg].type);
   case BAD_FILE:
   case ARF:
   case FIXED_GRF:
   case VGRF:
   case ATTR:
      return components_read(arg) * src[arg].component_size(exec_size);
   }
   return 0;
}
|
|
|
|
|
|
2016-05-18 21:54:35 -07:00
|
|
|
namespace {
|
2019-09-24 17:06:12 -05:00
|
|
|
unsigned
|
2022-07-22 17:11:52 -07:00
|
|
|
predicate_width(const intel_device_info *devinfo, brw_predicate predicate)
|
2019-09-24 17:06:12 -05:00
|
|
|
{
|
2022-07-22 17:11:52 -07:00
|
|
|
if (devinfo->ver >= 20) {
|
|
|
|
|
return 1;
|
|
|
|
|
} else {
|
|
|
|
|
switch (predicate) {
|
|
|
|
|
case BRW_PREDICATE_NONE: return 1;
|
|
|
|
|
case BRW_PREDICATE_NORMAL: return 1;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY2H: return 2;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL2H: return 2;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY4H: return 4;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL4H: return 4;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY8H: return 8;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL8H: return 8;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY16H: return 16;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL16H: return 16;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ANY32H: return 32;
|
|
|
|
|
case BRW_PREDICATE_ALIGN1_ALL32H: return 32;
|
|
|
|
|
default: unreachable("Unsupported predicate");
|
|
|
|
|
}
|
2019-09-24 17:06:12 -05:00
|
|
|
}
|
|
|
|
|
}
|
2016-05-18 21:54:35 -07:00
|
|
|
}
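/* Note: predicate_width() is the granularity (in channels) at which a
 * predicate samples the flag register; flags_read() below uses it to round
 * the flag footprint of a predicated instruction up to that granularity,
 * e.g. an ALIGN1_ANY4H predicate is counted as reading whole groups of 4
 * flag bits.
 */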
|
|
|
|
|
|
|
|
|
|
unsigned
|
2021-04-05 13:19:39 -07:00
|
|
|
fs_inst::flags_read(const intel_device_info *devinfo) const
|
2013-10-20 11:32:01 -07:00
|
|
|
{
|
2022-07-22 17:11:52 -07:00
|
|
|
if (devinfo->ver < 20 && (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
|
|
|
|
|
predicate == BRW_PREDICATE_ALIGN1_ALLV)) {
|
2016-05-18 21:54:35 -07:00
|
|
|
/* The vertical predication modes combine corresponding bits from
|
2024-02-17 22:15:44 -08:00
|
|
|
* f0.0 and f1.0 on Gfx7+.
|
2016-05-18 21:54:35 -07:00
|
|
|
*/
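/* Each bit of the flag mask covers 8 channels (one byte of the flag
 * register file), so f0.0 occupies bits 0-1 and f1.0 bits 4-5; shifting
 * the f0.0 mask left by 4 selects the matching f1.0 bits.
 */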
|
2024-02-17 22:15:44 -08:00
|
|
|
const unsigned shift = 4;
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(this, 1) << shift | brw_fs_flag_mask(this, 1);
|
2016-05-18 21:54:35 -07:00
|
|
|
} else if (predicate) {
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(this, predicate_width(devinfo, predicate));
|
2016-05-18 21:54:35 -07:00
|
|
|
} else {
|
2017-06-22 16:42:34 -07:00
|
|
|
unsigned mask = 0;
|
|
|
|
|
for (int i = 0; i < sources; i++) {
|
2024-06-19 10:50:51 -07:00
|
|
|
mask |= brw_fs_flag_mask(src[i], size_read(devinfo, i));
|
2017-06-22 16:42:34 -07:00
|
|
|
}
|
|
|
|
|
return mask;
|
2016-05-18 21:54:35 -07:00
|
|
|
}
|
2013-10-20 11:32:01 -07:00
|
|
|
}
|
|
|
|
|
|
2016-05-18 21:54:35 -07:00
|
|
|
unsigned
|
intel/fs: sel.cond writes the flags on Gfx4 and Gfx5
On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
using a separate cmpn and sel instruction. This lowering occurs in
fs_visitor::lower_minmax which is called very, very late... a long, long
time after the first calls to opt_cmod_propagation. As a result,
conditional modifiers can be incorrectly propagated across sel.cond on
those platforms.
No tests were affected by this change, and I find that quite shocking.
After just changing flags_written(), all of the atan tests started
failing on ILK. That required the change in cmod_propagation (and the
addition of the prop_across_into_sel_gfx5 unit test).
Shader-db results for ILK and GM45 are below. I looked at a couple
before and after shaders... and every case that I looked at had
experienced incorrect cmod propagation. This affected a LOT of apps!
Euro Truck Simulator 2, The Talos Principle, Serious Sam 3, Sanctum 2,
Gang Beasts, and on and on... :(
I discovered this bug while working on a couple new optimization
passes. One of the passes attempts to remove condition modifiers that
are never used. The pass made no progress except on ILK and GM45.
After investigating a couple of the affected shaders, I noticed that
the code in those shaders looked wrong... investigation led to this
cause.
v2: Trivial changes in the unit tests.
v3: Fix typo in comment in unit tests. Noticed by Jason and Priit.
v4: Tweak handling of BRW_OPCODE_SEL special case. Suggested by Jason.
Fixes: df1aec763eb ("i965/fs: Define methods to calculate the flag subset read or written by an fs_inst.")
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Tested-by: Dave Airlie <airlied@redhat.com>
Iron Lake
total instructions in shared programs: 8180493 -> 8181781 (0.02%)
instructions in affected programs: 541796 -> 543084 (0.24%)
helped: 28
HURT: 1158
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.35% max: 0.86% x̄: 0.53% x̃: 0.50%
HURT stats (abs) min: 1 max: 3 x̄: 1.14 x̃: 1
HURT stats (rel) min: 0.12% max: 4.00% x̄: 0.37% x̃: 0.23%
95% mean confidence interval for instructions value: 1.06 1.11
95% mean confidence interval for instructions %-change: 0.31% 0.38%
Instructions are HURT.
total cycles in shared programs: 239420470 -> 239421690 (<.01%)
cycles in affected programs: 2925992 -> 2927212 (0.04%)
helped: 49
HURT: 157
helped stats (abs) min: 2 max: 284 x̄: 62.69 x̃: 70
helped stats (rel) min: 0.04% max: 6.20% x̄: 1.68% x̃: 1.96%
HURT stats (abs) min: 2 max: 48 x̄: 27.34 x̃: 24
HURT stats (rel) min: 0.02% max: 2.91% x̄: 0.31% x̃: 0.20%
95% mean confidence interval for cycles value: -0.80 12.64
95% mean confidence interval for cycles %-change: -0.31% <.01%
Inconclusive result (value mean confidence interval includes 0).
GM45
total instructions in shared programs: 4985517 -> 4986207 (0.01%)
instructions in affected programs: 306935 -> 307625 (0.22%)
helped: 14
HURT: 625
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.35% max: 0.82% x̄: 0.52% x̃: 0.49%
HURT stats (abs) min: 1 max: 3 x̄: 1.13 x̃: 1
HURT stats (rel) min: 0.12% max: 3.90% x̄: 0.34% x̃: 0.22%
95% mean confidence interval for instructions value: 1.04 1.12
95% mean confidence interval for instructions %-change: 0.29% 0.36%
Instructions are HURT.
total cycles in shared programs: 153827268 -> 153828052 (<.01%)
cycles in affected programs: 1669290 -> 1670074 (0.05%)
helped: 24
HURT: 84
helped stats (abs) min: 2 max: 232 x̄: 64.33 x̃: 67
helped stats (rel) min: 0.04% max: 4.62% x̄: 1.60% x̃: 1.94%
HURT stats (abs) min: 2 max: 48 x̄: 27.71 x̃: 24
HURT stats (rel) min: 0.02% max: 2.66% x̄: 0.34% x̃: 0.14%
95% mean confidence interval for cycles value: -1.94 16.46
95% mean confidence interval for cycles %-change: -0.29% 0.11%
Inconclusive result (value mean confidence interval includes 0).
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12191>
2021-08-02 21:33:17 -07:00
|
|
|
fs_inst::flags_written(const intel_device_info *devinfo) const
|
2013-10-20 11:32:01 -07:00
|
|
|
{
|
2024-02-19 19:41:48 -08:00
|
|
|
if (conditional_mod && (opcode != BRW_OPCODE_SEL &&
|
|
|
|
|
opcode != BRW_OPCODE_CSEL &&
|
|
|
|
|
opcode != BRW_OPCODE_IF &&
|
|
|
|
|
opcode != BRW_OPCODE_WHILE)) {
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(this, 1);
|
2024-02-27 02:02:24 -08:00
|
|
|
} else if (opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(this, 32);
|
2016-05-18 21:54:35 -07:00
|
|
|
} else {
|
2024-01-04 22:29:54 -08:00
|
|
|
return brw_fs_flag_mask(dst, size_written);
|
2016-05-18 21:54:35 -07:00
|
|
|
}
|
2013-10-20 11:32:01 -07:00
|
|
|
}
|
|
|
|
|
|
2023-05-23 13:11:02 +03:00
|
|
|
bool
|
|
|
|
|
fs_inst::has_sampler_residency() const
|
|
|
|
|
{
|
|
|
|
|
switch (opcode) {
|
|
|
|
|
case SHADER_OPCODE_TEX_LOGICAL:
|
|
|
|
|
case FS_OPCODE_TXB_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXL_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TXS_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_LOGICAL:
|
2023-02-16 20:30:30 -08:00
|
|
|
case SHADER_OPCODE_TG4_BIAS_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_EXPLICIT_LOD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_IMPLICIT_LOD_LOGICAL:
|
2023-03-05 15:27:08 -08:00
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOD_LOGICAL:
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_BIAS_LOGICAL:
|
2023-05-23 13:11:02 +03:00
|
|
|
assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
|
|
|
|
|
return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
intel/brw: Copy prop from raw integer moves with mismatched types
The specific pattern from the unit test was observed in ray tracing
trampoline shaders.
v2: Refactor the is_raw_move tests out to a utility function. Suggested
by Ken.
v3: Fix a regression caused by being too picky about source
modifiers. This was introduced somewhere between when I did initial
shader-db runs an v2.
v4: Fix typo in comment. Noticed by Caio.
shader-db:
All Intel platforms had similar results. (Meteor Lake shown)
total instructions in shared programs: 19734086 -> 19733997 (<.01%)
instructions in affected programs: 135388 -> 135299 (-0.07%)
helped: 76 / HURT: 2
total cycles in shared programs: 916290451 -> 916264968 (<.01%)
cycles in affected programs: 41046002 -> 41020519 (-0.06%)
helped: 32 / HURT: 29
fossil-db:
Meteor Lake, DG2, and Skylake had similar results. (Meteor Lake shown)
Totals:
Instrs: 151531355 -> 151513669 (-0.01%); split: -0.01%, +0.00%
Cycle count: 17209372399 -> 17208178205 (-0.01%); split: -0.01%, +0.00%
Max live registers: 32016490 -> 32016493 (+0.00%)
Totals from 17361 (2.75% of 630198) affected shaders:
Instrs: 2642048 -> 2624362 (-0.67%); split: -0.67%, +0.00%
Cycle count: 79803066 -> 78608872 (-1.50%); split: -1.75%, +0.25%
Max live registers: 421668 -> 421671 (+0.00%)
Tiger Lake and Ice Lake had similar results. (Tiger Lake shown)
Totals:
Instrs: 149995644 -> 149977326 (-0.01%); split: -0.01%, +0.00%
Cycle count: 15567293770 -> 15566524840 (-0.00%); split: -0.02%, +0.01%
Spill count: 61241 -> 61238 (-0.00%)
Fill count: 107304 -> 107301 (-0.00%)
Max live registers: 31993109 -> 31993112 (+0.00%)
Totals from 17813 (2.83% of 629912) affected shaders:
Instrs: 3738236 -> 3719918 (-0.49%); split: -0.49%, +0.00%
Cycle count: 4251157049 -> 4250388119 (-0.02%); split: -0.06%, +0.04%
Spill count: 28268 -> 28265 (-0.01%)
Fill count: 50377 -> 50374 (-0.01%)
Max live registers: 470648 -> 470651 (+0.00%)
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30251>
2024-07-16 16:04:38 -07:00
|
|
|
/* \sa inst_is_raw_move in brw_eu_validate. */
|
|
|
|
|
bool
|
|
|
|
|
fs_inst::is_raw_move() const
|
|
|
|
|
{
|
|
|
|
|
if (opcode != BRW_OPCODE_MOV)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (src[0].file == IMM) {
|
|
|
|
|
if (brw_type_is_vector_imm(src[0].type))
|
|
|
|
|
return false;
|
|
|
|
|
} else if (src[0].negate || src[0].abs) {
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (saturate)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
return src[0].type == dst.type ||
|
|
|
|
|
(brw_type_is_int(src[0].type) &&
|
|
|
|
|
brw_type_is_int(dst.type) &&
|
|
|
|
|
brw_type_size_bits(src[0].type) == brw_type_size_bits(dst.type));
|
|
|
|
|
}
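/* Illustration (hypothetical instructions): a MOV between D and UD sources
 * and destinations of the same bit size is a raw move, while a MOV with
 * saturate, a negated or absolute source, a float<->integer conversion, or
 * a vector immediate (e.g. VF) source is not.
 */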
|
|
|
|
|
|
2013-11-12 15:33:27 -08:00
|
|
|
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
|
2011-03-23 12:50:53 -07:00
|
|
|
* This brings in those uniform definitions
|
|
|
|
|
*/
|
|
|
|
|
void
|
2011-07-25 18:13:04 -07:00
|
|
|
fs_visitor::import_uniforms(fs_visitor *v)
|
2011-03-23 12:50:53 -07:00
|
|
|
{
|
2014-03-11 14:35:27 -07:00
|
|
|
this->push_constant_loc = v->push_constant_loc;
|
|
|
|
|
this->uniforms = v->uniforms;
|
2011-03-23 12:50:53 -07:00
|
|
|
}
|
|
|
|
|
|
2024-11-18 11:33:35 +02:00
|
|
|
enum intel_barycentric_mode
|
2024-04-18 09:54:11 +03:00
|
|
|
brw_barycentric_mode(const struct brw_wm_prog_key *key,
|
|
|
|
|
nir_intrinsic_instr *intr)
|
2016-07-11 15:00:37 -07:00
|
|
|
{
|
2022-07-06 13:01:24 -07:00
|
|
|
const glsl_interp_mode mode =
|
|
|
|
|
(enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);
|
|
|
|
|
|
2016-07-11 15:00:37 -07:00
|
|
|
/* Barycentric modes don't make sense for flat inputs. */
|
2016-07-07 02:02:38 -07:00
|
|
|
assert(mode != INTERP_MODE_FLAT);
|
2016-07-11 15:00:37 -07:00
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
unsigned bary;
|
2022-07-06 13:01:24 -07:00
|
|
|
switch (intr->intrinsic) {
|
2016-07-12 03:57:25 -07:00
|
|
|
case nir_intrinsic_load_barycentric_pixel:
|
|
|
|
|
case nir_intrinsic_load_barycentric_at_offset:
|
2024-04-18 09:54:11 +03:00
|
|
|
/* When per sample interpolation is dynamic, assume sample
|
|
|
|
|
* interpolation. We'll dynamically remap things so that the FS thread
|
|
|
|
|
* payload is not affected.
|
|
|
|
|
*/
|
2024-11-18 10:58:46 +02:00
|
|
|
bary = key->persample_interp == INTEL_SOMETIMES ?
|
2024-11-18 11:33:35 +02:00
|
|
|
INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE :
|
|
|
|
|
INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL;
|
2016-07-12 03:57:25 -07:00
|
|
|
break;
|
|
|
|
|
case nir_intrinsic_load_barycentric_centroid:
|
2024-11-18 11:33:35 +02:00
|
|
|
bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID;
|
2016-07-12 03:57:25 -07:00
|
|
|
break;
|
|
|
|
|
case nir_intrinsic_load_barycentric_sample:
|
|
|
|
|
case nir_intrinsic_load_barycentric_at_sample:
|
2024-11-18 11:33:35 +02:00
|
|
|
bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE;
|
2016-07-12 03:57:25 -07:00
|
|
|
break;
|
|
|
|
|
default:
|
|
|
|
|
unreachable("invalid intrinsic");
|
2016-07-11 15:00:37 -07:00
|
|
|
}
|
|
|
|
|
|
2016-07-07 02:02:38 -07:00
|
|
|
if (mode == INTERP_MODE_NOPERSPECTIVE)
|
2016-07-11 15:00:37 -07:00
|
|
|
bary += 3;
|
|
|
|
|
|
2024-11-18 11:33:35 +02:00
|
|
|
return (enum intel_barycentric_mode) bary;
|
2016-07-11 15:00:37 -07:00
|
|
|
}
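/* The adjustment above relies on the intel_barycentric_mode enum laying out
 * the three perspective modes first, with the matching nonperspective mode
 * three entries later, so "bary += 3" flips a perspective mode to its
 * nonperspective counterpart.
 */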
|
|
|
|
|
|
intel/compiler: Use an existing URB write to end TCS threads when viable
VS, TCS, TES, and GS threads must end with a URB write message with the
EOT (end of thread) bit set. For VS and TES, we shadow output variables
with temporaries and perform all stores at the end of the shader, giving
us an existing message to do the EOT.
In tessellation control shaders, we don't defer output stores until the
end of the thread like we do for vertex or evaluation shaders. We just
process store_output and store_per_vertex_output intrinsics where they
occur, which may be in control flow. So we can't guarantee that there's
a URB write at the end of the shader.
Traditionally, we've just emitted a separate URB write to finish TCS
threads, doing a writemasked write to an single patch header DWord.
On Broadwell, we need to set a "TR DS Cache Disable" bit, so this is
a convenient spot to do so. But on other platforms, there's no such
field, and this write is purely wasteful.
Instead of emitting a separate write, we can just look for an existing
URB write at the end of the program and tag that with EOT, if possible.
We already had code to do this for geometry shaders, so just lift it
into a helper function and reuse it.
No changes in shader-db.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17944>
2022-08-03 20:54:52 -07:00
|
|
|
/**
|
|
|
|
|
* Walk backwards from the end of the program looking for a URB write that
|
|
|
|
|
* isn't in control flow, and mark it with EOT.
|
|
|
|
|
*
|
|
|
|
|
* Return true if successful or false if a separate EOT write is needed.
|
|
|
|
|
*/
|
|
|
|
|
bool
|
|
|
|
|
fs_visitor::mark_last_urb_write_with_eot()
|
|
|
|
|
{
|
|
|
|
|
foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
|
|
|
|
|
if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
|
|
|
|
|
prev->eot = true;
|
|
|
|
|
|
|
|
|
|
/* Delete now dead instructions. */
|
|
|
|
|
foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
|
|
|
|
|
if (dead == prev)
|
|
|
|
|
break;
|
|
|
|
|
dead->remove();
|
|
|
|
|
}
|
|
|
|
|
return true;
|
|
|
|
|
} else if (prev->is_control_flow() || prev->has_side_effects()) {
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-03 12:15:21 -07:00
|
|
|
static unsigned
|
|
|
|
|
round_components_to_whole_registers(const intel_device_info *devinfo,
|
|
|
|
|
unsigned c)
|
|
|
|
|
{
|
|
|
|
|
return DIV_ROUND_UP(c, 8 * reg_unit(devinfo)) * reg_unit(devinfo);
|
|
|
|
|
}
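/* Example: 10 dword components round up to DIV_ROUND_UP(10, 8) = 2
 * registers when reg_unit == 1 (32B GRFs), and to DIV_ROUND_UP(10, 16) * 2
 * == 2 when reg_unit == 2 (64B GRFs counted in 32B units), i.e. one whole
 * physical register.
 */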
|
|
|
|
|
|
2010-08-26 16:39:41 -07:00
|
|
|
void
|
|
|
|
|
fs_visitor::assign_curb_setup()
|
|
|
|
|
{
|
2022-08-03 12:15:21 -07:00
|
|
|
unsigned uniform_push_length =
|
|
|
|
|
round_components_to_whole_registers(devinfo, prog_data->nr_params);
|
2016-11-29 02:47:15 -08:00
|
|
|
|
|
|
|
|
unsigned ubo_push_length = 0;
|
2016-11-29 05:20:20 -08:00
|
|
|
unsigned ubo_push_start[4];
|
2016-11-29 02:47:15 -08:00
|
|
|
for (int i = 0; i < 4; i++) {
|
2016-11-29 05:20:20 -08:00
|
|
|
ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
|
2024-02-19 23:07:04 -08:00
|
|
|
ubo_push_length += prog_data->ubo_ranges[i].length;
|
2022-08-03 12:15:21 -07:00
|
|
|
|
|
|
|
|
assert(ubo_push_start[i] % (8 * reg_unit(devinfo)) == 0);
|
|
|
|
|
assert(ubo_push_length % (1 * reg_unit(devinfo)) == 0);
|
2016-11-29 02:47:15 -08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
prog_data->curb_read_length = uniform_push_length + ubo_push_length;
|
2024-07-01 14:45:38 -07:00
|
|
|
if (stage == MESA_SHADER_FRAGMENT &&
|
|
|
|
|
((struct brw_wm_prog_key *)key)->null_push_constant_tbimr_workaround)
|
|
|
|
|
prog_data->curb_read_length = MAX2(1, prog_data->curb_read_length);
|
2014-02-19 15:27:01 +01:00
|
|
|
|
2020-04-03 20:20:53 -05:00
|
|
|
uint64_t used = 0;
|
2021-10-04 13:58:07 +03:00
|
|
|
bool is_compute = gl_shader_stage_is_compute(stage);
|
2020-04-03 20:20:53 -05:00
|
|
|
|
2024-09-30 08:45:21 +03:00
|
|
|
if (is_compute && devinfo->verx10 >= 125 && uniform_push_length > 0) {
|
2022-07-15 13:08:23 +03:00
|
|
|
assert(devinfo->has_lsc);
|
2023-11-21 09:47:18 -08:00
|
|
|
fs_builder ubld = fs_builder(this, 1).exec_all().at(
|
2020-05-04 16:17:58 -05:00
|
|
|
cfg->first_block(), cfg->first_block()->start());
|
|
|
|
|
|
2022-07-15 13:08:23 +03:00
|
|
|
/* The base offset for our push data is passed in as R0.0[31:6]. We have
|
|
|
|
|
* to mask off the bottom 6 bits.
|
2020-05-04 16:17:58 -05:00
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg base_addr =
|
2024-04-12 17:43:22 -07:00
|
|
|
ubld.AND(retype(brw_vec1_grf(0, 0), BRW_TYPE_UD),
|
|
|
|
|
brw_imm_ud(INTEL_MASK(31, 6)));
|
2020-05-04 16:17:58 -05:00
|
|
|
|
2021-03-29 15:46:12 -07:00
|
|
|
/* On Gfx12-HP we load constants at the start of the program using A32
|
2020-05-04 16:17:58 -05:00
|
|
|
* stateless messages.
|
|
|
|
|
*/
|
|
|
|
|
for (unsigned i = 0; i < uniform_push_length;) {
|
2022-07-15 13:08:23 +03:00
|
|
|
/* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
|
|
|
|
|
unsigned num_regs = MIN2(uniform_push_length - i, 8);
|
2020-05-04 16:17:58 -05:00
|
|
|
assert(num_regs > 0);
|
|
|
|
|
num_regs = 1 << util_logbase2(num_regs);
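/* Rounding down to a power of two splits the push range into multiple
 * loads when needed; e.g. a hypothetical 7-GRF range is fetched as
 * 4 + 2 + 1 register loads on successive iterations.
 */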
|
|
|
|
|
|
2024-01-26 12:25:41 -08:00
|
|
|
/* This pass occurs after all of the optimization passes, so don't
|
|
|
|
|
* emit an 'ADD addr, base_addr, 0' instruction.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg addr = i == 0 ? base_addr :
|
2024-04-12 17:43:22 -07:00
|
|
|
ubld.ADD(base_addr, brw_imm_ud(i * REG_SIZE));
|
2020-05-04 16:17:58 -05:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg srcs[4] = {
|
2020-05-04 16:17:58 -05:00
|
|
|
brw_imm_ud(0), /* desc */
|
|
|
|
|
brw_imm_ud(0), /* ex_desc */
|
2022-07-15 13:08:23 +03:00
|
|
|
addr, /* payload */
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg(), /* payload2 */
|
2020-05-04 16:17:58 -05:00
|
|
|
};
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
|
2024-04-20 17:08:02 -07:00
|
|
|
BRW_TYPE_UD);
|
2022-07-15 13:08:23 +03:00
|
|
|
fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);
|
|
|
|
|
|
|
|
|
|
send->sfid = GFX12_SFID_UGM;
|
|
|
|
|
send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
|
|
|
|
|
LSC_ADDR_SURFTYPE_FLAT,
|
|
|
|
|
LSC_ADDR_SIZE_A32,
|
|
|
|
|
LSC_DATA_SIZE_D32,
|
|
|
|
|
num_regs * 8 /* num_channels */,
|
|
|
|
|
true /* transpose */,
|
2022-09-29 12:38:19 -07:00
|
|
|
LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS));
|
2022-07-15 13:08:23 +03:00
|
|
|
send->header_size = 0;
|
2022-09-28 16:17:02 -07:00
|
|
|
send->mlen = lsc_msg_addr_len(devinfo, LSC_ADDR_SIZE_A32, 1);
|
2022-07-15 13:08:23 +03:00
|
|
|
send->size_written =
|
2022-09-28 16:17:02 -07:00
|
|
|
lsc_msg_dest_len(devinfo, LSC_DATA_SIZE_D32, num_regs * 8) * REG_SIZE;
|
2020-05-04 16:17:58 -05:00
|
|
|
send->send_is_volatile = true;
|
|
|
|
|
|
|
|
|
|
i += num_regs;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
|
|
|
|
}
|
|
|
|
|
|
2010-08-26 16:39:41 -07:00
|
|
|
/* Map the offsets in the UNIFORM file to fixed HW regs. */
|
2014-09-01 13:35:04 -07:00
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2014-03-17 10:39:43 -07:00
|
|
|
for (unsigned int i = 0; i < inst->sources; i++) {
|
2010-08-26 16:39:41 -07:00
|
|
|
if (inst->src[i].file == UNIFORM) {
|
2016-09-01 12:42:20 -07:00
|
|
|
int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
|
2014-03-11 14:35:27 -07:00
|
|
|
int constant_nr;
|
2016-11-29 05:20:20 -08:00
|
|
|
if (inst->src[i].nr >= UBO_START) {
|
|
|
|
|
/* constant_nr is in 32-bit units, the rest are in bytes */
|
|
|
|
|
constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
|
|
|
|
|
inst->src[i].offset / 4;
|
|
|
|
|
} else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
|
2014-03-11 14:35:27 -07:00
|
|
|
constant_nr = push_constant_loc[uniform_nr];
|
|
|
|
|
} else {
|
|
|
|
|
/* Section 5.11 of the OpenGL 4.1 spec says:
|
|
|
|
|
* "Out-of-bounds reads return undefined values, which include
|
|
|
|
|
* values from other variables of the active program or zero."
|
|
|
|
|
* Just return the first push constant.
|
|
|
|
|
*/
|
|
|
|
|
constant_nr = 0;
|
|
|
|
|
}
|
|
|
|
|
|
2020-04-03 20:20:53 -05:00
|
|
|
assert(constant_nr / 8 < 64);
|
|
|
|
|
used |= BITFIELD64_BIT(constant_nr / 8);
|
|
|
|
|
|
2022-08-19 12:40:20 -07:00
|
|
|
struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
|
2010-08-27 14:15:42 -07:00
|
|
|
constant_nr / 8,
|
|
|
|
|
constant_nr % 8);
|
2015-10-24 15:29:03 -07:00
|
|
|
brw_reg.abs = inst->src[i].abs;
|
|
|
|
|
brw_reg.negate = inst->src[i].negate;
|
2010-08-26 16:39:41 -07:00
|
|
|
|
i965/fs: Fix stride field for uniforms.
This fixes essentially the same problem as for immediates. Registers
of the UNIFORM file are typically accessed according to the formula:
read_uniform(r, channel_index, array_index) =
read_element(r, channel_index * 0 + array_index * 1)
Which matches the general direct addressing formula for stride=0:
read_direct(r, channel_index, array_index) =
read_element(r, channel_index * stride +
array_index * max{1, stride * width})
In either case if reladdr is present the access will be according to
the composition of two register regions, the first one determining the
per-channel array_index used for the second, like:
read_indirect(r, channel_index, array_index) =
read_direct(r, channel_index,
read(r.reladdr, channel_index, array_index))
where:
read(r, channel_index, array_index) = if r.reladdr == NULL
then read_direct(r, channel_index, array_index)
else read_indirect(r, channel_index, array_index)
In conclusion we can handle uniforms consistently with the other
register files if we set stride to zero. After lowering to a GRF
using VARYING_PULL_CONSTANT_LOAD in demote_pull_constant_loads() the
stride of the source is set to one again because the result of
VARYING_PULL_CONSTANT_LOAD is generally non-uniform.
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
2015-07-13 15:29:39 +03:00
|
|
|
assert(inst->src[i].stride == 0);
|
2015-10-24 15:29:03 -07:00
|
|
|
inst->src[i] = byte_offset(
|
2013-12-08 04:57:08 +01:00
|
|
|
retype(brw_reg, inst->src[i].type),
|
2016-09-01 15:11:21 -07:00
|
|
|
inst->src[i].offset % 4);
|
2010-08-26 16:39:41 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
2014-10-03 19:05:32 -07:00
|
|
|
|
2024-02-19 23:07:04 -08:00
|
|
|
uint64_t want_zero = used & prog_data->zero_push_reg;
|
2020-04-03 20:20:53 -05:00
|
|
|
if (want_zero) {
|
2023-11-21 09:47:18 -08:00
|
|
|
fs_builder ubld = fs_builder(this, 8).exec_all().at(
|
2020-04-03 20:20:53 -05:00
|
|
|
cfg->first_block(), cfg->first_block()->start());
|
|
|
|
|
|
|
|
|
|
/* push_reg_mask_param is in 32-bit units */
|
2024-02-19 23:07:04 -08:00
|
|
|
unsigned mask_param = prog_data->push_reg_mask_param;
|
2022-08-19 12:40:20 -07:00
|
|
|
struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
|
|
|
|
|
mask_param % 8);
|
2020-04-03 20:20:53 -05:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg b32;
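/* The loop below expands one bit of the push-register mask per candidate
 * register: the first SHL uses the 0x01234567 vector immediate so each
 * channel shifts the 16-bit mask word by a different amount, placing that
 * channel's bit in the sign position, and the ASR by 15 then replicates
 * the sign across the value so it can simply be ANDed with the push
 * register contents.
 */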
|
2020-04-03 20:20:53 -05:00
|
|
|
for (unsigned i = 0; i < 64; i++) {
|
|
|
|
|
if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg shifted = ubld.vgrf(BRW_TYPE_W, 2);
|
2020-04-03 20:20:53 -05:00
|
|
|
ubld.SHL(horiz_offset(shifted, 8),
|
2024-04-20 17:08:02 -07:00
|
|
|
byte_offset(retype(mask, BRW_TYPE_W), i / 8),
|
2020-04-03 20:20:53 -05:00
|
|
|
brw_imm_v(0x01234567));
|
|
|
|
|
ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));
|
|
|
|
|
|
|
|
|
|
fs_builder ubld16 = ubld.group(16, 0);
|
2024-04-20 17:08:02 -07:00
|
|
|
b32 = ubld16.vgrf(BRW_TYPE_D);
|
2020-04-03 20:20:53 -05:00
|
|
|
ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (want_zero & BITFIELD64_BIT(i)) {
|
|
|
|
|
assert(i < prog_data->curb_read_length);
|
|
|
|
|
struct brw_reg push_reg =
|
2024-04-20 17:30:23 -07:00
|
|
|
retype(brw_vec8_grf(payload().num_regs + i, 0), BRW_TYPE_D);
|
2020-04-03 20:20:53 -05:00
|
|
|
|
|
|
|
|
ubld.AND(push_reg, push_reg, component(b32, i % 16));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
|
|
|
|
}
|
|
|
|
|
|
2014-10-03 19:05:32 -07:00
|
|
|
/* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
|
2022-08-19 12:40:20 -07:00
|
|
|
this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
|
2010-08-26 16:39:41 -07:00
|
|
|
}
|
|
|
|
|
|
2018-12-11 18:45:43 +01:00
|
|
|
/*
|
|
|
|
|
* Build up an array of indices into the urb_setup array that
|
|
|
|
|
* references the active entries of the urb_setup array.
|
|
|
|
|
* Used to accelerate walking the active entries of the urb_setup array
|
|
|
|
|
* on each upload.
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
|
|
|
|
|
{
|
2021-10-29 12:56:22 -07:00
|
|
|
/* TODO(mesh): Review usage of this in the context of Mesh, we may want to
|
|
|
|
|
* skip per-primitive attributes here.
|
|
|
|
|
*/
|
|
|
|
|
|
2018-12-11 18:45:43 +01:00
|
|
|
/* Make sure uint8_t is sufficient */
|
|
|
|
|
STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
|
|
|
|
|
uint8_t index = 0;
|
|
|
|
|
for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
|
|
|
|
|
if (wm_prog_data->urb_setup[attr] >= 0) {
|
|
|
|
|
wm_prog_data->urb_setup_attribs[index++] = attr;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
wm_prog_data->urb_setup_attribs_count = index;
|
|
|
|
|
}
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
void
|
|
|
|
|
fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
|
|
|
|
|
{
|
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
|
|
|
|
if (inst->src[i].file == ATTR) {
|
2022-09-12 16:49:11 -07:00
|
|
|
assert(inst->src[i].nr == 0);
|
2022-08-19 12:40:20 -07:00
|
|
|
int grf = payload().num_regs +
|
2015-03-11 23:14:31 -07:00
|
|
|
prog_data->curb_read_length +
|
2016-09-01 12:42:20 -07:00
|
|
|
inst->src[i].offset / REG_SIZE;
|
2015-03-11 23:14:31 -07:00
|
|
|
|
2016-03-23 12:20:05 +01:00
|
|
|
/* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
|
|
|
|
|
*
|
|
|
|
|
* VertStride must be used to cross GRF register boundaries. This
|
|
|
|
|
* rule implies that elements within a 'Width' cannot cross GRF
|
|
|
|
|
* boundaries.
|
|
|
|
|
*
|
|
|
|
|
* So, for registers that are large enough, we have to split the exec
|
|
|
|
|
* size in two and trust the compression state to sort it out.
|
|
|
|
|
*/
|
|
|
|
|
unsigned total_size = inst->exec_size *
|
|
|
|
|
inst->src[i].stride *
|
2024-04-21 00:57:59 -07:00
|
|
|
brw_type_size_bytes(inst->src[i].type);
|
2016-03-23 12:20:05 +01:00
|
|
|
|
|
|
|
|
assert(total_size <= 2 * REG_SIZE);
|
|
|
|
|
const unsigned exec_size =
|
|
|
|
|
(total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;
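/* For example (hypothetical case), a SIMD16 float source with stride 1
 * spans 64 bytes, larger than one 32B GRF, so the region is built 8 wide
 * and instruction compression covers the second half.
 */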
|
|
|
|
|
|
|
|
|
|
unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
|
2015-10-24 15:29:03 -07:00
|
|
|
struct brw_reg reg =
|
2015-03-11 23:14:31 -07:00
|
|
|
stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
|
2016-09-01 15:11:21 -07:00
|
|
|
inst->src[i].offset % REG_SIZE),
|
2016-03-23 12:20:05 +01:00
|
|
|
exec_size * inst->src[i].stride,
|
2015-11-11 22:37:53 -08:00
|
|
|
width, inst->src[i].stride);
|
2015-10-24 15:29:03 -07:00
|
|
|
reg.abs = inst->src[i].abs;
|
|
|
|
|
reg.negate = inst->src[i].negate;
|
|
|
|
|
|
|
|
|
|
inst->src[i] = reg;
|
2015-03-11 23:14:31 -07:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2022-08-30 00:47:32 -07:00
|
|
|
int
|
|
|
|
|
brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
|
|
|
|
|
const brw_stage_prog_data *prog_data)
|
2017-09-29 12:22:48 -07:00
|
|
|
{
|
|
|
|
|
if (prog_data->nr_params == 0)
|
|
|
|
|
return -1;
|
|
|
|
|
|
2021-03-29 13:43:47 -07:00
|
|
|
if (devinfo->verx10 >= 125)
|
2020-06-16 23:06:25 -05:00
|
|
|
return -1;
|
|
|
|
|
|
2017-09-29 12:22:48 -07:00
|
|
|
/* The local thread id is always the last parameter in the list */
|
|
|
|
|
uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
|
2017-08-24 11:40:31 -07:00
|
|
|
if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
|
2017-09-29 12:22:48 -07:00
|
|
|
return prog_data->nr_params - 1;
|
|
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
|
2015-08-18 17:04:53 -07:00
|
|
|
/**
|
|
|
|
|
* Assign UNIFORM file registers to either push constants or pull constants.
|
2012-11-08 16:06:24 -08:00
|
|
|
*
|
2015-08-18 17:04:53 -07:00
|
|
|
* We allow a fragment shader to have more than the specified minimum
|
|
|
|
|
* maximum number of fragment shader uniform components (64). If
|
|
|
|
|
* there are too many of these, they'd fill up all of register space.
|
|
|
|
|
* So, this will push some of them out to the pull constant buffer and
|
2015-12-08 17:34:38 -08:00
|
|
|
* update the program to load them.
|
2012-11-08 16:06:24 -08:00
|
|
|
*/
|
|
|
|
|
void
|
2015-08-18 17:04:53 -07:00
|
|
|
fs_visitor::assign_constant_locations()
|
2012-11-08 16:06:24 -08:00
|
|
|
{
|
2016-02-22 10:42:07 -08:00
|
|
|
/* Only the first compile gets to decide on locations. */
|
2021-12-03 21:34:06 -06:00
|
|
|
if (push_constant_loc)
|
2014-03-07 16:10:50 -08:00
|
|
|
return;
|
2019-10-31 15:57:52 -05:00
|
|
|
|
2021-12-03 21:34:06 -06:00
|
|
|
push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
|
|
|
|
|
for (unsigned u = 0; u < uniforms; u++)
|
|
|
|
|
push_constant_loc[u] = u;
|
2014-03-11 14:35:27 -07:00
|
|
|
|
2016-11-29 05:20:20 -08:00
|
|
|
/* Now that we know how many regular uniforms we'll push, reduce the
|
|
|
|
|
* UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
|
2021-07-28 13:51:38 +10:00
|
|
|
*/
|
2024-02-17 22:43:47 -08:00
|
|
|
const unsigned max_push_length = 64;
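/* 64 registers of 32 bytes, i.e. 2KB of push constant space shared between
 * the regular uniforms and the four UBO push ranges.
 */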
|
2022-08-03 12:15:21 -07:00
|
|
|
unsigned push_length =
|
|
|
|
|
round_components_to_whole_registers(devinfo, prog_data->nr_params);
|
2016-11-29 05:20:20 -08:00
|
|
|
for (int i = 0; i < 4; i++) {
|
|
|
|
|
struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
|
|
|
|
|
|
2021-07-28 13:51:38 +10:00
|
|
|
if (push_length + range->length > max_push_length)
|
|
|
|
|
range->length = max_push_length - push_length;
|
2016-11-29 05:20:20 -08:00
|
|
|
|
|
|
|
|
push_length += range->length;
|
2022-08-03 12:15:21 -07:00
|
|
|
|
|
|
|
|
assert(push_length % (1 * reg_unit(devinfo)) == 0);
|
|
|
|
|
|
2016-11-29 05:20:20 -08:00
|
|
|
}
|
2021-07-28 13:51:38 +10:00
|
|
|
assert(push_length <= max_push_length);
|
i965/fs: Split pull parameter decision making from mechanical demoting.
move_uniform_array_access_to_pull_constants() and setup_pull_constants()
both have two parts:
1. Decide which UNIFORM registers to demote to pull constants, and
assign locations.
2. Mechanically rewrite the instruction stream to pull the uniform
value into a temporary VGRF and use that, eliminating the UNIFORM
file access.
In order to support pull constants in SIMD16 mode, we will need to make
decisions exactly once, but rewrite both instruction streams.
Separating these two tasks will make this easier.
This patch introduces a new helper, demote_pull_constants(), which
takes care of rewriting the instruction stream, in both cases.
For the moment, a single invocation of demote_pull_constants can't
safely handle both reladdr and non-reladdr tasks, since the two callers
still use different names for uniforms due to remove_dead_constants()
remapping of things. So, we get an ugly boolean parameter saying
which to do. This will go away.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-10 13:14:03 -07:00
|
|
|
}
|
|
|
|
|
|
2017-06-02 09:54:31 -07:00
|
|
|
bool
|
2024-06-18 23:42:59 -07:00
|
|
|
fs_visitor::get_pull_locs(const brw_reg &src,
|
2017-06-02 09:54:31 -07:00
|
|
|
unsigned *out_surf_index,
|
|
|
|
|
unsigned *out_pull_index)
|
|
|
|
|
{
|
|
|
|
|
assert(src.file == UNIFORM);
|
|
|
|
|
|
2021-12-03 21:34:06 -06:00
|
|
|
if (src.nr < UBO_START)
|
|
|
|
|
return false;
|
2016-11-29 05:20:20 -08:00
|
|
|
|
2021-12-03 21:34:06 -06:00
|
|
|
const struct brw_ubo_range *range =
|
|
|
|
|
&prog_data->ubo_ranges[src.nr - UBO_START];
|
2019-09-09 22:21:17 -07:00
|
|
|
|
2021-12-03 21:34:06 -06:00
|
|
|
/* If this access is in our (reduced) range, use the push data. */
|
|
|
|
|
if (src.offset / 32 < range->length)
|
|
|
|
|
return false;
|
2017-06-02 09:54:31 -07:00
|
|
|
|
2021-12-03 22:20:30 -06:00
|
|
|
*out_surf_index = range->block;
|
2021-12-03 21:34:06 -06:00
|
|
|
*out_pull_index = (32 * range->start + src.offset) / 4;
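/* range->start is in 32-byte register units and src.offset is in bytes, so
 * the pull index above is a dword offset from the start of the UBO block.
 */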
|
2019-09-09 22:21:17 -07:00
|
|
|
|
2021-12-03 21:34:06 -06:00
|
|
|
prog_data->has_ubo_pull = true;
|
2017-06-02 09:54:31 -07:00
|
|
|
|
2021-12-03 21:34:06 -06:00
|
|
|
return true;
|
2017-06-02 09:54:31 -07:00
|
|
|
}
|
|
|
|
|
|
2020-01-23 12:50:50 -08:00
|
|
|
/**
|
|
|
|
|
* Get the mask of SIMD channels enabled during dispatch and not yet disabled
|
|
|
|
|
* by discard. Due to the layout of the sample mask in the fragment shader
|
|
|
|
|
* thread payload, \p bld is required to have a dispatch_width() not greater
|
|
|
|
|
* than 16 for fragment shaders.
|
|
|
|
|
*/
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg
|
2022-06-27 12:24:58 -07:00
|
|
|
brw_sample_mask_reg(const fs_builder &bld)
|
2020-01-23 12:50:50 -08:00
|
|
|
{
|
2023-12-05 17:16:34 -08:00
|
|
|
const fs_visitor &s = *bld.shader;
|
2020-01-23 12:50:50 -08:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
if (s.stage != MESA_SHADER_FRAGMENT) {
|
2020-01-23 12:50:50 -08:00
|
|
|
return brw_imm_ud(0xffffffff);
|
2022-08-16 17:40:31 -07:00
|
|
|
} else if (s.devinfo->ver >= 20 ||
|
|
|
|
|
brw_wm_prog_data(s.prog_data)->uses_kill) {
|
2023-12-05 17:16:34 -08:00
|
|
|
return brw_flag_subreg(sample_mask_flag_subreg(s) + bld.group() / 16);
|
2020-01-23 12:50:50 -08:00
|
|
|
} else {
|
2024-02-17 22:43:47 -08:00
|
|
|
assert(bld.dispatch_width() <= 16);
|
2022-06-11 17:36:09 -07:00
|
|
|
assert(s.devinfo->ver < 20);
|
2020-01-23 12:50:50 -08:00
|
|
|
return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
|
2024-04-20 17:08:02 -07:00
|
|
|
BRW_TYPE_UW);
|
2020-01-23 12:50:50 -08:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-08-25 23:59:25 -07:00
|
|
|
uint32_t
|
|
|
|
|
brw_fb_write_msg_control(const fs_inst *inst,
|
|
|
|
|
const struct brw_wm_prog_data *prog_data)
|
|
|
|
|
{
|
|
|
|
|
uint32_t mctl;
|
|
|
|
|
|
2024-02-19 19:41:48 -08:00
|
|
|
if (prog_data->dual_src_blend) {
|
2022-07-22 17:33:12 -07:00
|
|
|
assert(inst->exec_size < 32);
|
2019-08-25 23:59:25 -07:00
|
|
|
|
|
|
|
|
if (inst->group % 16 == 0)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
|
|
|
|
|
else if (inst->group % 16 == 8)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
|
|
|
|
|
else
|
|
|
|
|
unreachable("Invalid dual-source FB write instruction group");
|
|
|
|
|
} else {
|
|
|
|
|
assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
|
|
|
|
|
|
|
|
|
|
if (inst->exec_size == 16)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
|
|
|
|
|
else if (inst->exec_size == 8)
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
|
2022-07-22 17:33:12 -07:00
|
|
|
else if (inst->exec_size == 32)
|
|
|
|
|
mctl = XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE;
|
2019-08-25 23:59:25 -07:00
|
|
|
else
|
|
|
|
|
unreachable("Invalid FB write execution size");
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return mctl;
|
|
|
|
|
}
|
|
|
|
|
|
2022-06-27 12:24:58 -07:00
|
|
|
/**
|
|
|
|
|
* Predicate the specified instruction on the sample mask.
|
|
|
|
|
*/
|
|
|
|
|
void
|
|
|
|
|
brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
|
|
|
|
|
{
|
|
|
|
|
assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
|
|
|
|
|
bld.group() == inst->group &&
|
|
|
|
|
bld.dispatch_width() == inst->exec_size);
|
2015-07-13 17:59:34 +03:00
|
|
|
|
2023-12-05 17:16:34 -08:00
|
|
|
const fs_visitor &s = *bld.shader;
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg sample_mask = brw_sample_mask_reg(bld);
|
2023-12-05 17:16:34 -08:00
|
|
|
const unsigned subreg = sample_mask_flag_subreg(s);
|
2015-07-13 17:59:34 +03:00
|
|
|
|
2022-08-16 17:40:31 -07:00
|
|
|
if (s.devinfo->ver >= 20 || brw_wm_prog_data(s.prog_data)->uses_kill) {
|
2022-06-27 12:24:58 -07:00
|
|
|
assert(sample_mask.file == ARF &&
|
|
|
|
|
sample_mask.nr == brw_flag_subreg(subreg).nr &&
|
|
|
|
|
sample_mask.subnr == brw_flag_subreg(
|
|
|
|
|
subreg + inst->group / 16).subnr);
|
|
|
|
|
} else {
|
|
|
|
|
bld.group(1, 0).exec_all()
|
|
|
|
|
.MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
|
2015-07-13 17:59:34 +03:00
|
|
|
}
|
|
|
|
|
|
2022-06-27 12:24:58 -07:00
|
|
|
if (inst->predicate) {
|
|
|
|
|
assert(inst->predicate == BRW_PREDICATE_NORMAL);
|
|
|
|
|
assert(!inst->predicate_inverse);
|
|
|
|
|
assert(inst->flag_subreg == 0);
|
2022-07-22 17:11:52 -07:00
|
|
|
assert(s.devinfo->ver < 20);
|
2022-06-27 12:24:58 -07:00
|
|
|
/* Combine the sample mask with the existing predicate by using a
|
|
|
|
|
* vertical predication mode.
|
2015-11-16 17:23:01 -08:00
|
|
|
*/
|
2022-06-27 12:24:58 -07:00
|
|
|
inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
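/* A sketch of why this works (assuming the usual flag assignment): ALLV
 * enables a channel only when its bit is set both in the original NORMAL
 * predicate's f0 subregister (asserted above) and in the corresponding f1
 * subregister holding the sample mask, so the two conditions are combined
 * without an extra AND.
 */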
|
2015-07-13 17:59:34 +03:00
|
|
|
} else {
|
2022-06-27 12:24:58 -07:00
|
|
|
inst->flag_subreg = subreg;
|
|
|
|
|
inst->predicate = BRW_PREDICATE_NORMAL;
|
|
|
|
|
inst->predicate_inverse = false;
|
2015-07-13 17:59:34 +03:00
|
|
|
}
|
2015-07-27 16:14:36 +03:00
|
|
|
}
|
|
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
brw::register_pressure::register_pressure(const fs_visitor *v)
|
2013-08-04 23:27:14 -07:00
|
|
|
{
|
2016-03-13 16:35:49 -07:00
|
|
|
const fs_live_variables &live = v->live_analysis.require();
|
2016-03-13 16:37:03 -07:00
|
|
|
const unsigned num_instructions = v->cfg->num_blocks ?
|
|
|
|
|
v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;
|
2013-08-04 23:27:14 -07:00
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
regs_live_at_ip = new unsigned[num_instructions]();
|
2013-08-04 23:27:14 -07:00
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
for (unsigned reg = 0; reg < v->alloc.count; reg++) {
|
2016-03-13 16:25:57 -07:00
|
|
|
for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
|
2016-03-13 16:35:49 -07:00
|
|
|
regs_live_at_ip[ip] += v->alloc.sizes[reg];
|
2013-08-04 23:27:14 -07:00
|
|
|
}
|
2023-08-15 01:15:17 -07:00
|
|
|
|
|
|
|
|
const unsigned payload_count = v->first_non_payload_grf;
|
|
|
|
|
|
|
|
|
|
int *payload_last_use_ip = new int[payload_count];
|
intel/brw: Only force g0's liveness to be the whole program if spilling
We don't actually need to extend g0's live range to the EOT message
generally - most messages that end a shader are headerless. The main
implicit use of g0 is for constructing scratch headers. With the last
two patches, we now consider scratch access that may exist in the IR
and already extend the liveness appropriately.
There is one remaining problem: spilling. The register allocator will
create new scratch messages when spilling a register, which need to
create scratch headers, which need g0. So, every new spill or fill
might extend the live range of g0, which would create new interference,
altering the graph. This can be problematic.
However, when compiling SIMD16 or SIMD32 fragment shaders, we don't
allow spilling anyway. So, why not use allow g0? Also, when trying
various scheduling modes, we first try allocation without spilling.
If it works, great, if not, we try a (hopefully) less aggressive
schedule, and only allow spilling on the lowest-pressure schedule.
So, even for regular SIMD8 shaders, we can potentially gain the use
of g0 on the first few tries at scheduling+allocation.
Once we try to allocate with spilling, we go back to reserving g0
for the entire program, so that we can construct scratch headers at
any point. We could possibly do better here, but this is simple and
reliable with some benefit.
Thanks to Ian Romanick for suggesting I try this approach.
fossil-db on Alchemist shows some more spill/fill improvements:
Totals:
Instrs: 149062395 -> 149053010 (-0.01%); split: -0.01%, +0.00%
Cycles: 12609496913 -> 12611652181 (+0.02%); split: -0.45%, +0.47%
Spill count: 52891 -> 52471 (-0.79%)
Fill count: 101599 -> 100818 (-0.77%)
Scratch Memory Size: 3292160 -> 3197952 (-2.86%)
Totals from 416541 (66.59% of 625484) affected shaders:
Instrs: 124058587 -> 124049202 (-0.01%); split: -0.01%, +0.01%
Cycles: 3567164271 -> 3569319539 (+0.06%); split: -1.61%, +1.67%
Spill count: 420 -> 0 (-inf%)
Fill count: 781 -> 0 (-inf%)
Scratch Memory Size: 94208 -> 0 (-inf%)
Witcher 3 shows a 33% reduction in scratch memory size, for example.
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/30319>
2024-07-22 17:22:47 -07:00
|
|
|
v->calculate_payload_ranges(true, payload_count, payload_last_use_ip);
|
2023-08-15 01:15:17 -07:00
|
|
|
|
|
|
|
|
for (unsigned reg = 0; reg < payload_count; reg++) {
|
|
|
|
|
for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
|
|
|
|
|
++regs_live_at_ip[ip];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
delete[] payload_last_use_ip;
|
2013-08-04 23:27:14 -07:00
|
|
|
}
|
|
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
brw::register_pressure::~register_pressure()
|
|
|
|
|
{
|
|
|
|
|
delete[] regs_live_at_ip;
|
|
|
|
|
}
|
|
|
|
|
|
2016-03-12 18:50:24 -08:00
|
|
|
void
|
|
|
|
|
fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
|
|
|
|
|
{
|
2016-03-13 16:25:57 -07:00
|
|
|
live_analysis.invalidate(c);
|
2016-03-13 16:35:49 -07:00
|
|
|
regpressure_analysis.invalidate(c);
|
2024-02-19 22:25:16 -08:00
|
|
|
idom_analysis.invalidate(c);
|
2023-11-16 01:16:45 -08:00
|
|
|
def_analysis.invalidate(c);
|
2016-03-12 18:50:24 -08:00
|
|
|
}
|
|
|
|
|
|
2023-08-06 15:46:12 +03:00
|
|
|
void
|
2023-08-14 16:59:17 -07:00
|
|
|
fs_visitor::debug_optimizer(const nir_shader *nir,
|
|
|
|
|
const char *pass_name,
|
2023-08-06 15:46:12 +03:00
|
|
|
int iteration, int pass_num) const
|
|
|
|
|
{
|
2023-08-14 16:59:17 -07:00
|
|
|
if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
|
2023-08-06 15:46:12 +03:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
char *filename;
|
2023-08-07 17:06:49 +03:00
|
|
|
int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
|
|
|
|
|
debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
|
2023-09-24 21:38:47 -07:00
|
|
|
_mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
|
2023-08-06 15:46:12 +03:00
|
|
|
iteration, pass_num, pass_name);
|
|
|
|
|
if (ret == -1)
|
|
|
|
|
return;
|
2024-12-07 09:53:31 -08:00
|
|
|
|
|
|
|
|
FILE *file = stderr;
|
|
|
|
|
if (__normal_user()) {
|
|
|
|
|
file = fopen(filename, "w");
|
|
|
|
|
if (!file)
|
|
|
|
|
file = stderr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
brw_print_instructions(*this, file);
|
|
|
|
|
|
|
|
|
|
if (file != stderr)
|
|
|
|
|
fclose(file);
|
|
|
|
|
|
2023-08-06 15:46:12 +03:00
|
|
|
free(filename);
|
|
|
|
|
}
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
static uint32_t
|
|
|
|
|
brw_compute_max_register_pressure(fs_visitor &s)
|
2023-02-03 17:02:28 +01:00
|
|
|
{
|
2024-07-12 16:55:33 -07:00
|
|
|
const register_pressure &rp = s.regpressure_analysis.require();
|
2023-02-03 17:02:28 +01:00
|
|
|
uint32_t ip = 0, max_pressure = 0;
|
2024-07-12 16:55:33 -07:00
|
|
|
foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
|
2023-02-03 17:02:28 +01:00
|
|
|
max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
|
|
|
|
|
ip++;
|
|
|
|
|
}
|
|
|
|
|
return max_pressure;
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-23 02:19:06 -07:00
|
|
|
static fs_inst **
|
|
|
|
|
save_instruction_order(const struct cfg_t *cfg)
|
|
|
|
|
{
|
|
|
|
|
/* Before we schedule anything, stash off the instruction order as an array
|
|
|
|
|
* of fs_inst *. This way, we can reset it between scheduling passes to
|
|
|
|
|
* prevent dependencies between the different scheduling modes.
|
|
|
|
|
*/
|
|
|
|
|
int num_insts = cfg->last_block()->end_ip + 1;
|
|
|
|
|
fs_inst **inst_arr = new fs_inst * [num_insts];
|
|
|
|
|
|
|
|
|
|
int ip = 0;
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
|
|
|
|
assert(ip >= block->start_ip && ip <= block->end_ip);
|
|
|
|
|
inst_arr[ip++] = inst;
|
|
|
|
|
}
|
|
|
|
|
assert(ip == num_insts);
|
|
|
|
|
|
|
|
|
|
return inst_arr;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr)
|
|
|
|
|
{
|
2024-01-03 16:31:23 +10:00
|
|
|
ASSERTED int num_insts = cfg->last_block()->end_ip + 1;
|
2023-08-23 02:19:06 -07:00
|
|
|
|
|
|
|
|
int ip = 0;
|
|
|
|
|
foreach_block (block, cfg) {
|
|
|
|
|
block->instructions.make_empty();
|
|
|
|
|
|
|
|
|
|
assert(ip == block->start_ip);
|
|
|
|
|
for (; ip <= block->end_ip; ip++)
|
|
|
|
|
block->instructions.push_tail(inst_arr[ip]);
|
|
|
|
|
}
|
|
|
|
|
assert(ip == num_insts);
|
|
|
|
|
}
|
|
|
|
|
|
2024-02-28 13:59:35 -08:00
|
|
|
/* Per-thread scratch space is a power-of-two multiple of 1KB. */
|
|
|
|
|
static inline unsigned
|
|
|
|
|
brw_get_scratch_size(int size)
|
|
|
|
|
{
|
|
|
|
|
return MAX2(1024, util_next_power_of_two(size));
|
|
|
|
|
}
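/* E.g. a hypothetical per-thread scratch requirement of 1500 bytes rounds
 * up to 2048, while anything at or below 1024 is reported as the 1KB
 * minimum.
 */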
|
|
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
void
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_allocate_registers(fs_visitor &s, bool allow_spilling)
|
2014-11-13 16:28:19 -08:00
|
|
|
{
|
2024-07-12 16:55:33 -07:00
|
|
|
const struct intel_device_info *devinfo = s.devinfo;
|
|
|
|
|
const nir_shader *nir = s.nir;
|
2019-05-09 14:44:16 -05:00
|
|
|
bool allocated;
|
2014-11-13 16:28:19 -08:00
|
|
|
|
2014-12-19 12:55:13 -08:00
|
|
|
static const enum instruction_scheduler_mode pre_modes[] = {
|
2014-11-13 16:28:19 -08:00
|
|
|
SCHEDULE_PRE,
|
|
|
|
|
SCHEDULE_PRE_NON_LIFO,
|
2021-11-09 22:55:49 -06:00
|
|
|
SCHEDULE_NONE,
|
2014-11-13 16:28:19 -08:00
|
|
|
SCHEDULE_PRE_LIFO,
|
|
|
|
|
};
|
|
|
|
|
|
2016-10-17 14:12:28 -07:00
|
|
|
static const char *scheduler_mode_name[] = {
|
2023-08-14 19:35:32 -07:00
|
|
|
[SCHEDULE_PRE] = "top-down",
|
|
|
|
|
[SCHEDULE_PRE_NON_LIFO] = "non-lifo",
|
|
|
|
|
[SCHEDULE_PRE_LIFO] = "lifo",
|
|
|
|
|
[SCHEDULE_POST] = "post",
|
|
|
|
|
[SCHEDULE_NONE] = "none",
|
2016-10-17 14:12:28 -07:00
|
|
|
};
|
|
|
|
|
|
intel/fs: Pick the lowest register pressure schedule when spilling
We try various pre-RA scheduler modes and see if any of them allow
us to register allocate without spilling. If all of them spill,
however, we left it on the last mode: LIFO. This is unfortunately
sometimes significantly worse than other modes (such as "none").
This patch makes us instead select the pre-RA scheduling mode that
gives the lowest register pressure estimate, if none of them manage
to avoid spilling. The hope is that this scheduling will spill the
least out of all of them.
fossil-db stats (on Alchemist) speak for themselves:
Totals:
Instrs: 197297092 -> 195326552 (-1.00%); split: -1.02%, +0.03%
Cycles: 14291286956 -> 14303502596 (+0.09%); split: -0.55%, +0.64%
Spill count: 190886 -> 129204 (-32.31%); split: -33.01%, +0.70%
Fill count: 361408 -> 225038 (-37.73%); split: -39.17%, +1.43%
Scratch Memory Size: 12935168 -> 10868736 (-15.98%); split: -16.08%, +0.10%
Totals from 1791 (0.27% of 668386) affected shaders:
Instrs: 7628929 -> 5658389 (-25.83%); split: -26.50%, +0.67%
Cycles: 719326691 -> 731542331 (+1.70%); split: -10.95%, +12.65%
Spill count: 110627 -> 48945 (-55.76%); split: -56.96%, +1.20%
Fill count: 221560 -> 85190 (-61.55%); split: -63.89%, +2.34%
Scratch Memory Size: 4471808 -> 2405376 (-46.21%); split: -46.51%, +0.30%
Improves performance when using XeSS in Cyberpunk 2077 by 90% on A770.
Improves performance of Borderlands 3 by 1.54% on A770.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24707>
2023-08-14 19:32:25 -07:00
|
|
|
uint32_t best_register_pressure = UINT32_MAX;
|
|
|
|
|
enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_fs_opt_compact_virtual_grfs(s);
|
2023-03-17 09:42:31 +02:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
if (s.needs_register_pressure)
|
|
|
|
|
s.shader_stats.max_register_pressure = brw_compute_max_register_pressure(s);
|
2023-02-03 17:02:28 +01:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
s.debug_optimizer(nir, "pre_register_allocate", 90, 90);
|
2023-08-06 15:46:12 +03:00
|
|
|
|
2021-10-13 11:21:41 +02:00
|
|
|
bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);
|
2016-05-16 14:30:25 -07:00
|
|
|
|
2021-11-09 19:03:19 -06:00
|
|
|
/* Before we schedule anything, stash off the instruction order as an array
|
|
|
|
|
* of fs_inst *. This way, we can reset it between scheduling passes to
|
|
|
|
|
* prevent dependencies between the different scheduling modes.
|
|
|
|
|
*/
|
2024-07-12 16:55:33 -07:00
|
|
|
fs_inst **orig_order = save_instruction_order(s.cfg);
|
2023-08-14 19:32:25 -07:00
|
|
|
fs_inst **best_pressure_order = NULL;
|
2021-11-09 19:03:19 -06:00
|
|
|
|
2023-10-20 10:32:54 -07:00
|
|
|
void *scheduler_ctx = ralloc_context(NULL);
|
2024-07-12 16:55:33 -07:00
|
|
|
instruction_scheduler *sched = brw_prepare_scheduler(s, scheduler_ctx);
|
2023-10-20 10:32:54 -07:00
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
/* Try each scheduling heuristic to see if it can successfully register
|
|
|
|
|
* allocate without spilling. They should be ordered by decreasing
|
|
|
|
|
* performance but increasing likelihood of allocating.
|
|
|
|
|
*/
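/* Given the pre_modes[] table above, this means SCHEDULE_PRE is tried
 * first and SCHEDULE_PRE_LIFO last, with spilling normally disallowed on
 * these attempts; only if every mode fails is the lowest-pressure schedule
 * restored and allocation retried with spilling enabled.
 */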
|
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
|
2023-08-14 19:35:32 -07:00
|
|
|
enum instruction_scheduler_mode sched_mode = pre_modes[i];
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_schedule_instructions_pre_ra(s, sched, sched_mode);
|
|
|
|
|
s.shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];
|
2014-11-13 16:28:19 -08:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
s.debug_optimizer(nir, s.shader_stats.scheduler_mode, 95, i);
|
2023-08-15 01:15:26 -07:00
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
if (0) {
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_assign_regs_trivial(s);
|
2019-05-09 14:44:16 -05:00
|
|
|
allocated = true;
|
|
|
|
|
break;
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
2019-05-09 14:44:16 -05:00
|
|
|
|
|
|
|
|
/* We should only spill registers on the last scheduling. */
|
2024-07-12 16:55:33 -07:00
|
|
|
assert(!s.spilled_any_registers);
|
2019-05-09 14:44:16 -05:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
allocated = brw_assign_regs(s, false, spill_all);
|
2019-05-09 14:44:16 -05:00
|
|
|
if (allocated)
|
2014-11-13 16:28:19 -08:00
|
|
|
break;
|
intel/fs: Pick the lowest register pressure schedule when spilling
We try various pre-RA scheduler modes and see if any of them allow
us to register allocate without spilling. If all of them spill,
however, we left it on the last mode: LIFO. This is unfortunately
sometimes significantly worse than other modes (such as "none").
This patch makes us instead select the pre-RA scheduling mode that
gives the lowest register pressure estimate, if none of them manage
to avoid spilling. The hope is that this scheduling will spill the
least out of all of them.
fossil-db stats (on Alchemist) speak for themselves:
Totals:
Instrs: 197297092 -> 195326552 (-1.00%); split: -1.02%, +0.03%
Cycles: 14291286956 -> 14303502596 (+0.09%); split: -0.55%, +0.64%
Spill count: 190886 -> 129204 (-32.31%); split: -33.01%, +0.70%
Fill count: 361408 -> 225038 (-37.73%); split: -39.17%, +1.43%
Scratch Memory Size: 12935168 -> 10868736 (-15.98%); split: -16.08%, +0.10%
Totals from 1791 (0.27% of 668386) affected shaders:
Instrs: 7628929 -> 5658389 (-25.83%); split: -26.50%, +0.67%
Cycles: 719326691 -> 731542331 (+1.70%); split: -10.95%, +12.65%
Spill count: 110627 -> 48945 (-55.76%); split: -56.96%, +1.20%
Fill count: 221560 -> 85190 (-61.55%); split: -63.89%, +2.34%
Scratch Memory Size: 4471808 -> 2405376 (-46.21%); split: -46.51%, +0.30%
Improves performance when using XeSS in Cyberpunk 2077 by 90% on A770.
Improves performance of Borderlands 3 by 1.54% on A770.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/24707>
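A minimal standalone sketch of the fallback described above. The struct and
function names are hypothetical and only illustrate the selection logic, not
the driver's actual scheduler/allocator interfaces.

#include <limits.h>
#include <stdbool.h>

struct sched_attempt {
   bool allocated;          /* did register allocation succeed for this mode? */
   unsigned max_pressure;   /* estimated peak register pressure for this mode */
};

/* Return the index of the attempt to keep: the first one that allocated, or,
 * if every mode spilled, the one with the lowest estimated pressure. */
static unsigned
pick_schedule_sketch(const struct sched_attempt *tries, unsigned n)
{
   unsigned best = 0, best_pressure = UINT_MAX;
   for (unsigned i = 0; i < n; i++) {
      if (tries[i].allocated)
         return i;
      if (tries[i].max_pressure < best_pressure) {
         best_pressure = tries[i].max_pressure;
         best = i;
      }
   }
   return best;
}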
2023-08-14 19:32:25 -07:00
|
|
|
|
|
|
|
|
/* Save the maximum register pressure */
|
2024-07-12 16:55:33 -07:00
|
|
|
uint32_t this_pressure = brw_compute_max_register_pressure(s);
|
2023-08-14 19:32:25 -07:00
|
|
|
|
|
|
|
|
if (0) {
|
|
|
|
|
fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
|
|
|
|
|
scheduler_mode_name[sched_mode], this_pressure);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (this_pressure < best_register_pressure) {
|
|
|
|
|
best_register_pressure = this_pressure;
|
|
|
|
|
best_sched = sched_mode;
|
|
|
|
|
delete[] best_pressure_order;
|
2024-07-12 16:55:33 -07:00
|
|
|
best_pressure_order = save_instruction_order(s.cfg);
|
2023-08-14 19:32:25 -07:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Reset back to the original order before trying the next mode */
|
2024-07-12 16:55:33 -07:00
|
|
|
restore_instruction_order(s.cfg, orig_order);
|
|
|
|
|
s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
2023-08-14 19:32:25 -07:00
|
|
|
}
|
|
|
|
|
|
2023-10-20 10:32:54 -07:00
|
|
|
ralloc_free(scheduler_ctx);
|
|
|
|
|
|
2023-08-14 19:32:25 -07:00
|
|
|
if (!allocated) {
|
|
|
|
|
if (0) {
|
|
|
|
|
fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
|
|
|
|
|
scheduler_mode_name[best_sched]);
|
|
|
|
|
}
|
2024-07-12 16:55:33 -07:00
|
|
|
restore_instruction_order(s.cfg, best_pressure_order);
|
|
|
|
|
s.shader_stats.scheduler_mode = scheduler_mode_name[best_sched];
|
2023-08-14 19:32:25 -07:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
allocated = brw_assign_regs(s, allow_spilling, spill_all);
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
|
|
|
|
|
2023-08-23 02:19:06 -07:00
|
|
|
delete[] orig_order;
|
2023-08-14 19:32:25 -07:00
|
|
|
delete[] best_pressure_order;
|
2023-08-23 02:19:06 -07:00
|
|
|
|
2019-05-09 14:44:16 -05:00
|
|
|
if (!allocated) {
|
2024-07-12 16:55:33 -07:00
|
|
|
s.fail("Failure to register allocate. Reduce number of "
|
2020-05-19 14:37:44 -07:00
|
|
|
"live scalar values to avoid this.");
|
2024-07-12 16:55:33 -07:00
|
|
|
} else if (s.spilled_any_registers) {
|
|
|
|
|
brw_shader_perf_log(s.compiler, s.log_data,
|
2021-07-29 14:27:57 -07:00
|
|
|
"%s shader triggered register spilling. "
|
|
|
|
|
"Try reducing the number of live scalar "
|
|
|
|
|
"values to improve performance.\n",
|
2024-07-12 16:55:33 -07:00
|
|
|
_mesa_shader_stage_to_string(s.stage));
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
if (s.failed)
|
2014-11-13 16:28:19 -08:00
|
|
|
return;
|
|
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
int pass_num = 0;
|
|
|
|
|
|
|
|
|
|
s.debug_optimizer(nir, "post_ra_alloc", 96, pass_num++);
|
2024-03-14 16:28:56 +02:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_fs_opt_bank_conflicts(s);
|
2017-06-15 15:23:57 -07:00
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
s.debug_optimizer(nir, "bank_conflict", 96, pass_num++);
|
2024-03-14 16:28:56 +02:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_schedule_instructions_post_ra(s);
|
2014-11-13 16:28:19 -08:00
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
s.debug_optimizer(nir, "post_ra_alloc_scheduling", 96, pass_num++);
|
2024-03-14 16:28:56 +02:00
|
|
|
|
2024-04-04 16:03:34 -07:00
|
|
|
/* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
|
|
|
|
|
* of as part of assign_regs since both the bank conflicts optimization and post
|
|
|
|
|
* RA scheduling take advantage of distinguishing references to registers
|
|
|
|
|
* that were allocated from references that were already fixed.
|
|
|
|
|
*
|
|
|
|
|
* TODO: Change the passes above, then move this lowering to be part of
|
|
|
|
|
* assign_regs.
|
|
|
|
|
*/
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_fs_lower_vgrfs_to_fixed_grfs(s);
|
2024-04-04 16:03:34 -07:00
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
s.debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, pass_num++);
|
2016-06-09 18:13:26 -07:00
|
|
|
|
2024-08-27 10:16:11 -07:00
|
|
|
brw_shader_phase_update(s, BRW_SHADER_PHASE_AFTER_REGALLOC);
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
if (s.last_scratch > 0) {
|
2016-06-09 18:13:26 -07:00
|
|
|
/* We currently only support up to 2MB of scratch space. If we
|
|
|
|
|
* need to support more eventually, the documentation suggests
|
|
|
|
|
* that we could allocate a larger buffer, and partition it out
|
|
|
|
|
* ourselves. We'd just have to undo the hardware's address
|
|
|
|
|
* calculation by subtracting (FFTID * Per Thread Scratch Space)
|
|
|
|
|
* and then add FFTID * (Larger Per Thread Scratch Space).
|
|
|
|
|
*
|
|
|
|
|
* See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
|
|
|
|
|
* Thread Group Tracking > Local Memory/Scratch Space.
|
|
|
|
|
*/
|
2024-07-12 16:55:33 -07:00
|
|
|
if (s.last_scratch <= devinfo->max_scratch_size_per_thread) {
|
2024-07-19 10:55:59 -07:00
|
|
|
/* Take the max of any previously compiled variant of the shader. In the
|
|
|
|
|
* case of bindless shaders with return parts, this will also take the
|
|
|
|
|
* max of all parts.
|
|
|
|
|
*/
|
2024-07-12 16:55:33 -07:00
|
|
|
s.prog_data->total_scratch = MAX2(brw_get_scratch_size(s.last_scratch),
|
|
|
|
|
s.prog_data->total_scratch);
|
2024-07-19 10:55:59 -07:00
|
|
|
} else {
|
2024-07-12 16:55:33 -07:00
|
|
|
s.fail("Scratch space required is larger than supported");
|
2024-07-19 10:55:59 -07:00
|
|
|
}
|
2016-06-09 16:56:31 -07:00
|
|
|
}
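/* Illustrative arithmetic for the larger-buffer partitioning scheme described
 * in the scratch-space comment above; the names are hypothetical, not actual
 * driver state:
 *
 *    adjusted_addr = hw_addr
 *                  - FFTID * per_thread_scratch          (undo the HW offset)
 *                  + FFTID * larger_per_thread_scratch   (re-base into the
 *                                                         bigger allocation)
 */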
|
2018-11-09 14:13:37 -08:00
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
if (s.failed)
|
2024-07-19 10:55:59 -07:00
|
|
|
return;
|
|
|
|
|
|
2024-07-12 16:55:33 -07:00
|
|
|
brw_fs_lower_scoreboard(s);
|
2024-11-20 16:18:40 -08:00
|
|
|
|
2024-12-18 11:07:08 -08:00
|
|
|
s.debug_optimizer(nir, "scoreboard", 96, pass_num++);
|
2014-11-13 16:28:19 -08:00
|
|
|
}
|
|
|
|
|
|
i965: Move load_interpolated_input/barycentric_* intrinsics to the top.
Currently, i965 interpolates all FS inputs at the top of the program.
This has advantages and disadvantages, but I'd like to keep that policy
while reworking this code. We can consider changing it independently.
The next patch will make the compiler generate PLN instructions "on the
fly", when it encounters an input load intrinsic, rather than doing it
for all inputs at the start of the program.
To emulate this behavior, we introduce an ugly pass to move all NIR
load_interpolated_input and payload-based (not interpolator message)
load_barycentric_* intrinsics to the shader's start block.
This helps avoid regressions in shader-db for cases such as:
if (...) {
...load some input...
} else {
...load that same input...
}
which CSE can't handle, because there's no dominance relationship
between the two loads. Because the start block dominates all others,
we can CSE all inputs and emit PLNs exactly once, as we did before.
Ideally, global value numbering would eliminate these redundant loads,
while not forcing them all the way to the start block. When that lands,
we should consider dropping this hacky pass.
Again, this pass currently does nothing, as i965 doesn't generate these
intrinsics yet. But it will shortly, and I figured I'd separate this
code as it's relatively self-contained.
v2: Dramatically simplify pass - instead of creating new instructions,
just remove/re-insert their list nodes (suggested by Jason Ekstrand).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisforbes@google.com> [v1]
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-17 18:44:58 -07:00
|
|
|
/**
|
|
|
|
|
* Move load_interpolated_input with simple (payload-based) barycentric modes
|
|
|
|
|
* to the top of the program so we don't emit multiple PLNs for the same input.
|
|
|
|
|
*
|
|
|
|
|
* This works around CSE not being able to handle non-dominating cases
|
|
|
|
|
* such as:
|
|
|
|
|
*
|
|
|
|
|
* if (...) {
|
|
|
|
|
* interpolate input
|
|
|
|
|
* } else {
|
|
|
|
|
* interpolate the same exact input
|
|
|
|
|
* }
|
|
|
|
|
*
|
|
|
|
|
* This should be replaced by global value numbering someday.
|
|
|
|
|
*/
|
2019-07-18 09:23:23 -05:00
|
|
|
bool
|
|
|
|
|
brw_nir_move_interpolation_to_top(nir_shader *nir)
|
2016-07-17 18:44:58 -07:00
|
|
|
{
|
2017-03-09 11:05:08 -08:00
|
|
|
bool progress = false;
|
|
|
|
|
|
2023-06-28 19:40:56 +08:00
|
|
|
nir_foreach_function_impl(impl, nir) {
|
|
|
|
|
nir_block *top = nir_start_block(impl);
|
2023-03-20 20:57:47 -07:00
|
|
|
nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
|
2023-03-20 20:57:47 -07:00
|
|
|
bool impl_progress = false;
|
2016-07-17 18:44:58 -07:00
|
|
|
|
2023-03-20 20:57:47 -07:00
|
|
|
for (nir_block *block = nir_block_cf_tree_next(top);
|
|
|
|
|
block != NULL;
|
|
|
|
|
block = nir_block_cf_tree_next(block)) {
|
2016-07-17 18:44:58 -07:00
|
|
|
|
2016-07-26 13:19:46 -07:00
|
|
|
nir_foreach_instr_safe(instr, block) {
|
2016-07-17 18:44:58 -07:00
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
2016-07-26 13:19:46 -07:00
|
|
|
if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
|
2016-07-17 18:44:58 -07:00
|
|
|
continue;
|
2016-07-26 13:19:46 -07:00
|
|
|
nir_intrinsic_instr *bary_intrinsic =
|
|
|
|
|
nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
|
|
|
|
|
nir_intrinsic_op op = bary_intrinsic->intrinsic;
|
2016-07-17 18:44:58 -07:00
|
|
|
|
2016-07-26 13:19:46 -07:00
|
|
|
/* Leave interpolateAtSample/Offset() where they are. */
|
|
|
|
|
if (op == nir_intrinsic_load_barycentric_at_sample ||
|
|
|
|
|
op == nir_intrinsic_load_barycentric_at_offset)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
nir_instr *move[3] = {
|
|
|
|
|
&bary_intrinsic->instr,
|
|
|
|
|
intrin->src[1].ssa->parent_instr,
|
|
|
|
|
instr
|
|
|
|
|
};
|
|
|
|
|
|
2016-08-01 10:35:06 +10:00
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
|
2016-07-26 13:19:46 -07:00
|
|
|
if (move[i]->block != top) {
|
2023-03-20 20:57:47 -07:00
|
|
|
nir_instr_move(cursor, move[i]);
|
2023-03-20 20:57:47 -07:00
|
|
|
impl_progress = true;
|
2016-07-26 13:19:46 -07:00
|
|
|
}
|
|
|
|
|
}
|
2016-07-17 18:44:58 -07:00
|
|
|
}
|
|
|
|
|
}
|
2023-03-20 20:57:47 -07:00
|
|
|
|
|
|
|
|
progress = progress || impl_progress;
|
|
|
|
|
|
2024-06-16 16:32:01 -04:00
|
|
|
nir_metadata_preserve(impl, impl_progress ? nir_metadata_control_flow
|
|
|
|
|
: nir_metadata_all);
|
2016-07-17 18:44:58 -07:00
|
|
|
}
|
2017-03-09 11:05:08 -08:00
|
|
|
|
|
|
|
|
return progress;
|
2016-07-17 18:44:58 -07:00
|
|
|
}
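/* Usage sketch: like other NIR passes, this would typically be run through
 * the standard NIR_PASS wrapper somewhere in the brw NIR pipeline, e.g.
 *
 *    bool progress = false;
 *    NIR_PASS(progress, nir, brw_nir_move_interpolation_to_top);
 *
 * The actual call site is not shown here.
 */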
|
|
|
|
|
|
2020-03-20 21:02:06 -07:00
|
|
|
unsigned
|
|
|
|
|
brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
|
|
|
|
|
unsigned threads)
|
|
|
|
|
{
|
|
|
|
|
assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
|
|
|
|
|
assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
|
|
|
|
|
return cs_prog_data->push.per_thread.size * threads +
|
|
|
|
|
cs_prog_data->push.cross_thread.size;
|
|
|
|
|
}
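/* A small worked example of the helper above, assuming the usual 32-byte GRF
 * size; the numbers are purely illustrative. */
static unsigned
push_const_example_sketch(void)
{
   struct brw_cs_prog_data d = {};
   d.push.per_thread.size = 2 * 32;    /* two GRFs of per-thread data   */
   d.push.cross_thread.size = 1 * 32;  /* one GRF shared by all threads */

   /* 2 * 32 bytes for each of 8 threads, plus the shared 32 bytes: 544. */
   return brw_cs_push_const_total_size(&d, 8);
}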
|
|
|
|
|
|
2020-04-28 13:09:27 -07:00
|
|
|
static bool
|
2020-07-29 17:50:03 -07:00
|
|
|
filter_simd(const nir_instr *instr, const void * /* options */)
|
2020-04-28 13:09:27 -07:00
|
|
|
{
|
|
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
|
|
|
|
|
case nir_intrinsic_load_simd_width_intel:
|
|
|
|
|
case nir_intrinsic_load_subgroup_id:
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2023-08-12 16:17:15 -04:00
|
|
|
static nir_def *
|
2020-04-28 13:09:27 -07:00
|
|
|
lower_simd(nir_builder *b, nir_instr *instr, void *options)
|
|
|
|
|
{
|
|
|
|
|
uintptr_t simd_width = (uintptr_t)options;
|
|
|
|
|
|
|
|
|
|
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
|
|
|
|
|
case nir_intrinsic_load_simd_width_intel:
|
|
|
|
|
return nir_imm_int(b, simd_width);
|
|
|
|
|
|
|
|
|
|
case nir_intrinsic_load_subgroup_id:
|
|
|
|
|
/* If the whole workgroup fits in one thread, we can lower subgroup_id
|
|
|
|
|
* to a constant zero.
|
|
|
|
|
*/
|
2021-05-05 12:24:44 -07:00
|
|
|
if (!b->shader->info.workgroup_size_variable) {
|
|
|
|
|
unsigned local_workgroup_size = b->shader->info.workgroup_size[0] *
|
|
|
|
|
b->shader->info.workgroup_size[1] *
|
|
|
|
|
b->shader->info.workgroup_size[2];
|
2020-04-28 13:09:27 -07:00
|
|
|
if (local_workgroup_size <= simd_width)
|
|
|
|
|
return nir_imm_int(b, 0);
|
|
|
|
|
}
|
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
}
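/* Example of the subgroup_id lowering above: a fixed 8x4x1 workgroup has 32
 * invocations, so when lowering for a SIMD32 dispatch the whole group fits in
 * a single thread and load_subgroup_id folds to the constant 0; for a SIMD16
 * dispatch it is left alone, since two subgroups are required. */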
|
|
|
|
|
|
2022-07-18 18:35:34 +02:00
|
|
|
bool
|
2020-04-28 13:09:27 -07:00
|
|
|
brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
|
|
|
|
|
{
|
2022-07-18 18:35:34 +02:00
|
|
|
return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
|
2020-04-28 13:09:27 -07:00
|
|
|
(void *)(uintptr_t)dispatch_width);
|
|
|
|
|
}
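/* Usage sketch for the wrapper above (dispatch width chosen for illustration):
 *
 *    brw_nir_lower_simd(nir, 16);
 *
 * The width is passed through nir_shader_lower_instructions' void *options
 * pointer as a plain integer, hence the uintptr_t casts on both sides. */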
|
|
|
|
|
|
2024-02-01 16:02:50 -08:00
|
|
|
struct intel_cs_dispatch_info
|
2021-04-28 10:54:53 -07:00
|
|
|
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
|
|
|
|
|
const struct brw_cs_prog_data *prog_data,
|
|
|
|
|
const unsigned *override_local_size)
|
|
|
|
|
{
|
2024-02-01 16:02:50 -08:00
|
|
|
struct intel_cs_dispatch_info info = {};
|
2021-04-28 10:54:53 -07:00
|
|
|
|
|
|
|
|
const unsigned *sizes =
|
|
|
|
|
override_local_size ? override_local_size :
|
|
|
|
|
prog_data->local_size;
|
|
|
|
|
|
2022-11-08 01:24:36 -08:00
|
|
|
const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
|
2021-10-11 07:49:40 -07:00
|
|
|
assert(simd >= 0 && simd < 3);
|
|
|
|
|
|
2021-04-28 10:54:53 -07:00
|
|
|
info.group_size = sizes[0] * sizes[1] * sizes[2];
|
2021-10-11 07:49:40 -07:00
|
|
|
info.simd_size = 8u << simd;
|
2021-04-28 10:54:53 -07:00
|
|
|
info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
|
|
|
|
|
|
|
|
|
|
const uint32_t remainder = info.group_size & (info.simd_size - 1);
|
|
|
|
|
if (remainder > 0)
|
|
|
|
|
info.right_mask = ~0u >> (32 - remainder);
|
|
|
|
|
else
|
|
|
|
|
info.right_mask = ~0u >> (32 - info.simd_size);
|
|
|
|
|
|
|
|
|
|
return info;
|
|
|
|
|
}
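/* Worked example of the mask math above, for a hypothetical 7x5x1 local size
 * with SIMD16 selected: group_size = 35, threads = DIV_ROUND_UP(35, 16) = 3,
 * remainder = 35 & 15 = 3, so right_mask = ~0u >> (32 - 3) = 0x7 -- only the
 * three live channels of the final, partially filled thread are enabled. */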
|
|
|
|
|
|
2024-08-27 10:16:11 -07:00
|
|
|
void
|
|
|
|
|
brw_shader_phase_update(fs_visitor &s, enum brw_shader_phase phase)
|
|
|
|
|
{
|
|
|
|
|
assert(phase == s.phase + 1);
|
|
|
|
|
s.phase = phase;
|
|
|
|
|
brw_fs_validate(s);
|
|
|
|
|
}
|
|
|
|
|
|
2023-06-20 14:42:02 -07:00
|
|
|
bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
|
|
|
|
|
{
|
|
|
|
|
return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
|
2023-06-21 07:51:00 -07:00
|
|
|
}
|
2023-11-21 07:49:02 -08:00
|
|
|
|
|
|
|
|
namespace brw {
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg
|
2023-11-21 07:49:02 -08:00
|
|
|
fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
|
2022-08-03 16:47:52 -07:00
|
|
|
brw_reg_type type, unsigned n)
|
2023-11-21 07:49:02 -08:00
|
|
|
{
|
|
|
|
|
if (!regs[0])
|
2024-06-18 23:42:59 -07:00
|
|
|
return brw_reg();
|
2023-11-21 07:49:02 -08:00
|
|
|
|
|
|
|
|
if (bld.dispatch_width() > 16) {
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg tmp = bld.vgrf(type, n);
|
2023-11-21 07:49:02 -08:00
|
|
|
const brw::fs_builder hbld = bld.exec_all().group(16, 0);
|
|
|
|
|
const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg *const components = new brw_reg[m * n];
|
2023-11-21 07:49:02 -08:00
|
|
|
|
2022-08-03 16:47:52 -07:00
|
|
|
for (unsigned c = 0; c < n; c++) {
|
|
|
|
|
for (unsigned g = 0; g < m; g++)
|
|
|
|
|
components[c * m + g] =
|
|
|
|
|
offset(retype(brw_vec8_grf(regs[g], 0), type), hbld, c);
|
|
|
|
|
}
|
2023-11-21 07:49:02 -08:00
|
|
|
|
2022-08-03 16:47:52 -07:00
|
|
|
hbld.LOAD_PAYLOAD(tmp, components, m * n, 0);
|
2023-11-21 07:49:02 -08:00
|
|
|
|
2022-08-03 16:47:52 -07:00
|
|
|
delete[] components;
|
2023-11-21 07:49:02 -08:00
|
|
|
return tmp;
|
|
|
|
|
|
|
|
|
|
} else {
|
2024-06-18 23:42:59 -07:00
|
|
|
return brw_reg(retype(brw_vec8_grf(regs[0], 0), type));
|
2023-11-21 07:49:02 -08:00
|
|
|
}
|
|
|
|
|
}
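/* Layout sketch for the wide-dispatch path above: with SIMD32 (m == 2) and
 * n == 2 components, the LOAD_PAYLOAD sources are ordered
 *
 *    comp0/half0, comp0/half1, comp1/half0, comp1/half1
 *
 * i.e. both 16-channel halves of component 0 (read from regs[0] and regs[1])
 * followed by both halves of component 1, stitched into one VGRF. */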
|
|
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg
|
2023-11-21 07:49:02 -08:00
|
|
|
fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
|
|
|
|
|
{
|
|
|
|
|
if (!regs[0])
|
2024-06-18 23:42:59 -07:00
|
|
|
return brw_reg();
|
2023-12-01 21:51:19 -08:00
|
|
|
else if (bld.shader->devinfo->ver >= 20)
|
2024-04-20 17:08:02 -07:00
|
|
|
return fetch_payload_reg(bld, regs, BRW_TYPE_F, 2);
|
2023-11-21 07:49:02 -08:00
|
|
|
|
2024-06-18 23:42:59 -07:00
|
|
|
const brw_reg tmp = bld.vgrf(BRW_TYPE_F, 2);
|
2023-11-21 07:49:02 -08:00
|
|
|
const brw::fs_builder hbld = bld.exec_all().group(8, 0);
|
|
|
|
|
const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
|
2024-06-18 23:42:59 -07:00
|
|
|
brw_reg *const components = new brw_reg[2 * m];
|
2023-11-21 07:49:02 -08:00
|
|
|
|
|
|
|
|
for (unsigned c = 0; c < 2; c++) {
|
|
|
|
|
for (unsigned g = 0; g < m; g++)
|
|
|
|
|
components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
|
|
|
|
|
hbld, c + 2 * (g % 2));
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
|
|
|
|
|
|
|
|
|
|
delete[] components;
|
|
|
|
|
return tmp;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
check_dynamic_msaa_flag(const fs_builder &bld,
|
|
|
|
|
const struct brw_wm_prog_data *wm_prog_data,
|
2024-02-01 13:17:42 -08:00
|
|
|
enum intel_msaa_flags flag)
|
2023-11-21 07:49:02 -08:00
|
|
|
{
|
|
|
|
|
fs_inst *inst = bld.AND(bld.null_reg_ud(),
|
|
|
|
|
dynamic_msaa_flags(wm_prog_data),
|
|
|
|
|
brw_imm_ud(flag));
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NZ;
|
|
|
|
|
}
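/* Usage sketch (illustrative only): the AND.nz above leaves the flag register
 * set when the requested bit is present in the dynamic MSAA flags, so callers
 * typically follow it with a predicated IF, e.g.
 *
 *    check_dynamic_msaa_flag(bld, wm_prog_data,
 *                            INTEL_MSAA_FLAG_MULTISAMPLE_FBO);
 *    bld.IF(BRW_PREDICATE_NORMAL);
 *    ...
 *    bld.emit(BRW_OPCODE_ENDIF);
 *
 * The flag name here is only an example of an intel_msaa_flags bit. */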
|
|
|
|
|
}
|