2010-08-10 20:39:06 -07:00
|
|
|
|
/*
|
|
|
|
|
|
* Copyright © 2010 Intel Corporation
|
|
|
|
|
|
*
|
|
|
|
|
|
* Permission is hereby granted, free of charge, to any person obtaining a
|
|
|
|
|
|
* copy of this software and associated documentation files (the "Software"),
|
|
|
|
|
|
* to deal in the Software without restriction, including without limitation
|
|
|
|
|
|
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
|
|
|
|
|
* and/or sell copies of the Software, and to permit persons to whom the
|
|
|
|
|
|
* Software is furnished to do so, subject to the following conditions:
|
|
|
|
|
|
*
|
|
|
|
|
|
* The above copyright notice and this permission notice (including the next
|
|
|
|
|
|
* paragraph) shall be included in all copies or substantial portions of the
|
|
|
|
|
|
* Software.
|
|
|
|
|
|
*
|
|
|
|
|
|
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
|
|
|
|
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
|
|
|
|
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
|
|
|
|
|
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
|
|
|
|
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
|
|
|
|
|
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
|
|
|
|
|
* IN THE SOFTWARE.
|
2011-05-24 16:45:17 -07:00
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
/** @file brw_fs.cpp
|
2010-08-10 20:39:06 -07:00
|
|
|
|
*
|
2011-05-24 16:45:17 -07:00
|
|
|
|
* This file drives the GLSL IR -> LIR translation, contains the
|
|
|
|
|
|
* optimizations on the LIR, and drives the generation of native code
|
|
|
|
|
|
* from the LIR.
|
2010-08-10 20:39:06 -07:00
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
#include "brw_eu.h"
|
2010-10-10 15:42:37 -07:00
|
|
|
|
#include "brw_fs.h"
|
2023-11-21 09:58:55 -08:00
|
|
|
|
#include "brw_fs_builder.h"
|
2020-01-23 22:55:33 -08:00
|
|
|
|
#include "brw_fs_live_variables.h"
|
2015-11-11 10:04:43 -08:00
|
|
|
|
#include "brw_nir.h"
|
2015-03-11 23:14:31 -07:00
|
|
|
|
#include "brw_vec4_gs_visitor.h"
|
2014-07-12 21:18:39 -07:00
|
|
|
|
#include "brw_cfg.h"
|
2013-10-30 10:32:12 -07:00
|
|
|
|
#include "brw_dead_control_flow.h"
|
2021-10-07 00:23:07 -07:00
|
|
|
|
#include "brw_private.h"
|
2023-09-24 21:38:47 -07:00
|
|
|
|
#include "shader_enums.h"
|
2021-04-05 10:44:41 -07:00
|
|
|
|
#include "dev/intel_debug.h"
|
2023-01-20 23:19:34 -08:00
|
|
|
|
#include "dev/intel_wa.h"
|
2016-01-18 11:35:29 +02:00
|
|
|
|
#include "compiler/glsl_types.h"
|
2016-07-17 18:37:08 -07:00
|
|
|
|
#include "compiler/nir/nir_builder.h"
|
2018-08-21 09:46:46 -07:00
|
|
|
|
#include "util/u_math.h"
|
2010-08-10 20:39:06 -07:00
|
|
|
|
|
2022-11-08 14:14:37 -08:00
|
|
|
|
#include <memory>
|
|
|
|
|
|
|
2015-06-03 20:36:47 +03:00
|
|
|
|
using namespace brw;
|
|
|
|
|
|
|
2022-06-29 14:13:31 -07:00
|
|
|
|
static unsigned get_lowered_simd_width(const struct brw_compiler *compiler,
|
2016-06-28 14:48:22 -07:00
|
|
|
|
const fs_inst *inst);
|
|
|
|
|
|
|
2012-07-04 13:12:50 -07:00
|
|
|
|
void
|
2014-08-14 13:56:24 -07:00
|
|
|
|
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
|
2015-02-06 01:14:51 +02:00
|
|
|
|
const fs_reg *src, unsigned sources)
|
2012-07-04 13:12:50 -07:00
|
|
|
|
{
|
2018-07-16 13:19:30 -07:00
|
|
|
|
memset((void*)this, 0, sizeof(*this));
|
2014-02-19 21:18:44 -08:00
|
|
|
|
|
2015-02-06 01:14:51 +02:00
|
|
|
|
this->src = new fs_reg[MAX2(sources, 3)];
|
|
|
|
|
|
for (unsigned i = 0; i < sources; i++)
|
|
|
|
|
|
this->src[i] = src[i];
|
|
|
|
|
|
|
2014-05-27 10:25:05 -07:00
|
|
|
|
this->opcode = opcode;
|
|
|
|
|
|
this->dst = dst;
|
2014-02-20 08:18:22 -08:00
|
|
|
|
this->sources = sources;
|
2014-08-14 13:56:24 -07:00
|
|
|
|
this->exec_size = exec_size;
|
i965: Set fs_inst::base_mrf = -1 by default.
On MRF platforms, we need to set base_mrf to the first MRF value we'd
like to use for the message. On send-from-GRF platforms, we set it to
-1 to indicate that the operation doesn't use MRFs.
As MRF platforms are becoming increasingly a thing of the past, we've
forgotten to bother with this. It makes more sense to set it to -1 by
default, so we don't have to think about it for new code.
I searched the code for every instance of 'mlen =' in brw_fs*cpp, and
it appears that all MRF-based messages correctly program a base_mrf.
Forgetting to set base_mrf = -1 can confuse the register allocator,
causing it to think we have a large fake-MRF region. This ends up
moving the send-with-EOT registers earlier, sometimes even out of
the g112-g127 range, which is illegal. For example, this fixes
illegal sends in Piglit's arb_gpu_shader_fp64-layout-std430-fp64-shader,
which had SSBO messages with mlen > 0 but base_mrf == 0.
Cc: mesa-stable@lists.freedesktop.org
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2016-06-22 17:01:12 -07:00
|
|
|
|
this->base_mrf = -1;
|
2014-08-14 13:56:24 -07:00
|
|
|
|
|
|
|
|
|
|
assert(dst.file != IMM && dst.file != UNIFORM);
|
|
|
|
|
|
|
|
|
|
|
|
assert(this->exec_size != 0);
|
2014-02-19 21:18:44 -08:00
|
|
|
|
|
2012-07-04 13:12:50 -07:00
|
|
|
|
this->conditional_mod = BRW_CONDITIONAL_NONE;
|
|
|
|
|
|
|
2013-03-18 11:30:57 -07:00
|
|
|
|
/* This will be the case for almost all instructions. */
|
2014-08-18 14:27:55 -07:00
|
|
|
|
switch (dst.file) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
case VGRF:
|
2015-10-26 17:52:57 -07:00
|
|
|
|
case ARF:
|
|
|
|
|
|
case FIXED_GRF:
|
2014-08-18 14:27:55 -07:00
|
|
|
|
case MRF:
|
2014-10-20 23:16:48 -07:00
|
|
|
|
case ATTR:
|
2016-09-07 13:38:20 -07:00
|
|
|
|
this->size_written = dst.component_size(exec_size);
|
2014-08-18 14:27:55 -07:00
|
|
|
|
break;
|
|
|
|
|
|
case BAD_FILE:
|
2016-09-07 13:38:20 -07:00
|
|
|
|
this->size_written = 0;
|
2014-08-18 14:27:55 -07:00
|
|
|
|
break;
|
|
|
|
|
|
case IMM:
|
|
|
|
|
|
case UNIFORM:
|
|
|
|
|
|
unreachable("Invalid destination register file");
|
|
|
|
|
|
}
|
2014-04-04 16:51:59 +03:00
|
|
|
|
|
|
|
|
|
|
this->writes_accumulator = false;
|
2012-07-04 13:12:50 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-08-14 13:56:24 -07:00
|
|
|
|
fs_inst::fs_inst()
|
|
|
|
|
|
{
|
2015-02-06 01:14:51 +02:00
|
|
|
|
init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
|
2014-08-14 13:56:24 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
|
|
|
|
|
|
{
|
2015-02-06 01:14:51 +02:00
|
|
|
|
init(opcode, exec_size, reg_undef, NULL, 0);
|
2014-08-14 13:56:24 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2015-06-18 12:30:43 -07:00
|
|
|
|
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
|
2012-07-04 13:12:50 -07:00
|
|
|
|
{
|
2015-06-18 12:30:43 -07:00
|
|
|
|
init(opcode, exec_size, dst, NULL, 0);
|
2014-08-14 13:56:24 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
|
|
|
|
|
|
const fs_reg &src0)
|
|
|
|
|
|
{
|
2015-02-06 01:14:51 +02:00
|
|
|
|
const fs_reg src[1] = { src0 };
|
2014-08-14 13:56:24 -07:00
|
|
|
|
init(opcode, exec_size, dst, src, 1);
|
2012-07-04 13:12:50 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-08-14 13:56:24 -07:00
|
|
|
|
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
|
|
|
|
|
|
const fs_reg &src0, const fs_reg &src1)
|
|
|
|
|
|
{
|
2015-02-06 01:14:51 +02:00
|
|
|
|
const fs_reg src[2] = { src0, src1 };
|
2014-08-14 13:56:24 -07:00
|
|
|
|
init(opcode, exec_size, dst, src, 2);
|
2012-07-04 13:12:50 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-08-14 13:56:24 -07:00
|
|
|
|
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
|
|
|
|
|
|
const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
|
|
|
|
|
|
{
|
2015-02-06 01:14:51 +02:00
|
|
|
|
const fs_reg src[3] = { src0, src1, src2 };
|
2014-08-14 13:56:24 -07:00
|
|
|
|
init(opcode, exec_size, dst, src, 3);
|
2012-07-04 13:12:50 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-08-14 13:56:24 -07:00
|
|
|
|
fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
|
2015-02-06 01:14:51 +02:00
|
|
|
|
const fs_reg src[], unsigned sources)
|
2014-08-14 13:56:24 -07:00
|
|
|
|
{
|
|
|
|
|
|
init(opcode, exec_width, dst, src, sources);
|
2014-05-26 18:44:17 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-02-20 09:40:02 -08:00
|
|
|
|
fs_inst::fs_inst(const fs_inst &that)
|
|
|
|
|
|
{
|
2018-07-16 13:19:30 -07:00
|
|
|
|
memcpy((void*)this, &that, sizeof(that));
|
2014-02-19 21:18:44 -08:00
|
|
|
|
|
2015-02-06 01:14:51 +02:00
|
|
|
|
this->src = new fs_reg[MAX2(that.sources, 3)];
|
2014-02-19 21:18:44 -08:00
|
|
|
|
|
2015-02-06 01:14:51 +02:00
|
|
|
|
for (unsigned i = 0; i < that.sources; i++)
|
2014-02-19 21:18:44 -08:00
|
|
|
|
this->src[i] = that.src[i];
|
2014-02-20 09:40:02 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2015-02-06 01:14:51 +02:00
|
|
|
|
fs_inst::~fs_inst()
|
|
|
|
|
|
{
|
|
|
|
|
|
delete[] this->src;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-02-20 13:14:05 -08:00
|
|
|
|
void
|
|
|
|
|
|
fs_inst::resize_sources(uint8_t num_sources)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (this->sources != num_sources) {
|
2015-02-06 01:14:51 +02:00
|
|
|
|
fs_reg *src = new fs_reg[MAX2(num_sources, 3)];
|
|
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
|
|
|
|
|
|
src[i] = this->src[i];
|
|
|
|
|
|
|
|
|
|
|
|
delete[] this->src;
|
|
|
|
|
|
this->src = src;
|
2014-02-20 13:14:05 -08:00
|
|
|
|
this->sources = num_sources;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-06-03 22:22:39 +03:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
|
|
|
|
|
|
const fs_reg &dst,
|
2023-01-13 12:29:30 +02:00
|
|
|
|
const fs_reg &surface,
|
|
|
|
|
|
const fs_reg &surface_handle,
|
2014-02-19 20:31:14 -08:00
|
|
|
|
const fs_reg &varying_offset,
|
2020-02-21 10:59:38 -06:00
|
|
|
|
uint32_t const_offset,
|
|
|
|
|
|
uint8_t alignment)
|
2012-11-08 16:06:24 -08:00
|
|
|
|
{
|
2013-03-18 10:16:42 -07:00
|
|
|
|
/* We have our constant surface use a pitch of 4 bytes, so our index can
|
|
|
|
|
|
* be any component of a vector, and then we load 4 contiguous
|
|
|
|
|
|
* components starting from that.
|
|
|
|
|
|
*
|
2016-09-02 13:53:13 -07:00
|
|
|
|
* We break down the const_offset to a portion added to the variable offset
|
|
|
|
|
|
* and a portion done using fs_reg::offset, which means that if you have
|
|
|
|
|
|
* GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]",
|
|
|
|
|
|
* we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can
|
|
|
|
|
|
* later notice that those loads are all the same and eliminate the
|
|
|
|
|
|
* redundant ones.
|
2013-03-18 10:16:42 -07:00
|
|
|
|
*/
|
2015-11-25 09:59:03 -08:00
|
|
|
|
fs_reg vec4_offset = vgrf(glsl_type::uint_type);
|
2015-10-31 16:52:29 -07:00
|
|
|
|
bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
|
2013-03-18 10:16:42 -07:00
|
|
|
|
|
2016-01-14 08:55:28 +01:00
|
|
|
|
/* The pull load message will load a vec4 (16 bytes). If we are loading
|
|
|
|
|
|
* a double this means we are only loading 2 elements worth of data.
|
|
|
|
|
|
* We also want to use a 32-bit data type for the dst of the load operation
|
|
|
|
|
|
* so other parts of the driver don't get confused about the size of the
|
|
|
|
|
|
* result.
|
|
|
|
|
|
*/
|
2016-05-18 01:26:03 -07:00
|
|
|
|
fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
|
2023-01-13 12:29:30 +02:00
|
|
|
|
|
|
|
|
|
|
fs_reg srcs[PULL_VARYING_CONSTANT_SRCS];
|
|
|
|
|
|
srcs[PULL_VARYING_CONSTANT_SRC_SURFACE] = surface;
|
|
|
|
|
|
srcs[PULL_VARYING_CONSTANT_SRC_SURFACE_HANDLE] = surface_handle;
|
|
|
|
|
|
srcs[PULL_VARYING_CONSTANT_SRC_OFFSET] = vec4_offset;
|
|
|
|
|
|
srcs[PULL_VARYING_CONSTANT_SRC_ALIGNMENT] = brw_imm_ud(alignment);
|
|
|
|
|
|
|
2016-05-17 23:18:38 -07:00
|
|
|
|
fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
|
2023-01-13 12:29:30 +02:00
|
|
|
|
vec4_result, srcs, PULL_VARYING_CONSTANT_SRCS);
|
2016-09-01 18:43:48 -07:00
|
|
|
|
inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
|
2012-11-08 16:06:24 -08:00
|
|
|
|
|
2018-06-09 11:45:42 +02:00
|
|
|
|
shuffle_from_32bit_read(bld, dst, vec4_result,
|
|
|
|
|
|
(const_offset & 0xf) / type_sz(dst.type), 1);
|
2012-11-08 16:06:24 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2013-02-05 15:46:22 -08:00
|
|
|
|
/**
|
|
|
|
|
|
* A helper for MOV generation for fixing up broken hardware SEND dependency
|
|
|
|
|
|
* handling.
|
|
|
|
|
|
*/
|
2015-06-03 22:22:10 +03:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
|
2013-02-05 15:46:22 -08:00
|
|
|
|
{
|
|
|
|
|
|
/* The caller always wants uncompressed to emit the minimal extra
|
|
|
|
|
|
* dependencies, and to avoid having to deal with aligning its regs to 2.
|
|
|
|
|
|
*/
|
2015-06-03 22:22:10 +03:00
|
|
|
|
const fs_builder ubld = bld.annotate("send dependency resolve")
|
2020-04-03 13:04:43 -07:00
|
|
|
|
.quarter(0);
|
2013-02-05 15:46:22 -08:00
|
|
|
|
|
2015-10-26 17:09:25 -07:00
|
|
|
|
ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2012-11-09 11:48:20 -08:00
|
|
|
|
bool
|
2014-03-27 09:40:30 -07:00
|
|
|
|
fs_inst::is_send_from_grf() const
|
2012-11-09 11:48:20 -08:00
|
|
|
|
{
|
2014-09-13 11:49:55 -07:00
|
|
|
|
switch (opcode) {
|
2018-10-29 15:06:14 -05:00
|
|
|
|
case SHADER_OPCODE_SEND:
|
2014-09-13 11:49:55 -07:00
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
|
2019-04-26 17:11:42 -07:00
|
|
|
|
case SHADER_OPCODE_INTERLOCK:
|
|
|
|
|
|
case SHADER_OPCODE_MEMORY_FENCE:
|
|
|
|
|
|
case SHADER_OPCODE_BARRIER:
|
2014-09-13 11:49:55 -07:00
|
|
|
|
return true;
|
|
|
|
|
|
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
|
2015-10-26 17:09:25 -07:00
|
|
|
|
return src[1].file == VGRF;
|
2014-09-12 16:17:37 -07:00
|
|
|
|
case FS_OPCODE_FB_WRITE:
|
2016-07-21 16:52:33 -07:00
|
|
|
|
case FS_OPCODE_FB_READ:
|
2015-10-26 17:09:25 -07:00
|
|
|
|
return src[0].file == VGRF;
|
2014-09-13 11:49:55 -07:00
|
|
|
|
default:
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
2012-11-09 11:48:20 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2019-01-16 18:30:08 -08:00
|
|
|
|
bool
|
|
|
|
|
|
fs_inst::is_control_source(unsigned arg) const
|
|
|
|
|
|
{
|
|
|
|
|
|
switch (opcode) {
|
|
|
|
|
|
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
|
2021-03-29 16:02:30 -07:00
|
|
|
|
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
|
2019-01-16 18:30:08 -08:00
|
|
|
|
return arg == 0;
|
|
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_BROADCAST:
|
|
|
|
|
|
case SHADER_OPCODE_SHUFFLE:
|
|
|
|
|
|
case SHADER_OPCODE_QUAD_SWIZZLE:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
|
|
|
|
|
|
return arg == 1;
|
|
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_MOV_INDIRECT:
|
|
|
|
|
|
case SHADER_OPCODE_CLUSTER_BROADCAST:
|
|
|
|
|
|
case SHADER_OPCODE_TEX:
|
|
|
|
|
|
case FS_OPCODE_TXB:
|
|
|
|
|
|
case SHADER_OPCODE_TXD:
|
|
|
|
|
|
case SHADER_OPCODE_TXF:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_LZ:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_UMS:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_MCS:
|
|
|
|
|
|
case SHADER_OPCODE_TXL:
|
|
|
|
|
|
case SHADER_OPCODE_TXL_LZ:
|
|
|
|
|
|
case SHADER_OPCODE_TXS:
|
|
|
|
|
|
case SHADER_OPCODE_LOD:
|
|
|
|
|
|
case SHADER_OPCODE_TG4:
|
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET:
|
|
|
|
|
|
case SHADER_OPCODE_SAMPLEINFO:
|
|
|
|
|
|
return arg == 1 || arg == 2;
|
|
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_SEND:
|
|
|
|
|
|
return arg == 0 || arg == 1;
|
|
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2018-11-09 14:13:37 -08:00
|
|
|
|
bool
|
|
|
|
|
|
fs_inst::is_payload(unsigned arg) const
|
|
|
|
|
|
{
|
|
|
|
|
|
switch (opcode) {
|
|
|
|
|
|
case FS_OPCODE_FB_WRITE:
|
|
|
|
|
|
case FS_OPCODE_FB_READ:
|
|
|
|
|
|
case VEC4_OPCODE_UNTYPED_ATOMIC:
|
|
|
|
|
|
case VEC4_OPCODE_UNTYPED_SURFACE_READ:
|
|
|
|
|
|
case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
|
|
|
|
|
case SHADER_OPCODE_INTERLOCK:
|
|
|
|
|
|
case SHADER_OPCODE_MEMORY_FENCE:
|
|
|
|
|
|
case SHADER_OPCODE_BARRIER:
|
2023-10-09 08:23:53 -07:00
|
|
|
|
case SHADER_OPCODE_TEX:
|
|
|
|
|
|
case FS_OPCODE_TXB:
|
|
|
|
|
|
case SHADER_OPCODE_TXD:
|
|
|
|
|
|
case SHADER_OPCODE_TXF:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_LZ:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_UMS:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_MCS:
|
|
|
|
|
|
case SHADER_OPCODE_TXL:
|
|
|
|
|
|
case SHADER_OPCODE_TXL_LZ:
|
|
|
|
|
|
case SHADER_OPCODE_TXS:
|
|
|
|
|
|
case SHADER_OPCODE_LOD:
|
|
|
|
|
|
case SHADER_OPCODE_TG4:
|
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET:
|
|
|
|
|
|
case SHADER_OPCODE_SAMPLEINFO:
|
2018-11-09 14:13:37 -08:00
|
|
|
|
return arg == 0;
|
|
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_SEND:
|
|
|
|
|
|
return arg == 2 || arg == 3;
|
|
|
|
|
|
|
|
|
|
|
|
default:
|
2023-10-09 08:23:53 -07:00
|
|
|
|
return false;
|
2018-11-09 14:13:37 -08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
i965: Add src/dst interference for certain instructions with hazards.
When working on tessellation shaders, I created some vec4 virtual
opcodes for creating message headers through a sequence like:
mov(8) g7<1>UD 0x00000000UD { align1 WE_all 1Q compacted };
mov(1) g7.5<1>UD 0x00000100UD { align1 WE_all };
mov(1) g7<1>UD g0<0,1,0>UD { align1 WE_all compacted };
mov(1) g7.3<1>UD g8<0,1,0>UD { align1 WE_all };
This is done in the generator since the vec4 backend can't handle align1
regioning. From the visitor's point of view, this is a single opcode:
hs_set_output_urb_offsets vgrf7.0:UD, 1U, vgrf8.xxxx:UD
Normally, there's no hazard between sources and destinations - an
instruction (naturally) reads its sources, then writes the result to the
destination. However, when the virtual instruction generates multiple
hardware instructions, we can get into trouble.
In the above example, if the register allocator assigned vgrf7 and vgrf8
to the same hardware register, then we'd clobber the source with 0 in
the first instruction, and read back the wrong value in the last one.
It occured to me that this is exactly the same problem we have with
SIMD16 instructions that use W/UW or B/UB types with 0 stride. The
hardware implicitly decodes them as two SIMD8 instructions, and with
the overlapping regions, the first would clobber the second.
Previously, we handled that by incrementing the live range end IP by 1,
which works, but is excessive: the next instruction doesn't actually
care about that. It might also be the end of control flow. This might
keep values alive too long. What we really want is to say "my source
and destinations interfere".
This patch creates new infrastructure for doing just that, and teaches
the register allocator to add interference when there's a hazard. For
my vec4 case, we can determine this by switching on opcodes. For the
SIMD16 case, we just move the existing code there.
I audited our existing virtual opcodes that generate multiple
instructions; I believe FS_OPCODE_PACK_HALF_2x16_SPLIT needs this
treatment as well, but no others.
v2: Rebased by mattst88.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2015-11-19 16:00:18 -08:00
|
|
|
|
/**
|
|
|
|
|
|
* Returns true if this instruction's sources and destinations cannot
|
|
|
|
|
|
* safely be the same register.
|
|
|
|
|
|
*
|
|
|
|
|
|
* In most cases, a register can be written over safely by the same
|
|
|
|
|
|
* instruction that is its last use. For a single instruction, the
|
|
|
|
|
|
* sources are dereferenced before writing of the destination starts
|
|
|
|
|
|
* (naturally).
|
|
|
|
|
|
*
|
|
|
|
|
|
* However, there are a few cases where this can be problematic:
|
|
|
|
|
|
*
|
|
|
|
|
|
* - Virtual opcodes that translate to multiple instructions in the
|
|
|
|
|
|
* code generator: if src == dst and one instruction writes the
|
|
|
|
|
|
* destination before a later instruction reads the source, then
|
|
|
|
|
|
* src will have been clobbered.
|
|
|
|
|
|
*
|
|
|
|
|
|
* - SIMD16 compressed instructions with certain regioning (see below).
|
|
|
|
|
|
*
|
|
|
|
|
|
* The register allocator uses this information to set up conflicts between
|
|
|
|
|
|
* GRF sources and the destination.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_inst::has_source_and_destination_hazard() const
|
|
|
|
|
|
{
|
|
|
|
|
|
switch (opcode) {
|
|
|
|
|
|
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
|
|
|
|
|
|
/* Multiple partial writes to the destination */
|
|
|
|
|
|
return true;
|
2017-08-29 09:21:32 -07:00
|
|
|
|
case SHADER_OPCODE_SHUFFLE:
|
|
|
|
|
|
/* This instruction returns an arbitrary channel from the source and
|
|
|
|
|
|
* gets split into smaller instructions in the generator. It's possible
|
|
|
|
|
|
* that one of the instructions will read from a channel corresponding
|
|
|
|
|
|
* to an earlier instruction.
|
|
|
|
|
|
*/
|
2017-08-31 21:45:30 -07:00
|
|
|
|
case SHADER_OPCODE_SEL_EXEC:
|
|
|
|
|
|
/* This is implemented as
|
|
|
|
|
|
*
|
|
|
|
|
|
* mov(16) g4<1>D 0D { align1 WE_all 1H };
|
|
|
|
|
|
* mov(16) g4<1>D g5<8,8,1>D { align1 1H }
|
|
|
|
|
|
*
|
|
|
|
|
|
* Because the source is only read in the second instruction, the first
|
|
|
|
|
|
* may stomp all over it.
|
|
|
|
|
|
*/
|
2017-08-29 09:21:32 -07:00
|
|
|
|
return true;
|
2018-12-06 14:11:34 -08:00
|
|
|
|
case SHADER_OPCODE_QUAD_SWIZZLE:
|
|
|
|
|
|
switch (src[1].ud) {
|
|
|
|
|
|
case BRW_SWIZZLE_XXXX:
|
|
|
|
|
|
case BRW_SWIZZLE_YYYY:
|
|
|
|
|
|
case BRW_SWIZZLE_ZZZZ:
|
|
|
|
|
|
case BRW_SWIZZLE_WWWW:
|
|
|
|
|
|
case BRW_SWIZZLE_XXZZ:
|
|
|
|
|
|
case BRW_SWIZZLE_YYWW:
|
|
|
|
|
|
case BRW_SWIZZLE_XYXY:
|
|
|
|
|
|
case BRW_SWIZZLE_ZWZW:
|
|
|
|
|
|
/* These can be implemented as a single Align1 region on all
|
|
|
|
|
|
* platforms, so there's never a hazard between source and
|
|
|
|
|
|
* destination. C.f. fs_generator::generate_quad_swizzle().
|
|
|
|
|
|
*/
|
|
|
|
|
|
return false;
|
|
|
|
|
|
default:
|
|
|
|
|
|
return !is_uniform(src[0]);
|
|
|
|
|
|
}
|
i965: Add src/dst interference for certain instructions with hazards.
When working on tessellation shaders, I created some vec4 virtual
opcodes for creating message headers through a sequence like:
mov(8) g7<1>UD 0x00000000UD { align1 WE_all 1Q compacted };
mov(1) g7.5<1>UD 0x00000100UD { align1 WE_all };
mov(1) g7<1>UD g0<0,1,0>UD { align1 WE_all compacted };
mov(1) g7.3<1>UD g8<0,1,0>UD { align1 WE_all };
This is done in the generator since the vec4 backend can't handle align1
regioning. From the visitor's point of view, this is a single opcode:
hs_set_output_urb_offsets vgrf7.0:UD, 1U, vgrf8.xxxx:UD
Normally, there's no hazard between sources and destinations - an
instruction (naturally) reads its sources, then writes the result to the
destination. However, when the virtual instruction generates multiple
hardware instructions, we can get into trouble.
In the above example, if the register allocator assigned vgrf7 and vgrf8
to the same hardware register, then we'd clobber the source with 0 in
the first instruction, and read back the wrong value in the last one.
It occured to me that this is exactly the same problem we have with
SIMD16 instructions that use W/UW or B/UB types with 0 stride. The
hardware implicitly decodes them as two SIMD8 instructions, and with
the overlapping regions, the first would clobber the second.
Previously, we handled that by incrementing the live range end IP by 1,
which works, but is excessive: the next instruction doesn't actually
care about that. It might also be the end of control flow. This might
keep values alive too long. What we really want is to say "my source
and destinations interfere".
This patch creates new infrastructure for doing just that, and teaches
the register allocator to add interference when there's a hazard. For
my vec4 case, we can determine this by switching on opcodes. For the
SIMD16 case, we just move the existing code there.
I audited our existing virtual opcodes that generate multiple
instructions; I believe FS_OPCODE_PACK_HALF_2x16_SPLIT needs this
treatment as well, but no others.
v2: Rebased by mattst88.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2015-11-19 16:00:18 -08:00
|
|
|
|
default:
|
|
|
|
|
|
/* The SIMD16 compressed instruction
|
|
|
|
|
|
*
|
|
|
|
|
|
* add(16) g4<1>F g4<8,8,1>F g6<8,8,1>F
|
|
|
|
|
|
*
|
|
|
|
|
|
* is actually decoded in hardware as:
|
|
|
|
|
|
*
|
|
|
|
|
|
* add(8) g4<1>F g4<8,8,1>F g6<8,8,1>F
|
|
|
|
|
|
* add(8) g5<1>F g5<8,8,1>F g7<8,8,1>F
|
|
|
|
|
|
*
|
|
|
|
|
|
* Which is safe. However, if we have uniform accesses
|
|
|
|
|
|
* happening, we get into trouble:
|
|
|
|
|
|
*
|
|
|
|
|
|
* add(8) g4<1>F g4<0,1,0>F g6<8,8,1>F
|
|
|
|
|
|
* add(8) g5<1>F g4<0,1,0>F g7<8,8,1>F
|
|
|
|
|
|
*
|
|
|
|
|
|
* Now our destination for the first instruction overwrote the
|
|
|
|
|
|
* second instruction's src0, and we get garbage for those 8
|
2021-03-29 15:40:04 -07:00
|
|
|
|
* pixels. There's a similar issue for the pre-gfx6
|
i965: Add src/dst interference for certain instructions with hazards.
When working on tessellation shaders, I created some vec4 virtual
opcodes for creating message headers through a sequence like:
mov(8) g7<1>UD 0x00000000UD { align1 WE_all 1Q compacted };
mov(1) g7.5<1>UD 0x00000100UD { align1 WE_all };
mov(1) g7<1>UD g0<0,1,0>UD { align1 WE_all compacted };
mov(1) g7.3<1>UD g8<0,1,0>UD { align1 WE_all };
This is done in the generator since the vec4 backend can't handle align1
regioning. From the visitor's point of view, this is a single opcode:
hs_set_output_urb_offsets vgrf7.0:UD, 1U, vgrf8.xxxx:UD
Normally, there's no hazard between sources and destinations - an
instruction (naturally) reads its sources, then writes the result to the
destination. However, when the virtual instruction generates multiple
hardware instructions, we can get into trouble.
In the above example, if the register allocator assigned vgrf7 and vgrf8
to the same hardware register, then we'd clobber the source with 0 in
the first instruction, and read back the wrong value in the last one.
It occured to me that this is exactly the same problem we have with
SIMD16 instructions that use W/UW or B/UB types with 0 stride. The
hardware implicitly decodes them as two SIMD8 instructions, and with
the overlapping regions, the first would clobber the second.
Previously, we handled that by incrementing the live range end IP by 1,
which works, but is excessive: the next instruction doesn't actually
care about that. It might also be the end of control flow. This might
keep values alive too long. What we really want is to say "my source
and destinations interfere".
This patch creates new infrastructure for doing just that, and teaches
the register allocator to add interference when there's a hazard. For
my vec4 case, we can determine this by switching on opcodes. For the
SIMD16 case, we just move the existing code there.
I audited our existing virtual opcodes that generate multiple
instructions; I believe FS_OPCODE_PACK_HALF_2x16_SPLIT needs this
treatment as well, but no others.
v2: Rebased by mattst88.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2015-11-19 16:00:18 -08:00
|
|
|
|
* pixel_x/pixel_y, which are registers of 16-bit values and thus
|
|
|
|
|
|
* would get stomped by the first decode as well.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (exec_size == 16) {
|
|
|
|
|
|
for (int i = 0; i < sources; i++) {
|
|
|
|
|
|
if (src[i].file == VGRF && (src[i].stride == 0 ||
|
|
|
|
|
|
src[i].type == BRW_REGISTER_TYPE_UW ||
|
|
|
|
|
|
src[i].type == BRW_REGISTER_TYPE_W ||
|
|
|
|
|
|
src[i].type == BRW_REGISTER_TYPE_UB ||
|
|
|
|
|
|
src[i].type == BRW_REGISTER_TYPE_B)) {
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2012-11-09 11:48:20 -08:00
|
|
|
|
bool
|
2021-04-05 13:19:39 -07:00
|
|
|
|
fs_inst::can_do_source_mods(const struct intel_device_info *devinfo) const
|
2012-11-09 11:48:20 -08:00
|
|
|
|
{
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver == 6 && is_math())
|
2012-11-09 11:48:20 -08:00
|
|
|
|
return false;
|
|
|
|
|
|
|
2014-06-23 21:57:31 -07:00
|
|
|
|
if (is_send_from_grf())
|
2012-11-09 11:48:20 -08:00
|
|
|
|
return false;
|
|
|
|
|
|
|
2021-03-29 17:15:41 -07:00
|
|
|
|
/* From Wa_1604601757:
|
2018-12-07 14:13:53 -08:00
|
|
|
|
*
|
|
|
|
|
|
* "When multiplying a DW and any lower precision integer, source modifier
|
|
|
|
|
|
* is not supported."
|
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver >= 12 && (opcode == BRW_OPCODE_MUL ||
|
2018-12-07 14:13:53 -08:00
|
|
|
|
opcode == BRW_OPCODE_MAD)) {
|
|
|
|
|
|
const brw_reg_type exec_type = get_exec_type(this);
|
|
|
|
|
|
const unsigned min_type_sz = opcode == BRW_OPCODE_MAD ?
|
|
|
|
|
|
MIN2(type_sz(src[1].type), type_sz(src[2].type)) :
|
|
|
|
|
|
MIN2(type_sz(src[0].type), type_sz(src[1].type));
|
|
|
|
|
|
|
|
|
|
|
|
if (brw_reg_type_is_integer(exec_type) &&
|
|
|
|
|
|
type_sz(exec_type) >= 4 &&
|
|
|
|
|
|
type_sz(exec_type) != min_type_sz)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-06-23 21:57:31 -07:00
|
|
|
|
if (!backend_instruction::can_do_source_mods())
|
2013-09-19 19:48:22 -07:00
|
|
|
|
return false;
|
|
|
|
|
|
|
2012-11-09 11:48:20 -08:00
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2018-10-08 12:22:35 -05:00
|
|
|
|
bool
|
|
|
|
|
|
fs_inst::can_do_cmod()
|
|
|
|
|
|
{
|
|
|
|
|
|
if (!backend_instruction::can_do_cmod())
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
/* The accumulator result appears to get used for the conditional modifier
|
|
|
|
|
|
* generation. When negating a UD value, there is a 33rd bit generated for
|
|
|
|
|
|
* the sign in the accumulator value, so now you can't check, for example,
|
|
|
|
|
|
* equality with a 32-bit value. See piglit fs-op-neg-uvec4.
|
|
|
|
|
|
*/
|
|
|
|
|
|
for (unsigned i = 0; i < sources; i++) {
|
2021-08-18 14:04:45 -07:00
|
|
|
|
if (brw_reg_type_is_unsigned_integer(src[i].type) && src[i].negate)
|
2018-10-08 12:22:35 -05:00
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-10-14 02:12:09 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_inst::can_change_types() const
|
|
|
|
|
|
{
|
|
|
|
|
|
return dst.type == src[0].type &&
|
|
|
|
|
|
!src[0].abs && !src[0].negate && !saturate &&
|
|
|
|
|
|
(opcode == BRW_OPCODE_MOV ||
|
|
|
|
|
|
(opcode == BRW_OPCODE_SEL &&
|
|
|
|
|
|
dst.type == src[1].type &&
|
|
|
|
|
|
predicate != BRW_PREDICATE_NONE &&
|
|
|
|
|
|
!src[1].abs && !src[1].negate));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2012-07-04 13:12:50 -07:00
|
|
|
|
void
|
|
|
|
|
|
fs_reg::init()
|
|
|
|
|
|
{
|
2018-07-16 13:19:30 -07:00
|
|
|
|
memset((void*)this, 0, sizeof(*this));
|
2017-07-25 13:16:25 -07:00
|
|
|
|
type = BRW_REGISTER_TYPE_UD;
|
2013-12-08 04:57:35 +01:00
|
|
|
|
stride = 1;
|
2012-07-04 13:12:50 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/** Generic unset register constructor. */
|
|
|
|
|
|
fs_reg::fs_reg()
|
|
|
|
|
|
{
|
|
|
|
|
|
init();
|
|
|
|
|
|
this->file = BAD_FILE;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-11-23 16:17:28 -08:00
|
|
|
|
/** Construct an fs_reg from a fixed hardware register description. */
fs_reg::fs_reg(struct ::brw_reg reg) :
   backend_reg(reg)
{
   offset = 0;
   stride = 1;
   /* Scalar immediates have no stride; packed-vector immediates (V/UV/VF)
    * keep the unit stride.
    */
   if (file == IMM) {
      switch (type) {
      case BRW_REGISTER_TYPE_V:
      case BRW_REGISTER_TYPE_UV:
      case BRW_REGISTER_TYPE_VF:
         break;
      default:
         stride = 0;
         break;
      }
   }
}
|
|
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_reg::equals(const fs_reg &r) const
|
|
|
|
|
|
{
|
2015-11-22 13:25:05 -08:00
|
|
|
|
return (this->backend_reg::equals(r) &&
|
2014-06-29 15:13:24 -07:00
|
|
|
|
stride == r.stride);
|
2012-07-04 13:12:50 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2015-04-07 16:11:37 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_reg::negative_equals(const fs_reg &r) const
|
|
|
|
|
|
{
|
|
|
|
|
|
return (this->backend_reg::negative_equals(r) &&
|
|
|
|
|
|
stride == r.stride);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2013-12-08 04:57:35 +01:00
|
|
|
|
bool
|
|
|
|
|
|
fs_reg::is_contiguous() const
|
|
|
|
|
|
{
|
2020-01-02 15:32:56 -08:00
|
|
|
|
switch (file) {
|
|
|
|
|
|
case ARF:
|
|
|
|
|
|
case FIXED_GRF:
|
|
|
|
|
|
return hstride == BRW_HORIZONTAL_STRIDE_1 &&
|
|
|
|
|
|
vstride == width + hstride;
|
|
|
|
|
|
case MRF:
|
|
|
|
|
|
case VGRF:
|
|
|
|
|
|
case ATTR:
|
|
|
|
|
|
return stride == 1;
|
|
|
|
|
|
case UNIFORM:
|
|
|
|
|
|
case IMM:
|
|
|
|
|
|
case BAD_FILE:
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
unreachable("Invalid register file");
|
2013-12-08 04:57:35 +01:00
|
|
|
|
}
|
|
|
|
|
|
|
2015-07-14 15:43:44 +03:00
|
|
|
|
unsigned
|
|
|
|
|
|
fs_reg::component_size(unsigned width) const
|
|
|
|
|
|
{
|
2015-10-26 17:52:57 -07:00
|
|
|
|
const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
|
2015-10-24 15:29:03 -07:00
|
|
|
|
hstride == 0 ? 0 :
|
|
|
|
|
|
1 << (hstride - 1));
|
2015-07-14 15:43:44 +03:00
|
|
|
|
return MAX2(width * stride, 1) * type_sz(type);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2011-03-13 13:43:05 -07:00
|
|
|
|
void
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
|
fs_visitor::vfail(const char *format, va_list va)
|
2011-03-13 13:43:05 -07:00
|
|
|
|
{
|
2011-05-16 15:10:26 -07:00
|
|
|
|
char *msg;
|
2011-03-13 13:43:05 -07:00
|
|
|
|
|
2011-05-16 15:10:26 -07:00
|
|
|
|
if (failed)
|
|
|
|
|
|
return;
|
2011-03-13 13:43:05 -07:00
|
|
|
|
|
2011-05-16 15:10:26 -07:00
|
|
|
|
failed = true;
|
|
|
|
|
|
|
|
|
|
|
|
msg = ralloc_vasprintf(mem_ctx, format, va);
|
2020-07-02 13:37:10 +02:00
|
|
|
|
msg = ralloc_asprintf(mem_ctx, "SIMD%d %s compile failed: %s\n",
|
2023-09-24 21:38:47 -07:00
|
|
|
|
dispatch_width, _mesa_shader_stage_to_abbrev(stage), msg);
|
2011-05-16 15:10:26 -07:00
|
|
|
|
|
|
|
|
|
|
this->fail_msg = msg;
|
|
|
|
|
|
|
2021-03-23 11:31:51 -07:00
|
|
|
|
if (unlikely(debug_enabled)) {
|
2011-06-10 15:26:02 -03:00
|
|
|
|
fprintf(stderr, "%s", msg);
|
2011-03-13 13:43:05 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
|
/**
 * Mark the compilation as failed, with a printf-style reason.
 *
 * Thin variadic wrapper around vfail(), which performs the actual
 * bookkeeping and message formatting.
 */
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
2016-05-18 14:39:52 -07:00
|
|
|
|
* Mark this program as impossible to compile with dispatch width greater
|
|
|
|
|
|
* than n.
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
|
*
|
|
|
|
|
|
* During the SIMD8 compile (which happens first), we can detect and flag
|
2016-05-18 14:39:52 -07:00
|
|
|
|
* things that are unsupported in SIMD16+ mode, so the compiler can skip the
|
|
|
|
|
|
* SIMD16+ compile altogether.
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
|
*
|
2016-05-18 14:39:52 -07:00
|
|
|
|
* During a compile of dispatch width greater than n (if one happens anyway),
|
|
|
|
|
|
* this just calls fail().
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
|
*/
|
|
|
|
|
|
void
|
2016-05-18 14:39:52 -07:00
|
|
|
|
fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
|
{
|
2016-05-18 14:39:52 -07:00
|
|
|
|
if (dispatch_width > n) {
|
2015-06-22 16:30:04 -07:00
|
|
|
|
fail("%s", msg);
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
|
} else {
|
2020-10-30 17:41:02 +02:00
|
|
|
|
max_dispatch_width = MIN2(max_dispatch_width, n);
|
2021-07-29 14:27:57 -07:00
|
|
|
|
brw_shader_perf_log(compiler, log_data,
|
2021-10-03 15:58:36 +03:00
|
|
|
|
"Shader dispatch width limited to SIMD%d: %s\n",
|
2021-07-29 14:27:57 -07:00
|
|
|
|
n, msg);
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2012-06-04 08:59:00 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Returns true if the instruction has a flag that means it won't
|
|
|
|
|
|
* update an entire destination register.
|
|
|
|
|
|
*
|
|
|
|
|
|
* For example, dead code elimination and live variable analysis want to know
|
|
|
|
|
|
* when a write to a variable screens off any preceding values that were in
|
|
|
|
|
|
* it.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
2019-04-24 12:38:28 +02:00
|
|
|
|
fs_inst::is_partial_write() const
|
2012-06-04 08:59:00 -07:00
|
|
|
|
{
|
2023-03-14 18:22:50 +02:00
|
|
|
|
if (this->predicate && !this->predicate_trivial &&
|
|
|
|
|
|
this->opcode != BRW_OPCODE_SEL)
|
2023-03-10 16:11:56 +02:00
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
if (this->dst.offset % REG_SIZE != 0)
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
/* SEND instructions always write whole registers */
|
|
|
|
|
|
if (this->opcode == SHADER_OPCODE_SEND)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
2023-07-23 18:20:23 +03:00
|
|
|
|
/* Special case UNDEF since a lot of places in the backend do things like this :
|
|
|
|
|
|
*
|
|
|
|
|
|
* fs_builder ubld = bld.exec_all().group(1, 0);
|
|
|
|
|
|
* fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
* ubld.UNDEF(tmp); <- partial write, even if the whole register is concerned
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (this->opcode == SHADER_OPCODE_UNDEF) {
|
|
|
|
|
|
assert(this->dst.is_contiguous());
|
|
|
|
|
|
return this->size_written < 32;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-03-10 16:11:56 +02:00
|
|
|
|
return this->exec_size * type_sz(this->dst.type) < 32 ||
|
|
|
|
|
|
!this->dst.is_contiguous();
|
2012-06-04 08:59:00 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2015-07-21 17:28:39 +03:00
|
|
|
|
/**
 * Return the number of logical components read from source \p i, or zero if
 * that source is not present (BAD_FILE).
 *
 * Most opcodes read one component per source.  The logical send-like
 * opcodes carry variable component counts in dedicated IMM sources
 * (e.g. TEX_LOGICAL_SRC_COORD_COMPONENTS, SURFACE_LOGICAL_SRC_IMM_DIMS),
 * which this switch decodes.
 */
unsigned
fs_inst::components_read(unsigned i) const
{
   /* Return zero if the source is not present. */
   if (src[i].file == BAD_FILE)
      return 0;

   switch (opcode) {
   case FS_OPCODE_LINTERP:
      if (i == 0)
         return 2;
      else
         return 1;

   case FS_OPCODE_PIXEL_X:
   case FS_OPCODE_PIXEL_Y:
      assert(i < 2);
      if (i == 0)
         return 2;
      else
         return 1;

   case FS_OPCODE_FB_WRITE_LOGICAL:
      assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
      /* First/second FB write color. */
      if (i < 2)
         return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
      else
         return 1;

   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXD_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
   case SHADER_OPCODE_TXF_UMS_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
      assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
             src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM &&
             src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
      /* Texture coordinates. */
      if (i == TEX_LOGICAL_SRC_COORDINATE)
         return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
      /* Texture derivatives. */
      else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
               opcode == SHADER_OPCODE_TXD_LOGICAL)
         return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
      /* Texture offset. */
      else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
         return 2;
      /* MCS */
      else if (i == TEX_LOGICAL_SRC_MCS) {
         /* CMS_W variants carry wider multisample control data. */
         if (opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
            return 2;
         else if (opcode == SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL)
            return 4;
         else
            return 1;
      } else
         return 1;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source (ignored for reads). */
      else if (i == SURFACE_LOGICAL_SRC_DATA)
         return 0;
      else
         return 1;

   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source. */
      else if (i == SURFACE_LOGICAL_SRC_DATA)
         return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
      else
         return 1;

   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
   case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
   case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      assert(src[A64_LOGICAL_ARG].file == IMM);
      return 1;

   case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
      assert(src[A64_LOGICAL_ARG].file == IMM);
      if (i == A64_LOGICAL_SRC) { /* data to write */
         /* Per-channel component count = total dwords / SIMD width. */
         const unsigned comps = src[A64_LOGICAL_ARG].ud / exec_size;
         assert(comps > 0);
         return comps;
      } else {
         return 1;
      }

   case SHADER_OPCODE_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      return 1;

   case SHADER_OPCODE_OWORD_BLOCK_WRITE_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      if (i == SURFACE_LOGICAL_SRC_DATA) {
         /* Per-channel component count = total dwords / SIMD width. */
         const unsigned comps = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud / exec_size;
         assert(comps > 0);
         return comps;
      } else {
         return 1;
      }

   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      assert(src[A64_LOGICAL_ARG].file == IMM);
      return i == A64_LOGICAL_SRC ? src[A64_LOGICAL_ARG].ud : 1;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      assert(src[A64_LOGICAL_ARG].file == IMM);
      /* Atomic data operand count depends on the LSC op encoded in the IMM
       * argument (e.g. cmpxchg needs two values).
       */
      return i == A64_LOGICAL_SRC ?
             lsc_op_num_data_values(src[A64_LOGICAL_ARG].ud) : 1;

   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
   case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
      /* Scattered logical opcodes use the following params:
       * src[0] Surface coordinates
       * src[1] Surface operation source (ignored for reads)
       * src[2] Surface
       * src[3] IMM with always 1 dimension.
       * src[4] IMM with arg bitsize for scattered read/write 8, 16, 32
       */
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;

   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      return 1;

   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source. */
      else if (i == SURFACE_LOGICAL_SRC_DATA)
         return lsc_op_num_data_values(op);
      else
         return 1;
   }
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return (i == 0 ? 2 : 1);

   case SHADER_OPCODE_URB_WRITE_LOGICAL:
      assert(src[URB_LOGICAL_SRC_COMPONENTS].file == IMM);

      if (i == URB_LOGICAL_SRC_DATA)
         return src[URB_LOGICAL_SRC_COMPONENTS].ud;
      else
         return 1;

   default:
      return 1;
   }
}
|
|
|
|
|
|
|
2016-09-07 17:00:58 -07:00
|
|
|
|
/**
 * Return the number of bytes read from source \p arg of this instruction.
 *
 * The first switch special-cases opcodes whose payload sources are sized
 * in whole message registers (mlen / ex_mlen) or have some other fixed
 * size; anything not handled there falls through to a generic computation
 * from the source's register file, components_read() and component size.
 */
unsigned
fs_inst::size_read(int arg) const
{
   switch (opcode) {
   case SHADER_OPCODE_SEND:
      /* arg 2 is the primary message payload and arg 3 the extended
       * payload; both are sized in whole registers by mlen / ex_mlen.
       */
      if (arg == 2) {
         return mlen * REG_SIZE;
      } else if (arg == 3) {
         return ex_mlen * REG_SIZE;
      }
      break;

   case FS_OPCODE_FB_WRITE:
   case FS_OPCODE_REP_FB_WRITE:
      if (arg == 0) {
         /* With a fixed MRF base the source is either absent or exactly
          * two registers; otherwise the whole payload lives in arg 0.
          */
         if (base_mrf >= 0)
            return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
         else
            return mlen * REG_SIZE;
      }
      break;

   case FS_OPCODE_FB_READ:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      /* arg 0 holds the message payload. */
      if (arg == 0)
         return mlen * REG_SIZE;
      break;

   case FS_OPCODE_SET_SAMPLE_ID:
      /* arg 1 is read as a single byte. */
      if (arg == 1)
         return 1;
      break;

   case FS_OPCODE_LINTERP:
      /* arg 1 is the barycentric setup data: 16 bytes. */
      if (arg == 1)
         return 16;
      break;

   case SHADER_OPCODE_LOAD_PAYLOAD:
      /* Header sources are read as a fixed-width (8-wide) UD region
       * regardless of the instruction's execution size.
       */
      if (arg < this->header_size)
         return retype(src[arg], BRW_REGISTER_TYPE_UD).component_size(8);
      break;

   case CS_OPCODE_CS_TERMINATE:
   case SHADER_OPCODE_BARRIER:
      /* These consume exactly one register of payload. */
      return REG_SIZE;

   case SHADER_OPCODE_MOV_INDIRECT:
      if (arg == 0) {
         /* src[2] is an immediate giving the number of bytes that may be
          * addressed through the indirect source.
          */
         assert(src[2].file == IMM);
         return src[2].ud;
      }
      break;

   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_LZ:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_UMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXL_LZ:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_SAMPLEINFO:
      /* When the texturing payload has been assembled into a VGRF in
       * arg 0, it spans the whole message length.
       */
      if (arg == 0 && src[0].file == VGRF)
         return mlen * REG_SIZE;
      break;

   default:
      break;
   }

   /* Generic path: bytes read = components * per-component size, which
    * depends on the register file of the source.
    */
   switch (src[arg].file) {
   case UNIFORM:
   case IMM:
      /* Uniforms and immediates are scalar per component. */
      return components_read(arg) * type_sz(src[arg].type);
   case BAD_FILE:
   case ARF:
   case FIXED_GRF:
   case VGRF:
   case ATTR:
      /* Per-channel sources scale with the execution size. */
      return components_read(arg) * src[arg].component_size(exec_size);
   case MRF:
      unreachable("MRF registers are not allowed as sources");
   }
   return 0;
}
|
|
|
|
|
|
|
2016-05-18 21:54:35 -07:00
|
|
|
|
namespace {
   /* Number of flag bits covered by a single predication test of the
    * given predicate mode (e.g. any4h/all4h test groups of 4 bits).
    */
   unsigned
   predicate_width(brw_predicate predicate)
   {
      switch (predicate) {
      case BRW_PREDICATE_NONE: return 1;
      case BRW_PREDICATE_NORMAL: return 1;
      case BRW_PREDICATE_ALIGN1_ANY2H: return 2;
      case BRW_PREDICATE_ALIGN1_ALL2H: return 2;
      case BRW_PREDICATE_ALIGN1_ANY4H: return 4;
      case BRW_PREDICATE_ALIGN1_ALL4H: return 4;
      case BRW_PREDICATE_ALIGN1_ANY8H: return 8;
      case BRW_PREDICATE_ALIGN1_ALL8H: return 8;
      case BRW_PREDICATE_ALIGN1_ANY16H: return 16;
      case BRW_PREDICATE_ALIGN1_ALL16H: return 16;
      case BRW_PREDICATE_ALIGN1_ANY32H: return 32;
      case BRW_PREDICATE_ALIGN1_ALL32H: return 32;
      default: unreachable("Unsupported predicate");
      }
   }

   /* Return the subset of flag registers that an instruction could
    * potentially read or write based on the execution controls and flag
    * subregister number of the instruction.
    */
   unsigned
   flag_mask(const fs_inst *inst, unsigned width)
   {
      assert(util_is_power_of_two_nonzero(width));
      /* Start bit within the 64-bit flag space, rounded down to the
       * predication group width; each flag subregister covers 16 bits.
       */
      const unsigned start = (inst->flag_subreg * 16 + inst->group) &
                             ~(width - 1);
      const unsigned end = start + ALIGN(inst->exec_size, width);
      /* Convert the [start, end) bit range into a mask of 8-bit bytes. */
      return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
   }

   /* Mask with the low \p n bits set; saturates to all-ones when \p n is
    * as wide as (or wider than) the unsigned type, avoiding UB from an
    * over-wide shift.
    */
   unsigned
   bit_mask(unsigned n)
   {
      return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
   }

   /* Return the subset of flag registers overlapped by an \p sz -byte
    * access through register \p r.  Non-ARF registers can never alias
    * the flags, so they contribute nothing.
    */
   unsigned
   flag_mask(const fs_reg &r, unsigned sz)
   {
      if (r.file == ARF) {
         /* Each flag register number spans 4 bytes of the mask space. */
         const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
         const unsigned end = start + sz;
         return bit_mask(end) & ~bit_mask(start);
      } else {
         return 0;
      }
   }
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Return a bitmask of the flag register bytes this instruction may read,
 * either through predication or through explicit ARF flag sources.
 */
unsigned
fs_inst::flags_read(const intel_device_info *devinfo) const
{
   if (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
       predicate == BRW_PREDICATE_ALIGN1_ALLV) {
      /* The vertical predication modes combine corresponding bits from
       * f0.0 and f1.0 on Gfx7+, and f0.0 and f0.1 on older hardware.
       */
      const unsigned shift = devinfo->ver >= 7 ? 4 : 2;
      /* Both halves of the pair are read, so OR the shifted mask in. */
      return flag_mask(this, 1) << shift | flag_mask(this, 1);
   } else if (predicate) {
      /* Normal predication reads one flag group per predicate width. */
      return flag_mask(this, predicate_width(predicate));
   } else {
      /* Unpredicated: only explicit flag-register sources count. */
      unsigned mask = 0;
      for (int i = 0; i < sources; i++) {
         mask |= flag_mask(src[i], size_read(i));
      }
      return mask;
   }
}
|
|
|
|
|
|
|
2016-05-18 21:54:35 -07:00
|
|
|
|
/**
 * Return a bitmask of the flag register bytes this instruction may write,
 * via conditional modifiers, implicit flag updates of whole-quad opcodes,
 * or an explicit ARF flag destination.
 */
unsigned
fs_inst::flags_written(const intel_device_info *devinfo) const
{
   /* On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
    * using a separate cmpn and sel instruction.  This lowering occurs in
    * fs_vistor::lower_minmax which is called very, very late.
    */
   if ((conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) &&
                            opcode != BRW_OPCODE_CSEL &&
                            opcode != BRW_OPCODE_IF &&
                            opcode != BRW_OPCODE_WHILE)) ||
       opcode == FS_OPCODE_FB_WRITE) {
      /* A conditional modifier (or FB write) updates the flag group
       * covered by this instruction's channels.
       */
      return flag_mask(this, 1);
   } else if (opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL ||
              opcode == SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL ||
              opcode == FS_OPCODE_LOAD_LIVE_CHANNELS) {
      /* These clobber a full 32-bit flag register. */
      return flag_mask(this, 32);
   } else {
      /* Otherwise only an explicit flag-register destination counts. */
      return flag_mask(dst, size_written);
   }
}
|
|
|
|
|
|
|
2010-11-19 15:57:05 +08:00
|
|
|
|
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
unsigned
fs_inst::implied_mrf_writes() const
{
   /* No message, no MRF traffic. */
   if (mlen == 0)
      return 0;

   /* A base_mrf of -1 means the payload lives in GRFs (send-from-GRF),
    * so no MRFs are implicitly written.
    */
   if (base_mrf == -1)
      return 0;

   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* Unary math: one MRF per 8 channels. */
      return 1 * exec_size / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Binary math: two MRFs per 8 channels. */
      return 2 * exec_size / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
   case SHADER_OPCODE_SAMPLEINFO:
      return 1;
   case FS_OPCODE_FB_WRITE:
   case FS_OPCODE_REP_FB_WRITE:
      /* Color payload present: header + data take two MRFs. */
      return src[0].file == BAD_FILE ? 0 : 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GFX4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GFX4:
      return mlen;
   case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
      return mlen;
   default:
      unreachable("not reached");
   }
}
|
|
|
|
|
|
|
2023-05-23 13:11:02 +03:00
|
|
|
|
bool
|
|
|
|
|
|
fs_inst::has_sampler_residency() const
|
|
|
|
|
|
{
|
|
|
|
|
|
switch (opcode) {
|
|
|
|
|
|
case SHADER_OPCODE_TEX_LOGICAL:
|
|
|
|
|
|
case FS_OPCODE_TXB_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXL_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXD_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXS_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TG4_LOGICAL:
|
|
|
|
|
|
assert(src[TEX_LOGICAL_SRC_RESIDENCY].file == IMM);
|
|
|
|
|
|
return src[TEX_LOGICAL_SRC_RESIDENCY].ud != 0;
|
|
|
|
|
|
default:
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-05-16 02:21:51 -07:00
|
|
|
|
/**
 * Allocate a fresh virtual GRF big enough to hold one value of the given
 * GLSL type per channel of the current dispatch width, typed to match.
 */
fs_reg
fs_visitor::vgrf(const glsl_type *const type)
{
   const int reg_width = dispatch_width / 8;
   const unsigned vgrf_nr =
      alloc.allocate(glsl_count_dword_slots(type, false) * reg_width);
   return fs_reg(VGRF, vgrf_nr, brw_type_for_base_type(type));
}
|
|
|
|
|
|
|
2022-10-31 18:41:35 +01:00
|
|
|
|
/** Construct a float-typed register reference to index \p nr of \p file. */
fs_reg::fs_reg(enum brw_reg_file file, unsigned nr)
{
   init();
   this->file = file;
   this->nr = nr;
   type = BRW_REGISTER_TYPE_F;
   /* UNIFORM registers broadcast a single value to every channel, so they
    * read with a zero stride; all other files use the packed stride of one.
    */
   stride = (file != UNIFORM);
}
|
|
|
|
|
|
|
2022-10-31 18:41:35 +01:00
|
|
|
|
/** Construct a register reference to index \p nr of \p file with an
 *  explicit register \p type.
 */
fs_reg::fs_reg(enum brw_reg_file file, unsigned nr, enum brw_reg_type type)
{
   init();
   this->file = file;
   this->nr = nr;
   this->type = type;
   /* UNIFORM registers broadcast a single value to every channel, so they
    * read with a zero stride; all other files use the packed stride of one.
    */
   this->stride = (file != UNIFORM);
}
|
|
|
|
|
|
|
2013-11-12 15:33:27 -08:00
|
|
|
|
/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
|
2011-03-23 12:50:53 -07:00
|
|
|
|
* This brings in those uniform definitions
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
2011-07-25 18:13:04 -07:00
|
|
|
|
fs_visitor::import_uniforms(fs_visitor *v)
|
2011-03-23 12:50:53 -07:00
|
|
|
|
{
|
2014-03-11 14:35:27 -07:00
|
|
|
|
this->push_constant_loc = v->push_constant_loc;
|
|
|
|
|
|
this->uniforms = v->uniforms;
|
2011-03-23 12:50:53 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
|
/**
 * Translate a NIR barycentric-load intrinsic into the corresponding
 * brw_barycentric_mode value, taking the interpolation qualifier
 * (perspective vs. noperspective) into account.
 */
enum brw_barycentric_mode
brw_barycentric_mode(nir_intrinsic_instr *intr)
{
   const glsl_interp_mode mode =
      (enum glsl_interp_mode) nir_intrinsic_interp_mode(intr);

   /* Barycentric modes don't make sense for flat inputs. */
   assert(mode != INTERP_MODE_FLAT);

   /* First pick the PERSPECTIVE variant matching the intrinsic's sample
    * position (pixel center, centroid, or per-sample).  at_offset resolves
    * relative to the pixel, at_sample relative to a sample position.
    */
   unsigned bary;
   switch (intr->intrinsic) {
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_at_offset:
      bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
      break;
   case nir_intrinsic_load_barycentric_centroid:
      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
      break;
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_at_sample:
      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
      break;
   default:
      unreachable("invalid intrinsic");
   }

   /* Convert to the NONPERSPECTIVE variant by a fixed offset of 3 —
    * presumably the enum lays the three NONPERSPECTIVE modes immediately
    * after the three PERSPECTIVE ones; confirm against the
    * brw_barycentric_mode enum declaration.
    */
   if (mode == INTERP_MODE_NOPERSPECTIVE)
      bary += 3;

   return (enum brw_barycentric_mode) bary;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Turn one of the two CENTROID barycentric modes into PIXEL mode.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static enum brw_barycentric_mode
|
|
|
|
|
|
centroid_to_pixel(enum brw_barycentric_mode bary)
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
|
|
|
|
|
|
bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
|
|
|
|
|
|
return (enum brw_barycentric_mode) ((unsigned) bary - 1);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
intel/compiler: Use an existing URB write to end TCS threads when viable
VS, TCS, TES, and GS threads must end with a URB write message with the
EOT (end of thread) bit set. For VS and TES, we shadow output variables
with temporaries and perform all stores at the end of the shader, giving
us an existing message to do the EOT.
In tessellation control shaders, we don't defer output stores until the
end of the thread like we do for vertex or evaluation shaders. We just
process store_output and store_per_vertex_output intrinsics where they
occur, which may be in control flow. So we can't guarantee that there's
a URB write being at the end of the shader.
Traditionally, we've just emitted a separate URB write to finish TCS
threads, doing a writemasked write to an single patch header DWord.
On Broadwell, we need to set a "TR DS Cache Disable" bit, so this is
a convenient spot to do so. But on other platforms, there's no such
field, and this write is purely wasteful.
Insetad of emitting a separate write, we can just look for an existing
URB write at the end of the program and tag that with EOT, if possible.
We already had code to do this for geometry shaders, so just lift it
into a helper function and reuse it.
No changes in shader-db.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17944>
2022-08-03 20:54:52 -07:00
|
|
|
|
/**
 * Walk backwards from the end of the program looking for a URB write that
 * isn't in control flow, and mark it with EOT.
 *
 * Return true if successful or false if a separate EOT write is needed.
 */
bool
fs_visitor::mark_last_urb_write_with_eot()
{
   foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
      if (prev->opcode == SHADER_OPCODE_URB_WRITE_LOGICAL) {
         prev->eot = true;

         /* Delete now dead instructions.  Anything after an EOT message can
          * never execute, so everything between this write and the end of
          * the list is removed.
          */
         foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
            if (dead == prev)
               break;
            dead->remove();
         }
         return true;
      } else if (prev->is_control_flow() || prev->has_side_effects()) {
         /* Stop searching: past this point we can no longer guarantee the
          * URB write executes unconditionally as the final visible effect,
          * so the caller must emit a separate EOT write instead.
          */
         break;
      }
   }

   return false;
}
|
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
|
/**
 * Emit the code that terminates a geometry shader thread.
 *
 * GS threads must end with a URB write with EOT (end of thread) set.  If
 * the vertex count isn't known statically, the final count is also written
 * out here so the fixed function hardware knows how many vertices were
 * actually emitted.
 */
void
fs_visitor::emit_gs_thread_end()
{
   assert(stage == MESA_SHADER_GEOMETRY);

   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);

   /* Flush any pending control data bits accumulated for the emitted
    * vertices before ending the thread.
    */
   if (gs_compile->control_data_header_size_bits > 0) {
      emit_gs_control_data_bits(this->final_gs_vertex_count);
   }

   const fs_builder abld = fs_builder(this, dispatch_width).at_end().annotate("thread end");
   fs_inst *inst;

   if (gs_prog_data->static_vertex_count != -1) {
      /* Try and tag the last URB write with EOT instead of emitting a whole
       * separate write just to finish the thread.
       */
      if (mark_last_urb_write_with_eot())
         return;

      /* No suitable existing write: emit a zero-component URB write whose
       * only purpose is to carry the EOT bit.
       */
      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
      srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(0);
      inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
                       srcs, ARRAY_SIZE(srcs));
   } else {
      /* Dynamic vertex count: the final count must be written to the URB
       * as part of the thread-ending message.
       */
      fs_reg srcs[URB_LOGICAL_NUM_SRCS];
      srcs[URB_LOGICAL_SRC_HANDLE] = gs_payload().urb_handles;
      srcs[URB_LOGICAL_SRC_DATA] = this->final_gs_vertex_count;
      srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
      inst = abld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL, reg_undef,
                       srcs, ARRAY_SIZE(srcs));
   }
   inst->eot = true;
   inst->offset = 0;
}
|
|
|
|
|
|
|
2010-08-26 16:39:41 -07:00
|
|
|
|
/**
 * Assign the constant/uniform (CURB) push-constant registers.
 *
 * Computes how many registers of push constants (plain uniforms plus up to
 * four pushed UBO ranges) the thread payload carries, rewrites every
 * UNIFORM-file source to the fixed hardware GRF it lands in, optionally
 * emits the Gfx12.5+ LSC loads that fetch the constants at the top of the
 * program, and zeroes any push registers the state tracker asked to have
 * cleared.
 */
void
fs_visitor::assign_curb_setup()
{
   /* nr_params is in 32-bit units; 8 of them fill one GRF. */
   unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);

   /* Pushed UBO ranges are laid out after the plain uniforms; record each
    * range's starting offset in 32-bit units.
    */
   unsigned ubo_push_length = 0;
   unsigned ubo_push_start[4];
   for (int i = 0; i < 4; i++) {
      ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
      ubo_push_length += stage_prog_data->ubo_ranges[i].length;
   }

   prog_data->curb_read_length = uniform_push_length + ubo_push_length;

   /* Bitmask of which push-constant GRFs are actually referenced, used for
    * the zero_push_reg handling at the bottom.
    */
   uint64_t used = 0;
   bool is_compute = gl_shader_stage_is_compute(stage);

   if (is_compute && brw_cs_prog_data(prog_data)->uses_inline_data) {
      /* With COMPUTE_WALKER, we can push up to one register worth of data via
       * the inline data parameter in the COMPUTE_WALKER command itself.
       *
       * TODO: Support inline data and push at the same time.
       */
      assert(devinfo->verx10 >= 125);
      assert(uniform_push_length <= reg_unit(devinfo));
   } else if (is_compute && devinfo->verx10 >= 125) {
      assert(devinfo->has_lsc);
      /* Build at the very start of the program, before any other code. */
      fs_builder ubld = fs_builder(this, 1).exec_all().at(
         cfg->first_block(), cfg->first_block()->start());

      /* The base offset for our push data is passed in as R0.0[31:6]. We have
       * to mask off the bottom 6 bits.
       */
      fs_reg base_addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.AND(base_addr,
               retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
               brw_imm_ud(INTEL_MASK(31, 6)));

      /* On Gfx12-HP we load constants at the start of the program using A32
       * stateless messages.
       */
      for (unsigned i = 0; i < uniform_push_length;) {
         /* Limit ourselves to LSC HW limit of 8 GRFs (256bytes D32V64). */
         unsigned num_regs = MIN2(uniform_push_length - i, 8);
         assert(num_regs > 0);
         /* LSC transposed loads only support power-of-two sizes, so round
          * down and loop again for the remainder.
          */
         num_regs = 1 << util_logbase2(num_regs);

         fs_reg addr = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.ADD(addr, base_addr, brw_imm_ud(i * REG_SIZE));

         fs_reg srcs[4] = {
            brw_imm_ud(0), /* desc */
            brw_imm_ud(0), /* ex_desc */
            addr,          /* payload */
            fs_reg(),      /* payload2 */
         };

         /* Load directly into the payload registers where the pushed
          * constants are expected to live.
          */
         fs_reg dest = retype(brw_vec8_grf(payload().num_regs + i, 0),
                              BRW_REGISTER_TYPE_UD);
         fs_inst *send = ubld.emit(SHADER_OPCODE_SEND, dest, srcs, 4);

         send->sfid = GFX12_SFID_UGM;
         send->desc = lsc_msg_desc(devinfo, LSC_OP_LOAD,
                                   1 /* exec_size */,
                                   LSC_ADDR_SURFTYPE_FLAT,
                                   LSC_ADDR_SIZE_A32,
                                   1 /* num_coordinates */,
                                   LSC_DATA_SIZE_D32,
                                   num_regs * 8 /* num_channels */,
                                   true /* transpose */,
                                   LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS),
                                   true /* has_dest */);
         send->header_size = 0;
         send->mlen = lsc_msg_desc_src0_len(devinfo, send->desc);
         send->size_written =
            lsc_msg_desc_dest_len(devinfo, send->desc) * REG_SIZE;
         /* Keep the load from being optimized away or reordered. */
         send->send_is_volatile = true;

         i += num_regs;
      }

      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
            int constant_nr;
            if (inst->src[i].nr >= UBO_START) {
               /* constant_nr is in 32-bit units, the rest are in bytes */
               constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
                             inst->src[i].offset / 4;
            } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = push_constant_loc[uniform_nr];
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                * values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            assert(constant_nr / 8 < 64);
            used |= BITFIELD64_BIT(constant_nr / 8);

            struct brw_reg brw_reg = brw_vec1_grf(payload().num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);
            /* Carry the source modifiers over to the fixed register. */
            brw_reg.abs = inst->src[i].abs;
            brw_reg.negate = inst->src[i].negate;

            /* UNIFORM-file regs always have stride 0 (see fs_reg ctor). */
            assert(inst->src[i].stride == 0);
            inst->src[i] = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].offset % 4);
         }
      }
   }

   /* Zero out any used push registers that the API asked to have cleared
    * (e.g. for robustness), driven by a per-register bitmask pushed at
    * push_reg_mask_param.
    */
   uint64_t want_zero = used & stage_prog_data->zero_push_reg;
   if (want_zero) {
      fs_builder ubld = fs_builder(this, 8).exec_all().at(
         cfg->first_block(), cfg->first_block()->start());

      /* push_reg_mask_param is in 32-bit units */
      unsigned mask_param = stage_prog_data->push_reg_mask_param;
      struct brw_reg mask = brw_vec1_grf(payload().num_regs + mask_param / 8,
                                         mask_param % 8);

      fs_reg b32;
      for (unsigned i = 0; i < 64; i++) {
         /* Every 16 registers, expand the next 16 mask bits into sixteen
          * all-ones/all-zeros dwords: shift each bit into the word's sign
          * bit (the 0x01234567 vector immediate gives each channel its own
          * shift count), then arithmetic-shift right to replicate it.
          */
         if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) {
            fs_reg shifted = ubld.vgrf(BRW_REGISTER_TYPE_W, 2);
            ubld.SHL(horiz_offset(shifted, 8),
                     byte_offset(retype(mask, BRW_REGISTER_TYPE_W), i / 8),
                     brw_imm_v(0x01234567));
            ubld.SHL(shifted, horiz_offset(shifted, 8), brw_imm_w(8));

            fs_builder ubld16 = ubld.group(16, 0);
            b32 = ubld16.vgrf(BRW_REGISTER_TYPE_D);
            ubld16.group(16, 0).ASR(b32, shifted, brw_imm_w(15));
         }

         if (want_zero & BITFIELD64_BIT(i)) {
            assert(i < prog_data->curb_read_length);
            struct brw_reg push_reg =
               retype(brw_vec8_grf(payload().num_regs + i, 0),
                      BRW_REGISTER_TYPE_D);

            /* AND the register with its expanded mask dword: keeps the data
             * when the mask bit was set, zeroes it otherwise.
             */
            ubld.AND(push_reg, push_reg, component(b32, i % 16));
         }
      }

      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
   this->first_non_payload_grf = payload().num_regs + prog_data->curb_read_length;
}
|
|
|
|
|
|
|
2018-12-11 18:45:43 +01:00
|
|
|
|
/*
|
|
|
|
|
|
* Build up an array of indices into the urb_setup array that
|
|
|
|
|
|
* references the active entries of the urb_setup array.
|
|
|
|
|
|
* Used to accelerate walking the active entries of the urb_setup array
|
|
|
|
|
|
* on each upload.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
|
|
|
|
|
brw_compute_urb_setup_index(struct brw_wm_prog_data *wm_prog_data)
|
|
|
|
|
|
{
|
2021-10-29 12:56:22 -07:00
|
|
|
|
/* TODO(mesh): Review usage of this in the context of Mesh, we may want to
|
|
|
|
|
|
* skip per-primitive attributes here.
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
2018-12-11 18:45:43 +01:00
|
|
|
|
/* Make sure uint8_t is sufficient */
|
|
|
|
|
|
STATIC_ASSERT(VARYING_SLOT_MAX <= 0xff);
|
|
|
|
|
|
uint8_t index = 0;
|
|
|
|
|
|
for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) {
|
|
|
|
|
|
if (wm_prog_data->urb_setup[attr] >= 0) {
|
|
|
|
|
|
wm_prog_data->urb_setup_attribs[index++] = attr;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
wm_prog_data->urb_setup_attribs_count = index;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-07-18 09:15:15 -05:00
|
|
|
|
static void
|
2021-04-05 13:19:39 -07:00
|
|
|
|
calculate_urb_setup(const struct intel_device_info *devinfo,
|
2019-07-18 09:15:15 -05:00
|
|
|
|
const struct brw_wm_prog_key *key,
|
|
|
|
|
|
struct brw_wm_prog_data *prog_data,
|
2021-05-18 11:05:33 -07:00
|
|
|
|
const nir_shader *nir,
|
|
|
|
|
|
const struct brw_mue_map *mue_map)
|
2010-08-16 21:53:02 -07:00
|
|
|
|
{
|
2022-12-21 15:40:07 +01:00
|
|
|
|
memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup));
|
|
|
|
|
|
memset(prog_data->urb_setup_channel, 0, sizeof(prog_data->urb_setup_channel));
|
2010-08-16 21:53:02 -07:00
|
|
|
|
|
2022-12-21 15:40:07 +01:00
|
|
|
|
int urb_next = 0; /* in vec4s */
|
2021-05-18 10:17:43 -07:00
|
|
|
|
|
|
|
|
|
|
const uint64_t inputs_read =
|
|
|
|
|
|
nir->info.inputs_read & ~nir->info.per_primitive_inputs;
|
|
|
|
|
|
|
2010-08-16 21:53:02 -07:00
|
|
|
|
/* Figure out where each of the incoming setup attributes lands. */
|
2023-09-04 22:31:17 -07:00
|
|
|
|
if (key->mesh_input != BRW_NEVER) {
|
2022-01-27 00:50:52 -08:00
|
|
|
|
/* Per-Primitive Attributes are laid out by Hardware before the regular
|
|
|
|
|
|
* attributes, so order them like this to make easy later to map setup
|
|
|
|
|
|
* into real HW registers.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (nir->info.per_primitive_inputs) {
|
2022-02-25 16:35:26 +01:00
|
|
|
|
uint64_t per_prim_inputs_read =
|
|
|
|
|
|
nir->info.inputs_read & nir->info.per_primitive_inputs;
|
|
|
|
|
|
|
2022-04-12 15:06:16 +02:00
|
|
|
|
/* In Mesh, PRIMITIVE_SHADING_RATE, VIEWPORT and LAYER slots
|
|
|
|
|
|
* are always at the beginning, because they come from MUE
|
|
|
|
|
|
* Primitive Header, not Per-Primitive Attributes.
|
2022-02-25 16:35:26 +01:00
|
|
|
|
*/
|
|
|
|
|
|
const uint64_t primitive_header_bits = VARYING_BIT_VIEWPORT |
|
2022-04-12 15:06:16 +02:00
|
|
|
|
VARYING_BIT_LAYER |
|
|
|
|
|
|
VARYING_BIT_PRIMITIVE_SHADING_RATE;
|
2022-02-25 16:35:26 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
if (mue_map) {
|
|
|
|
|
|
unsigned per_prim_start_dw = mue_map->per_primitive_start_dw;
|
|
|
|
|
|
unsigned per_prim_size_dw = mue_map->per_primitive_pitch_dw;
|
2022-04-12 15:06:16 +02:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
bool reads_header = (per_prim_inputs_read & primitive_header_bits) != 0;
|
2022-02-25 16:35:26 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
if (reads_header || mue_map->user_data_in_primitive_header) {
|
|
|
|
|
|
/* Primitive Shading Rate, Layer and Viewport live in the same
|
|
|
|
|
|
* 4-dwords slot (psr is dword 0, layer is dword 1, and viewport
|
|
|
|
|
|
* is dword 2).
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (per_prim_inputs_read & VARYING_BIT_PRIMITIVE_SHADING_RATE)
|
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_PRIMITIVE_SHADING_RATE] = 0;
|
2022-02-25 16:35:26 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
if (per_prim_inputs_read & VARYING_BIT_LAYER)
|
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
|
2022-02-25 16:35:26 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
if (per_prim_inputs_read & VARYING_BIT_VIEWPORT)
|
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = 0;
|
2022-12-21 15:40:07 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
per_prim_inputs_read &= ~primitive_header_bits;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* If fs doesn't need primitive header, then it won't be made
|
|
|
|
|
|
* available through SBE_MESH, so we have to skip them when
|
|
|
|
|
|
* calculating offset from start of per-prim data.
|
|
|
|
|
|
*/
|
|
|
|
|
|
per_prim_start_dw += mue_map->per_primitive_header_size_dw;
|
|
|
|
|
|
per_prim_size_dw -= mue_map->per_primitive_header_size_dw;
|
|
|
|
|
|
}
|
2022-12-21 15:40:07 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
u_foreach_bit64(i, per_prim_inputs_read) {
|
|
|
|
|
|
int start = mue_map->start_dw[i];
|
2022-12-21 15:40:07 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
assert(start >= 0);
|
|
|
|
|
|
assert(mue_map->len_dw[i] > 0);
|
|
|
|
|
|
|
|
|
|
|
|
assert(unsigned(start) >= per_prim_start_dw);
|
|
|
|
|
|
unsigned pos_dw = unsigned(start) - per_prim_start_dw;
|
|
|
|
|
|
|
|
|
|
|
|
prog_data->urb_setup[i] = urb_next + pos_dw / 4;
|
|
|
|
|
|
prog_data->urb_setup_channel[i] = pos_dw % 4;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
urb_next = per_prim_size_dw / 4;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* With no MUE map, we never read the primitive header, and
|
|
|
|
|
|
* per-primitive attributes won't be packed either, so just lay
|
|
|
|
|
|
* them in varying order.
|
|
|
|
|
|
*/
|
|
|
|
|
|
per_prim_inputs_read &= ~primitive_header_bits;
|
|
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < VARYING_SLOT_MAX; i++) {
|
|
|
|
|
|
if (per_prim_inputs_read & BITFIELD64_BIT(i)) {
|
|
|
|
|
|
prog_data->urb_setup[i] = urb_next++;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2022-01-27 00:50:52 -08:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
/* The actual setup attributes later must be aligned to a full GRF. */
|
|
|
|
|
|
urb_next = ALIGN(urb_next, 2);
|
|
|
|
|
|
}
|
2022-01-27 00:50:52 -08:00
|
|
|
|
|
|
|
|
|
|
prog_data->num_per_primitive_inputs = urb_next;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2021-12-09 16:50:18 +01:00
|
|
|
|
const uint64_t clip_dist_bits = VARYING_BIT_CLIP_DIST0 |
|
|
|
|
|
|
VARYING_BIT_CLIP_DIST1;
|
|
|
|
|
|
|
2022-01-27 00:48:19 -08:00
|
|
|
|
uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
|
|
|
|
|
|
|
2021-12-09 16:50:18 +01:00
|
|
|
|
if (inputs_read & clip_dist_bits) {
|
2023-09-04 22:31:17 -07:00
|
|
|
|
assert(!mue_map || mue_map->per_vertex_header_size_dw > 8);
|
2021-12-09 16:50:18 +01:00
|
|
|
|
unique_fs_attrs &= ~clip_dist_bits;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
if (mue_map) {
|
|
|
|
|
|
unsigned per_vertex_start_dw = mue_map->per_vertex_start_dw;
|
|
|
|
|
|
unsigned per_vertex_size_dw = mue_map->per_vertex_pitch_dw;
|
2022-12-21 15:40:07 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
/* Per-Vertex header is available to fragment shader only if there's
|
|
|
|
|
|
* user data there.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (!mue_map->user_data_in_vertex_header) {
|
|
|
|
|
|
per_vertex_start_dw += 8;
|
|
|
|
|
|
per_vertex_size_dw -= 8;
|
|
|
|
|
|
}
|
2022-12-21 15:40:07 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
/* In Mesh, CLIP_DIST slots are always at the beginning, because
|
|
|
|
|
|
* they come from MUE Vertex Header, not Per-Vertex Attributes.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (inputs_read & clip_dist_bits) {
|
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next;
|
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next + 1;
|
|
|
|
|
|
} else if (mue_map && mue_map->per_vertex_header_size_dw > 8) {
|
|
|
|
|
|
/* Clip distances are in MUE, but we are not reading them in FS. */
|
|
|
|
|
|
per_vertex_start_dw += 8;
|
|
|
|
|
|
per_vertex_size_dw -= 8;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Per-Vertex attributes are laid out ordered. Because we always link
|
|
|
|
|
|
* Mesh and Fragment shaders, the which slots are written and read by
|
|
|
|
|
|
* each of them will match. */
|
|
|
|
|
|
u_foreach_bit64(i, unique_fs_attrs) {
|
|
|
|
|
|
int start = mue_map->start_dw[i];
|
2021-12-09 16:50:18 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
assert(start >= 0);
|
|
|
|
|
|
assert(mue_map->len_dw[i] > 0);
|
2022-12-21 15:40:07 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
assert(unsigned(start) >= per_vertex_start_dw);
|
|
|
|
|
|
unsigned pos_dw = unsigned(start) - per_vertex_start_dw;
|
2022-12-21 15:40:07 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
prog_data->urb_setup[i] = urb_next + pos_dw / 4;
|
|
|
|
|
|
prog_data->urb_setup_channel[i] = pos_dw % 4;
|
|
|
|
|
|
}
|
2022-12-21 15:40:07 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
urb_next += per_vertex_size_dw / 4;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* If we don't have an MUE map, just lay down the inputs the FS reads
|
|
|
|
|
|
* in varying order, as we do for the legacy pipeline.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (inputs_read & clip_dist_bits) {
|
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST0] = urb_next++;
|
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_CLIP_DIST1] = urb_next++;
|
|
|
|
|
|
}
|
2022-12-21 15:40:07 +01:00
|
|
|
|
|
2023-09-04 22:31:17 -07:00
|
|
|
|
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
|
|
|
|
|
|
if (unique_fs_attrs & BITFIELD64_BIT(i))
|
|
|
|
|
|
prog_data->urb_setup[i] = urb_next++;
|
|
|
|
|
|
}
|
2022-01-27 00:48:19 -08:00
|
|
|
|
}
|
|
|
|
|
|
} else if (devinfo->ver >= 6) {
|
2023-09-04 22:31:17 -07:00
|
|
|
|
assert(!nir->info.per_primitive_inputs);
|
|
|
|
|
|
|
2021-12-15 00:35:40 -08:00
|
|
|
|
uint64_t vue_header_bits =
|
|
|
|
|
|
VARYING_BIT_PSIZ | VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT;
|
|
|
|
|
|
|
|
|
|
|
|
uint64_t unique_fs_attrs = inputs_read & BRW_FS_VARYING_INPUT_MASK;
|
|
|
|
|
|
|
|
|
|
|
|
/* VUE header fields all live in the same URB slot, so we pass them
|
|
|
|
|
|
* as a single FS input attribute. We want to only count them once.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (inputs_read & vue_header_bits) {
|
|
|
|
|
|
unique_fs_attrs &= ~vue_header_bits;
|
|
|
|
|
|
unique_fs_attrs |= VARYING_BIT_PSIZ;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (util_bitcount64(unique_fs_attrs) <= 16) {
|
2013-09-03 12:15:53 -07:00
|
|
|
|
/* The SF/SBE pipeline stage can do arbitrary rearrangement of the
|
|
|
|
|
|
* first 16 varying inputs, so we can put them wherever we want.
|
|
|
|
|
|
* Just put them in order.
|
|
|
|
|
|
*
|
|
|
|
|
|
* This is useful because it means that (a) inputs not used by the
|
|
|
|
|
|
* fragment shader won't take up valuable register space, and (b) we
|
|
|
|
|
|
* won't have to recompile the fragment shader if it gets paired with
|
|
|
|
|
|
* a different vertex (or geometry) shader.
|
2021-12-15 00:35:40 -08:00
|
|
|
|
*
|
|
|
|
|
|
* VUE header fields share the same FS input attribute.
|
2013-09-03 12:15:53 -07:00
|
|
|
|
*/
|
2021-12-15 00:35:40 -08:00
|
|
|
|
if (inputs_read & vue_header_bits) {
|
|
|
|
|
|
if (inputs_read & VARYING_BIT_PSIZ)
|
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_PSIZ] = urb_next;
|
|
|
|
|
|
if (inputs_read & VARYING_BIT_LAYER)
|
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_LAYER] = urb_next;
|
|
|
|
|
|
if (inputs_read & VARYING_BIT_VIEWPORT)
|
|
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_VIEWPORT] = urb_next;
|
|
|
|
|
|
|
|
|
|
|
|
urb_next++;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2013-09-03 12:15:53 -07:00
|
|
|
|
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
|
2021-12-15 00:35:40 -08:00
|
|
|
|
if (inputs_read & BRW_FS_VARYING_INPUT_MASK & ~vue_header_bits &
|
2013-09-03 12:15:53 -07:00
|
|
|
|
BITFIELD64_BIT(i)) {
|
2014-05-14 00:17:03 -07:00
|
|
|
|
prog_data->urb_setup[i] = urb_next++;
|
2013-09-03 12:15:53 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* We have enough input varyings that the SF/SBE pipeline stage can't
|
|
|
|
|
|
* arbitrarily rearrange them to suit our whim; we have to put them
|
|
|
|
|
|
* in an order that matches the output of the previous pipeline stage
|
|
|
|
|
|
* (geometry or vertex shader).
|
|
|
|
|
|
*/
|
2021-04-30 01:00:51 -07:00
|
|
|
|
|
|
|
|
|
|
/* Re-compute the VUE map here in the case that the one coming from
|
|
|
|
|
|
* geometry has more than one position slot (used for Primitive
|
|
|
|
|
|
* Replication).
|
|
|
|
|
|
*/
|
2013-09-03 12:15:53 -07:00
|
|
|
|
struct brw_vue_map prev_stage_vue_map;
|
2015-04-17 12:52:00 -07:00
|
|
|
|
brw_compute_vue_map(devinfo, &prev_stage_vue_map,
|
i965: Don't re-layout varyings for separate shader programs.
Previously, our VUE map code always assigned slots to varyings
sequentially, in one contiguous block.
This was a bad fit for separate shaders - the GS input layout depended
or the VS output layout, so if we swapped out vertex shaders, we might
have to recompile the GS on the fly - which rather defeats the point of
using separate shader objects. (Tessellation would suffer from this
as well - we could have to recompile the HS, DS, and GS.)
Instead, this patch makes the VUE map for separate shaders use a fixed
layout, based on the input/output variable's location field. (This is
either specified by layout(location = ...) or assigned by the linker.)
Corresponding inputs/outputs will match up by location; if there's a
mismatch, we're allowed to have undefined behavior.
This may be less efficient - depending what locations were chosen, we
may have empty padding slots in the VUE. But applications presumably
use small consecutive integers for locations, so it hopefully won't be
much worse in practice.
3% of Dota 2 Reborn shaders are hurt, but only by 2 instructions.
This seems like a small price to pay for avoiding recompiles.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz>
2015-09-09 16:21:56 -07:00
|
|
|
|
key->input_slots_valid,
|
2018-09-21 16:07:38 -07:00
|
|
|
|
nir->info.separate_shader, 1);
|
i965: skip reading unused slots at the begining of the URB for the FS
We can start reading the URB at the first offset that contains varyings
that are actually read in the URB. We still need to make sure that we
read at least one varying to honor hardware requirements.
This helps alleviate a problem introduced with 99df02ca26f61 for
separate shader objects: without separate shader objects we assign
locations sequentially, however, since that commit we have changed the
method for SSO so that the VUE slot assigned depends on the number of
builtin slots plus the location assigned to the varying. This fixed
layout is intended to help SSO programs by avoiding on-the-fly recompiles
when swapping out shaders, however, it also means that if a varying uses
a large location number close to the maximum allowed by the SF/FS units
(31), then the offset introduced by the number of builtin slots can push
the location outside the range and trigger an assertion.
This problem is affecting at least the following CTS tests for
enhanced layouts:
KHR-GL45.enhanced_layouts.varying_array_components
KHR-GL45.enhanced_layouts.varying_array_locations
KHR-GL45.enhanced_layouts.varying_components
KHR-GL45.enhanced_layouts.varying_locations
which use SSO and the the location layout qualifier to select such
location numbers explicitly.
This change helps these tests because for SSO we always have to include
things such as VARYING_SLOT_CLIP_DIST{0,1} even if the fragment shader is
very unlikely to read them, so by doing this we free builtin slots from
the fixed VUE layout and we avoid the tests to crash in this scenario.
Of course, this is not a proper fix, we'd still run into problems if someone
tries to use an explicit max location and read gl_ViewportIndex, gl_LayerID or
gl_CullDistancein in the FS, but that would be a much less common bug and we
can probably wait to see if anyone actually runs into that situation in a real
world scenario before making the decision that more aggresive changes are
required to support this without reverting 99df02ca26f61.
v2:
- Add a debug message when we skip clip distances (Ilia)
- we also need to account for this when we compute the urb setup
for the fragment shader stage, so add a compiler util to compute
the first slot that we need to read from the URB instead of
replicating the logic in both places.
v3:
- Make the util more generic so it can account for all unused slots
at the beginning of the URB, that will make it more useful (Ken).
- Drop the debug message, it was not what Ilia was asking for.
Suggested-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-09-20 09:22:51 +02:00
|
|
|
|
|
2015-06-17 13:06:18 -07:00
|
|
|
|
int first_slot =
|
2021-05-18 10:17:43 -07:00
|
|
|
|
brw_compute_first_urb_slot_required(inputs_read,
|
i965: skip reading unused slots at the begining of the URB for the FS
We can start reading the URB at the first offset that contains varyings
that are actually read in the URB. We still need to make sure that we
read at least one varying to honor hardware requirements.
This helps alleviate a problem introduced with 99df02ca26f61 for
separate shader objects: without separate shader objects we assign
locations sequentially, however, since that commit we have changed the
method for SSO so that the VUE slot assigned depends on the number of
builtin slots plus the location assigned to the varying. This fixed
layout is intended to help SSO programs by avoiding on-the-fly recompiles
when swapping out shaders, however, it also means that if a varying uses
a large location number close to the maximum allowed by the SF/FS units
(31), then the offset introduced by the number of builtin slots can push
the location outside the range and trigger an assertion.
This problem is affecting at least the following CTS tests for
enhanced layouts:
KHR-GL45.enhanced_layouts.varying_array_components
KHR-GL45.enhanced_layouts.varying_array_locations
KHR-GL45.enhanced_layouts.varying_components
KHR-GL45.enhanced_layouts.varying_locations
which use SSO and the the location layout qualifier to select such
location numbers explicitly.
This change helps these tests because for SSO we always have to include
things such as VARYING_SLOT_CLIP_DIST{0,1} even if the fragment shader is
very unlikely to read them, so by doing this we free builtin slots from
the fixed VUE layout and we avoid the tests to crash in this scenario.
Of course, this is not a proper fix, we'd still run into problems if someone
tries to use an explicit max location and read gl_ViewportIndex, gl_LayerID or
gl_CullDistancein in the FS, but that would be a much less common bug and we
can probably wait to see if anyone actually runs into that situation in a real
world scenario before making the decision that more aggresive changes are
required to support this without reverting 99df02ca26f61.
v2:
- Add a debug message when we skip clip distances (Ilia)
- we also need to account for this when we compute the urb setup
for the fragment shader stage, so add a compiler util to compute
the first slot that we need to read from the URB instead of
replicating the logic in both places.
v3:
- Make the util more generic so it can account for all unused slots
at the beginning of the URB, that will make it more useful (Ken).
- Drop the debug message, it was not what Ilia was asking for.
Suggested-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2017-09-20 09:22:51 +02:00
|
|
|
|
&prev_stage_vue_map);
|
2015-06-17 13:06:18 -07:00
|
|
|
|
|
2013-09-03 12:15:53 -07:00
|
|
|
|
assert(prev_stage_vue_map.num_slots <= first_slot + 32);
|
|
|
|
|
|
for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
|
|
|
|
|
|
slot++) {
|
|
|
|
|
|
int varying = prev_stage_vue_map.slot_to_varying[slot];
|
2015-10-26 01:03:12 -07:00
|
|
|
|
if (varying != BRW_VARYING_SLOT_PAD &&
|
2021-05-18 10:17:43 -07:00
|
|
|
|
(inputs_read & BRW_FS_VARYING_INPUT_MASK &
|
2013-09-03 12:15:53 -07:00
|
|
|
|
BITFIELD64_BIT(varying))) {
|
2014-05-14 00:17:03 -07:00
|
|
|
|
prog_data->urb_setup[varying] = slot - first_slot;
|
2013-09-03 12:15:53 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
urb_next = prev_stage_vue_map.num_slots - first_slot;
|
2010-10-01 12:15:48 -07:00
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
|
2013-02-23 07:22:01 -08:00
|
|
|
|
for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
|
2012-07-19 22:00:16 +02:00
|
|
|
|
/* Point size is packed into the header, not as a general attribute */
|
2013-02-23 07:22:01 -08:00
|
|
|
|
if (i == VARYING_SLOT_PSIZ)
|
2012-07-19 22:00:16 +02:00
|
|
|
|
continue;
|
|
|
|
|
|
|
2014-05-13 21:06:00 -07:00
|
|
|
|
if (key->input_slots_valid & BITFIELD64_BIT(i)) {
|
2012-07-19 22:00:16 +02:00
|
|
|
|
/* The back color slot is skipped when the front color is
|
|
|
|
|
|
* also written to. In addition, some slots can be
|
|
|
|
|
|
* written in the vertex shader and not read in the
|
|
|
|
|
|
* fragment shader. So the register number must always be
|
|
|
|
|
|
* incremented, mapped or not.
|
|
|
|
|
|
*/
|
2013-02-23 08:28:18 -08:00
|
|
|
|
if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
|
2014-05-14 00:17:03 -07:00
|
|
|
|
prog_data->urb_setup[i] = urb_next;
|
2012-07-19 22:00:16 +02:00
|
|
|
|
urb_next++;
|
2010-10-01 12:15:48 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2012-02-27 15:46:32 +08:00
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
|
* It's a FS only attribute, and we did interpolation for this attribute
|
|
|
|
|
|
* in SF thread. So, count it here, too.
|
|
|
|
|
|
*
|
|
|
|
|
|
* See compile_sf_prog() for more info.
|
|
|
|
|
|
*/
|
2021-05-18 10:17:43 -07:00
|
|
|
|
if (inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
|
2014-05-14 00:17:03 -07:00
|
|
|
|
prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
|
2010-10-01 12:15:48 -07:00
|
|
|
|
}
|
2010-08-16 21:53:02 -07:00
|
|
|
|
|
2021-05-18 10:17:43 -07:00
|
|
|
|
prog_data->num_varying_inputs = urb_next - prog_data->num_per_primitive_inputs;
|
|
|
|
|
|
prog_data->inputs = inputs_read;
|
2018-12-11 18:45:43 +01:00
|
|
|
|
|
|
|
|
|
|
brw_compute_urb_setup_index(prog_data);
|
2010-10-01 12:15:48 -07:00
|
|
|
|
}
|
2010-08-16 21:53:02 -07:00
|
|
|
|
|
2010-10-01 12:15:48 -07:00
|
|
|
|
/**
 * Rewrite FS ATTR-file sources to the hardware GRFs that hold the
 * interpolated varyings, and advance first_non_payload_grf past the space
 * those inputs occupy.
 *
 * Must run after push-constant layout is final, since the attribute
 * registers start right after the payload and the CURBE reads.
 */
void
fs_visitor::assign_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   /* First GRF of attribute data: fixed payload followed by push constants. */
   int urb_start = payload().num_regs + prog_data->base.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == ATTR) {
            /* ATTR regs in the FS are in units of logical scalar inputs each
             * of which consumes half of a GRF register.
             */
            assert(inst->src[i].offset < REG_SIZE / 2);
            /* Two logical inputs per GRF: nr/2 picks the GRF, nr%2 picks
             * which half, and the source's own byte offset is added on top.
             */
            const unsigned grf = urb_start + inst->src[i].nr / 2;
            const unsigned offset = (inst->src[i].nr % 2) * (REG_SIZE / 2) +
                                    inst->src[i].offset;
            /* stride == 0 means a replicated scalar (width 1); otherwise
             * clamp the region width to 8 channels.
             */
            const unsigned width = inst->src[i].stride == 0 ?
                                   1 : MIN2(inst->exec_size, 8);
            struct brw_reg reg = stride(
               byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                           offset),
               width * inst->src[i].stride,
               width, inst->src[i].stride);
            /* Preserve the source modifiers from the virtual register. */
            reg.abs = inst->src[i].abs;
            reg.negate = inst->src[i].negate;
            inst->src[i] = reg;
         }
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf += prog_data->num_varying_inputs * 2;

   /* Unlike regular attributes, per-primitive attributes have all 4 channels
    * in the same slot, so each GRF can store two slots.
    */
   assert(prog_data->num_per_primitive_inputs % 2 == 0);
   this->first_non_payload_grf += prog_data->num_per_primitive_inputs / 2;
}
|
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
|
/**
 * Rewrite any ATTR-file sources of \p inst to the hardware GRF region they
 * land in, for stages whose attributes live directly after the payload and
 * push constants (VS/TCS/TES/GS).
 */
void
fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
{
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file == ATTR) {
         /* Attribute GRFs start after the fixed payload and the CURBE
          * reads; src[i].nr selects the attribute register and the byte
          * offset may carry us into a following GRF.
          */
         int grf = payload().num_regs +
            prog_data->curb_read_length +
            inst->src[i].nr +
            inst->src[i].offset / REG_SIZE;

         /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
          *
          * VertStride must be used to cross GRF register boundaries. This
          * rule implies that elements within a 'Width' cannot cross GRF
          * boundaries.
          *
          * So, for registers that are large enough, we have to split the exec
          * size in two and trust the compression state to sort it out.
          */
         unsigned total_size = inst->exec_size *
                               inst->src[i].stride *
                               type_sz(inst->src[i].type);

         /* A source region can span at most two GRFs. */
         assert(total_size <= 2 * REG_SIZE);
         const unsigned exec_size =
            (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;

         /* stride == 0 is a replicated scalar: region width must be 1. */
         unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
         struct brw_reg reg =
            stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                               inst->src[i].offset % REG_SIZE),
                   exec_size * inst->src[i].stride,
                   width, inst->src[i].stride);
         /* Preserve the source modifiers from the virtual register. */
         reg.abs = inst->src[i].abs;
         reg.negate = inst->src[i].negate;

         inst->src[i] = reg;
      }
   }
}
|
|
|
|
|
|
|
2014-10-27 22:42:50 -07:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::assign_vs_urb_setup()
|
|
|
|
|
|
{
|
2016-09-08 23:48:51 -07:00
|
|
|
|
struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
|
2014-10-27 22:42:50 -07:00
|
|
|
|
|
|
|
|
|
|
assert(stage == MESA_SHADER_VERTEX);
|
|
|
|
|
|
|
|
|
|
|
|
/* Each attribute is 4 regs. */
|
2016-04-04 12:47:57 +02:00
|
|
|
|
this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
|
2014-10-27 22:42:50 -07:00
|
|
|
|
|
|
|
|
|
|
assert(vs_prog_data->base.urb_read_length <= 15);
|
|
|
|
|
|
|
|
|
|
|
|
/* Rewrite all ATTR file references to the hw grf that they land in. */
|
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2015-03-11 23:14:31 -07:00
|
|
|
|
convert_attr_sources_to_hw_regs(inst);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-11-14 17:40:43 -08:00
|
|
|
|
void
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
|
fs_visitor::assign_tcs_urb_setup()
|
2015-11-14 17:40:43 -08:00
|
|
|
|
{
|
|
|
|
|
|
assert(stage == MESA_SHADER_TESS_CTRL);
|
|
|
|
|
|
|
|
|
|
|
|
/* Rewrite all ATTR file references to HW_REGs. */
|
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
|
|
|
|
|
convert_attr_sources_to_hw_regs(inst);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-11-10 14:35:27 -08:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::assign_tes_urb_setup()
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(stage == MESA_SHADER_TESS_EVAL);
|
|
|
|
|
|
|
2016-09-08 23:48:51 -07:00
|
|
|
|
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
|
2015-11-10 14:35:27 -08:00
|
|
|
|
|
|
|
|
|
|
first_non_payload_grf += 8 * vue_prog_data->urb_read_length;
|
|
|
|
|
|
|
|
|
|
|
|
/* Rewrite all ATTR file references to HW_REGs. */
|
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
|
|
|
|
|
convert_attr_sources_to_hw_regs(inst);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::assign_gs_urb_setup()
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(stage == MESA_SHADER_GEOMETRY);
|
|
|
|
|
|
|
2016-09-08 23:48:51 -07:00
|
|
|
|
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
|
|
first_non_payload_grf +=
|
2017-05-08 09:20:21 -07:00
|
|
|
|
8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2015-10-26 17:52:57 -07:00
|
|
|
|
/* Rewrite all ATTR file references to GRFs. */
|
2015-03-11 23:14:31 -07:00
|
|
|
|
convert_attr_sources_to_hw_regs(inst);
|
2014-10-27 22:42:50 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
2010-10-13 20:17:15 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Split large virtual GRFs into separate components if we can.
|
|
|
|
|
|
*
|
2021-11-09 14:38:48 -06:00
|
|
|
|
* This pass aggressively splits VGRFs into as small a chunks as possible,
|
|
|
|
|
|
* down to single registers if it can. If no VGRFs can be split, we return
|
|
|
|
|
|
* false so this pass can safely be used inside an optimization loop. We
|
|
|
|
|
|
* want to split, because virtual GRFs are what we register allocate and
|
|
|
|
|
|
* spill (due to contiguousness requirements for some instructions), and
|
|
|
|
|
|
* they're what we naturally generate in the codegen process, but most
|
|
|
|
|
|
* virtual GRFs don't actually need to be contiguous sets of GRFs. If we
|
|
|
|
|
|
* split, we'll end up with reduced live intervals and better dead code
|
|
|
|
|
|
* elimination and coalescing.
|
2010-10-13 20:17:15 -07:00
|
|
|
|
*/
|
2021-11-09 14:38:48 -06:00
|
|
|
|
bool
|
2010-10-13 20:17:15 -07:00
|
|
|
|
fs_visitor::split_virtual_grfs()
|
|
|
|
|
|
{
|
2016-10-15 03:18:36 -07:00
|
|
|
|
/* Compact the register file so we eliminate dead vgrfs. This
|
|
|
|
|
|
* only defines split points for live registers, so if we have
|
|
|
|
|
|
* too large dead registers they will hit assertions later.
|
|
|
|
|
|
*/
|
|
|
|
|
|
compact_virtual_grfs();
|
|
|
|
|
|
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
unsigned num_vars = this->alloc.count;
|
2010-10-13 20:17:15 -07:00
|
|
|
|
|
2014-08-19 13:57:11 -07:00
|
|
|
|
/* Count the total number of registers */
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
unsigned reg_count = 0;
|
|
|
|
|
|
unsigned vgrf_to_reg[num_vars];
|
|
|
|
|
|
for (unsigned i = 0; i < num_vars; i++) {
|
2014-08-19 13:57:11 -07:00
|
|
|
|
vgrf_to_reg[i] = reg_count;
|
2015-02-10 15:51:34 +02:00
|
|
|
|
reg_count += alloc.sizes[i];
|
2014-08-19 13:57:11 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* An array of "split points". For each register slot, this indicates
|
|
|
|
|
|
* if this slot can be separated from the previous slot. Every time an
|
|
|
|
|
|
* instruction uses multiple elements of a register (as a source or
|
|
|
|
|
|
* destination), we mark the used slots as inseparable. Then we go
|
|
|
|
|
|
* through and split the registers into the smallest pieces we can.
|
|
|
|
|
|
*/
|
2019-07-22 00:28:27 -05:00
|
|
|
|
bool *split_points = new bool[reg_count];
|
|
|
|
|
|
memset(split_points, 0, reg_count * sizeof(*split_points));
|
2014-08-19 13:57:11 -07:00
|
|
|
|
|
|
|
|
|
|
/* Mark all used registers as fully splittable */
|
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (inst->dst.file == VGRF) {
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
unsigned reg = vgrf_to_reg[inst->dst.nr];
|
2015-10-26 04:35:14 -07:00
|
|
|
|
for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
|
2014-08-19 13:57:11 -07:00
|
|
|
|
split_points[reg + j] = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
for (unsigned i = 0; i < inst->sources; i++) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (inst->src[i].file == VGRF) {
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
unsigned reg = vgrf_to_reg[inst->src[i].nr];
|
2015-10-26 04:35:14 -07:00
|
|
|
|
for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
|
2014-08-19 13:57:11 -07:00
|
|
|
|
split_points[reg + j] = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2019-09-06 19:34:42 -05:00
|
|
|
|
/* We fix up undef instructions later */
|
|
|
|
|
|
if (inst->opcode == SHADER_OPCODE_UNDEF) {
|
|
|
|
|
|
assert(inst->dst.file == VGRF);
|
|
|
|
|
|
continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (inst->dst.file == VGRF) {
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
unsigned reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
|
2016-09-07 16:59:35 -07:00
|
|
|
|
for (unsigned j = 1; j < regs_written(inst); j++)
|
2014-08-19 13:57:11 -07:00
|
|
|
|
split_points[reg + j] = false;
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
for (unsigned i = 0; i < inst->sources; i++) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (inst->src[i].file == VGRF) {
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
unsigned reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
|
2016-09-07 16:59:35 -07:00
|
|
|
|
for (unsigned j = 1; j < regs_read(inst, i); j++)
|
2014-08-19 13:57:11 -07:00
|
|
|
|
split_points[reg + j] = false;
|
2013-08-28 11:22:01 -07:00
|
|
|
|
}
|
2013-03-19 15:28:11 -07:00
|
|
|
|
}
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2021-11-09 14:37:57 -06:00
|
|
|
|
/* Bitset of which registers have been split */
|
|
|
|
|
|
bool *vgrf_has_split = new bool[num_vars];
|
|
|
|
|
|
memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
|
|
|
|
|
|
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
unsigned *new_virtual_grf = new unsigned[reg_count];
|
|
|
|
|
|
unsigned *new_reg_offset = new unsigned[reg_count];
|
2014-08-19 13:57:11 -07:00
|
|
|
|
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
unsigned reg = 0;
|
2021-11-09 14:37:57 -06:00
|
|
|
|
bool has_splits = false;
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
for (unsigned i = 0; i < num_vars; i++) {
|
2014-08-19 13:57:11 -07:00
|
|
|
|
/* The first one should always be 0 as a quick sanity check. */
|
|
|
|
|
|
assert(split_points[reg] == false);
|
|
|
|
|
|
|
|
|
|
|
|
/* j = 0 case */
|
|
|
|
|
|
new_reg_offset[reg] = 0;
|
|
|
|
|
|
reg++;
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
unsigned offset = 1;
|
2014-08-19 13:57:11 -07:00
|
|
|
|
|
|
|
|
|
|
/* j > 0 case */
|
2015-02-10 15:51:34 +02:00
|
|
|
|
for (unsigned j = 1; j < alloc.sizes[i]; j++) {
|
2014-08-19 13:57:11 -07:00
|
|
|
|
/* If this is a split point, reset the offset to 0 and allocate a
|
|
|
|
|
|
* new virtual GRF for the previous offset many registers
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (split_points[reg]) {
|
2021-11-09 14:37:57 -06:00
|
|
|
|
has_splits = true;
|
|
|
|
|
|
vgrf_has_split[i] = true;
|
2022-02-21 21:42:05 -08:00
|
|
|
|
assert(offset <= MAX_VGRF_SIZE(devinfo));
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
unsigned grf = alloc.allocate(offset);
|
|
|
|
|
|
for (unsigned k = reg - offset; k < reg; k++)
|
2014-08-19 13:57:11 -07:00
|
|
|
|
new_virtual_grf[k] = grf;
|
|
|
|
|
|
offset = 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
new_reg_offset[reg] = offset;
|
|
|
|
|
|
offset++;
|
|
|
|
|
|
reg++;
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
2014-08-19 13:57:11 -07:00
|
|
|
|
|
|
|
|
|
|
/* The last one gets the original register number */
|
2022-02-21 21:42:05 -08:00
|
|
|
|
assert(offset <= MAX_VGRF_SIZE(devinfo));
|
2015-02-10 15:51:34 +02:00
|
|
|
|
alloc.sizes[i] = offset;
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
for (unsigned k = reg - offset; k < reg; k++)
|
2014-08-19 13:57:11 -07:00
|
|
|
|
new_virtual_grf[k] = i;
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
2014-08-19 13:57:11 -07:00
|
|
|
|
assert(reg == reg_count);
|
2010-10-13 20:17:15 -07:00
|
|
|
|
|
2021-11-09 14:38:48 -06:00
|
|
|
|
bool progress;
|
|
|
|
|
|
if (!has_splits) {
|
|
|
|
|
|
progress = false;
|
2021-11-09 14:37:57 -06:00
|
|
|
|
goto cleanup;
|
2021-11-09 14:38:48 -06:00
|
|
|
|
}
|
2021-11-09 14:37:57 -06:00
|
|
|
|
|
2019-09-06 19:34:42 -05:00
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
|
|
|
|
|
if (inst->opcode == SHADER_OPCODE_UNDEF) {
|
2021-11-09 14:37:57 -06:00
|
|
|
|
assert(inst->dst.file == VGRF);
|
|
|
|
|
|
if (vgrf_has_split[inst->dst.nr]) {
|
|
|
|
|
|
const fs_builder ibld(this, block, inst);
|
|
|
|
|
|
assert(inst->size_written % REG_SIZE == 0);
|
2022-09-16 23:35:08 +03:00
|
|
|
|
unsigned reg_offset = inst->dst.offset / REG_SIZE;
|
|
|
|
|
|
unsigned size_written = 0;
|
|
|
|
|
|
while (size_written < inst->size_written) {
|
|
|
|
|
|
reg = vgrf_to_reg[inst->dst.nr] + reg_offset + size_written / REG_SIZE;
|
|
|
|
|
|
fs_inst *undef =
|
|
|
|
|
|
ibld.UNDEF(
|
|
|
|
|
|
byte_offset(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type),
|
|
|
|
|
|
new_reg_offset[reg] * REG_SIZE));
|
|
|
|
|
|
undef->size_written =
|
|
|
|
|
|
MIN2(inst->size_written - size_written, undef->size_written);
|
|
|
|
|
|
assert(undef->size_written % REG_SIZE == 0);
|
|
|
|
|
|
size_written += undef->size_written;
|
2021-11-09 14:37:57 -06:00
|
|
|
|
}
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
reg = vgrf_to_reg[inst->dst.nr];
|
|
|
|
|
|
assert(new_reg_offset[reg] == 0);
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
assert(new_virtual_grf[reg] == inst->dst.nr);
|
2019-09-06 19:34:42 -05:00
|
|
|
|
}
|
|
|
|
|
|
continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (inst->dst.file == VGRF) {
|
2016-09-01 12:42:20 -07:00
|
|
|
|
reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
|
2021-11-09 14:37:57 -06:00
|
|
|
|
if (vgrf_has_split[inst->dst.nr]) {
|
|
|
|
|
|
inst->dst.nr = new_virtual_grf[reg];
|
|
|
|
|
|
inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
|
|
|
|
|
|
inst->dst.offset % REG_SIZE;
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
|
2021-11-09 14:37:57 -06:00
|
|
|
|
} else {
|
|
|
|
|
|
assert(new_reg_offset[reg] == inst->dst.offset / REG_SIZE);
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
assert(new_virtual_grf[reg] == inst->dst.nr);
|
2021-11-09 14:37:57 -06:00
|
|
|
|
}
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
for (unsigned i = 0; i < inst->sources; i++) {
|
2021-11-09 14:37:57 -06:00
|
|
|
|
if (inst->src[i].file != VGRF)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
|
|
|
|
|
|
if (vgrf_has_split[inst->src[i].nr]) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
inst->src[i].nr = new_virtual_grf[reg];
|
2016-09-01 12:42:20 -07:00
|
|
|
|
inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
|
|
|
|
|
|
inst->src[i].offset % REG_SIZE;
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
assert(new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
|
2021-11-09 14:37:57 -06:00
|
|
|
|
} else {
|
|
|
|
|
|
assert(new_reg_offset[reg] == inst->src[i].offset / REG_SIZE);
|
intel/fs: Preserve unsignedness in fs_visitor::split_virtual_grfs
GCC 12.2.0 warns:
../src/intel/compiler/brw_fs.cpp: In member function ‘bool fs_visitor::
split_virtual_grfs()’:
../src/intel/compiler/brw_fs.cpp:2199:10: warning: ‘void* memset(void*, int,
size_t)’ specified size between 18446744071562067968 and 18446744073709551615
exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=]
2199 | memset(vgrf_has_split, 0, num_vars * sizeof(*vgrf_has_split));
`num_vars` is an `int` but gets assigned the value of `this->alloc.count`,
which is an `unsigned int`. Thus, `num_vars` will be negative if
`this->alloc.count` is larger than int max value. Converting that negative
`int` to a `size_t`, which `memset` expects, then blows it up to a huge
positive value.
Simply turning `num_vars` into an `unsigned int` would be enough to fix this
specific problem, but there are many other instances where an `unsigned int`
gets assigned to an `int` for no good reason in this function. Some of which
the compiler warns about now, some of which it doesn't warn about.
This turns all variables in `fs_visitor::split_virtual_grfs`, which should
reasonably be unsigned, into `unsigned int`s. While at it, a few now pointless
casts are removed.
Reviewed-by: Emma Anholt <emma@anholt.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/19423>
2022-10-31 14:22:09 +01:00
|
|
|
|
assert(new_virtual_grf[reg] == inst->src[i].nr);
|
2014-08-19 13:57:11 -07:00
|
|
|
|
}
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2016-03-13 19:26:37 -07:00
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
|
2019-07-22 00:28:27 -05:00
|
|
|
|
|
2021-11-09 14:38:48 -06:00
|
|
|
|
progress = true;
|
|
|
|
|
|
|
2021-11-09 14:37:57 -06:00
|
|
|
|
cleanup:
|
2019-07-22 00:28:27 -05:00
|
|
|
|
delete[] split_points;
|
2021-11-09 14:37:57 -06:00
|
|
|
|
delete[] vgrf_has_split;
|
2019-07-22 00:28:27 -05:00
|
|
|
|
delete[] new_virtual_grf;
|
|
|
|
|
|
delete[] new_reg_offset;
|
2021-11-09 14:38:48 -06:00
|
|
|
|
|
|
|
|
|
|
return progress;
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2012-11-01 22:04:50 -07:00
|
|
|
|
/**
|
2016-03-09 17:46:16 -08:00
|
|
|
|
* Remove unused virtual GRFs and compact the vgrf_* arrays.
|
2012-11-01 22:04:50 -07:00
|
|
|
|
*
|
|
|
|
|
|
* During code generation, we create tons of temporary variables, many of
|
|
|
|
|
|
* which get immediately killed and are never used again. Yet, in later
|
|
|
|
|
|
* optimization and analysis passes, such as compute_live_intervals, we need
|
|
|
|
|
|
* to loop over all the virtual GRFs. Compacting them can save a lot of
|
|
|
|
|
|
* overhead.
|
|
|
|
|
|
*/
|
2014-09-16 13:14:09 -07:00
|
|
|
|
bool
|
2012-11-01 22:04:50 -07:00
|
|
|
|
fs_visitor::compact_virtual_grfs()
|
|
|
|
|
|
{
|
2014-09-16 13:14:09 -07:00
|
|
|
|
bool progress = false;
|
2019-07-22 00:28:27 -05:00
|
|
|
|
int *remap_table = new int[this->alloc.count];
|
|
|
|
|
|
memset(remap_table, -1, this->alloc.count * sizeof(int));
|
2012-11-01 22:04:50 -07:00
|
|
|
|
|
2014-08-19 16:11:36 -07:00
|
|
|
|
/* Mark which virtual GRFs are used. */
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst(block, const fs_inst, inst, cfg) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (inst->dst.file == VGRF)
|
2015-10-26 04:35:14 -07:00
|
|
|
|
remap_table[inst->dst.nr] = 0;
|
2012-11-01 22:04:50 -07:00
|
|
|
|
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (inst->src[i].file == VGRF)
|
2015-10-26 04:35:14 -07:00
|
|
|
|
remap_table[inst->src[i].nr] = 0;
|
2012-11-01 22:04:50 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Compact the GRF arrays. */
|
|
|
|
|
|
int new_index = 0;
|
2015-02-10 15:51:34 +02:00
|
|
|
|
for (unsigned i = 0; i < this->alloc.count; i++) {
|
2014-09-16 13:14:09 -07:00
|
|
|
|
if (remap_table[i] == -1) {
|
|
|
|
|
|
/* We just found an unused register. This means that we are
|
|
|
|
|
|
* actually going to compact something.
|
|
|
|
|
|
*/
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
} else {
|
2012-11-01 22:04:50 -07:00
|
|
|
|
remap_table[i] = new_index;
|
2015-02-10 15:51:34 +02:00
|
|
|
|
alloc.sizes[new_index] = alloc.sizes[i];
|
2016-03-13 19:26:37 -07:00
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES);
|
2012-11-01 22:04:50 -07:00
|
|
|
|
++new_index;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-02-10 15:51:34 +02:00
|
|
|
|
this->alloc.count = new_index;
|
2012-11-01 22:04:50 -07:00
|
|
|
|
|
|
|
|
|
|
/* Patch all the instructions to use the newly renumbered registers */
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (inst->dst.file == VGRF)
|
2015-10-26 04:35:14 -07:00
|
|
|
|
inst->dst.nr = remap_table[inst->dst.nr];
|
2012-11-01 22:04:50 -07:00
|
|
|
|
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (inst->src[i].file == VGRF)
|
2015-10-26 04:35:14 -07:00
|
|
|
|
inst->src[i].nr = remap_table[inst->src[i].nr];
|
2012-11-01 22:04:50 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2014-08-10 19:03:34 -07:00
|
|
|
|
|
2015-04-06 17:44:40 -07:00
|
|
|
|
/* Patch all the references to delta_xy, since they're used in register
|
|
|
|
|
|
* allocation. If they're unused, switch them to BAD_FILE so we don't
|
|
|
|
|
|
* think some random VGRF is delta_xy.
|
2014-08-10 19:03:34 -07:00
|
|
|
|
*/
|
2015-04-06 17:44:40 -07:00
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (delta_xy[i].file == VGRF) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
if (remap_table[delta_xy[i].nr] != -1) {
|
|
|
|
|
|
delta_xy[i].nr = remap_table[delta_xy[i].nr];
|
2014-09-12 17:45:30 -07:00
|
|
|
|
} else {
|
2015-04-06 17:44:40 -07:00
|
|
|
|
delta_xy[i].file = BAD_FILE;
|
2014-09-12 17:45:30 -07:00
|
|
|
|
}
|
2014-08-10 19:03:34 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2014-09-16 13:14:09 -07:00
|
|
|
|
|
2019-07-22 00:28:27 -05:00
|
|
|
|
delete[] remap_table;
|
|
|
|
|
|
|
2014-09-16 13:14:09 -07:00
|
|
|
|
return progress;
|
2012-11-01 22:04:50 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2022-08-30 00:47:32 -07:00
|
|
|
|
int
|
|
|
|
|
|
brw_get_subgroup_id_param_index(const intel_device_info *devinfo,
|
|
|
|
|
|
const brw_stage_prog_data *prog_data)
|
2017-09-29 12:22:48 -07:00
|
|
|
|
{
|
|
|
|
|
|
if (prog_data->nr_params == 0)
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
2021-03-29 13:43:47 -07:00
|
|
|
|
if (devinfo->verx10 >= 125)
|
2020-06-16 23:06:25 -05:00
|
|
|
|
return -1;
|
|
|
|
|
|
|
2017-09-29 12:22:48 -07:00
|
|
|
|
/* The local thread id is always the last parameter in the list */
|
|
|
|
|
|
uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
|
2017-08-24 11:40:31 -07:00
|
|
|
|
if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
|
2017-09-29 12:22:48 -07:00
|
|
|
|
return prog_data->nr_params - 1;
|
|
|
|
|
|
|
|
|
|
|
|
return -1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-08-18 17:04:53 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Assign UNIFORM file registers to either push constants or pull constants.
|
2012-11-08 16:06:24 -08:00
|
|
|
|
*
|
2015-08-18 17:04:53 -07:00
|
|
|
|
* We allow a fragment shader to have more than the specified minimum
|
|
|
|
|
|
* maximum number of fragment shader uniform components (64). If
|
|
|
|
|
|
* there are too many of these, they'd fill up all of register space.
|
|
|
|
|
|
* So, this will push some of them out to the pull constant buffer and
|
2015-12-08 17:34:38 -08:00
|
|
|
|
* update the program to load them.
|
2012-11-08 16:06:24 -08:00
|
|
|
|
*/
|
|
|
|
|
|
void
fs_visitor::assign_constant_locations()
{
   /* Only the first compile gets to decide on locations. */
   if (push_constant_loc)
      return;

   /* Identity mapping: every uniform keeps its own push-constant slot. */
   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
   for (unsigned u = 0; u < uniforms; u++)
      push_constant_loc[u] = u;

   /* Now that we know how many regular uniforms we'll push, reduce the
    * UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
    */
   /* For gen4/5:
    * Only allow 16 registers (128 uniform components) as push constants.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c/crocus_state.c
    */
   const unsigned max_push_length = compiler->devinfo->ver < 6 ? 16 : 64;
   /* Registers already consumed by regular uniforms (8 dwords/register). */
   unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
   for (int i = 0; i < 4; i++) {
      struct brw_ubo_range *range = &prog_data->ubo_ranges[i];

      /* Trim this range so the running total never exceeds the limit; once
       * the budget is exhausted, subsequent ranges end up with length 0.
       */
      if (push_length + range->length > max_push_length)
         range->length = max_push_length - push_length;

      push_length += range->length;
   }
   assert(push_length <= max_push_length);
}
|
|
|
|
|
|
|
2017-06-02 09:54:31 -07:00
|
|
|
|
/**
 * Decide whether a UNIFORM source must be fetched as a pull constant and,
 * if so, compute where to fetch it from.
 *
 * Returns false when \p src is satisfied by push data.  On success fills
 * \p out_surf_index with the UBO's binding-table block and \p out_pull_index
 * with the offset within that buffer in 4-byte units, and records that this
 * program performs UBO pulls.
 */
bool
fs_visitor::get_pull_locs(const fs_reg &src,
                          unsigned *out_surf_index,
                          unsigned *out_pull_index)
{
   assert(src.file == UNIFORM);

   /* Uniform numbers below UBO_START are regular uniforms, always pushed. */
   if (src.nr < UBO_START)
      return false;

   const struct brw_ubo_range *range =
      &prog_data->ubo_ranges[src.nr - UBO_START];

   /* If this access is in our (reduced) range, use the push data. */
   if (src.offset / 32 < range->length)
      return false;

   *out_surf_index = range->block;
   *out_pull_index = (32 * range->start + src.offset) / 4;

   prog_data->has_ubo_pull = true;

   return true;
}
|
|
|
|
|
|
|
i965/fs: Split pull parameter decision making from mechanical demoting.
move_uniform_array_access_to_pull_constants() and setup_pull_constants()
both have two parts:
1. Decide which UNIFORM registers to demote to pull constants, and
assign locations.
2. Mechanically rewrite the instruction stream to pull the uniform
value into a temporary VGRF and use that, eliminating the UNIFORM
file access.
In order to support pull constants in SIMD16 mode, we will need to make
decisions exactly once, but rewrite both instruction streams.
Separating these two tasks will make this easier.
This patch introduces a new helper, demote_pull_constants(), which
takes care of rewriting the instruction stream, in both cases.
For the moment, a single invocation of demote_pull_constants can't
safely handle both reladdr and non-reladdr tasks, since the two callers
still use different names for uniforms due to remove_dead_constants()
remapping of things. So, we get an ugly boolean parameter saying
which to do. This will go away.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-10 13:14:03 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
|
|
|
|
|
|
* or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
|
|
|
|
|
|
*/
|
2023-08-06 15:46:12 +03:00
|
|
|
|
bool
fs_visitor::lower_constant_loads()
{
   unsigned index, pull_index;
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      /* Set up the annotation tracking for new generated instructions. */
      const fs_builder ibld(this, block, inst);

      /* First, directly-addressed UNIFORM sources: replace each with a
       * cacheline-sized UNIFORM_PULL_CONSTANT_LOAD into a fresh VGRF.
       */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         /* We'll handle this case later */
         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
            continue;

         if (!get_pull_locs(inst->src[i], &index, &pull_index))
            continue;

         /* UNIFORM sources are uniform per channel, hence stride 0. */
         assert(inst->src[i].stride == 0);

         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         const unsigned base = pull_index * 4;

         fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS];
         srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index);
         /* Align the load down to the start of the cacheline. */
         srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1));
         srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz);

         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst,
                   srcs, PULL_UNIFORM_CONSTANT_SRCS);

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = VGRF;
         inst->src[i].nr = dst.nr;
         /* Offset within the fetched cacheline, preserving any sub-dword
          * offset of the original uniform access.
          */
         inst->src[i].offset = (base & (block_sz - 1)) +
                               inst->src[i].offset % 4;

         progress = true;
      }

      /* Indirectly-addressed uniforms (MOV_INDIRECT with a UNIFORM base):
       * replace the whole instruction with a VARYING_PULL_CONSTANT_LOAD
       * driven by the per-channel index in src[1].
       */
      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
          inst->src[0].file == UNIFORM) {
         if (!get_pull_locs(inst->src[0], &index, &pull_index))
            continue;

         VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
                                    brw_imm_ud(index),
                                    fs_reg() /* surface_handle */,
                                    inst->src[1],
                                    pull_index * 4, 4);
         inst->remove(block);

         progress = true;
      }
   }
   invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}
|
|
|
|
|
|
|
2023-06-22 19:03:25 -07:00
|
|
|
|
static uint64_t
|
|
|
|
|
|
src_as_uint(const fs_reg &src)
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(src.file == IMM);
|
|
|
|
|
|
|
|
|
|
|
|
switch (src.type) {
|
|
|
|
|
|
case BRW_REGISTER_TYPE_W:
|
|
|
|
|
|
return (uint64_t)(int16_t)(src.ud & 0xffff);
|
|
|
|
|
|
|
|
|
|
|
|
case BRW_REGISTER_TYPE_UW:
|
|
|
|
|
|
return (uint64_t)(uint16_t)(src.ud & 0xffff);
|
|
|
|
|
|
|
|
|
|
|
|
case BRW_REGISTER_TYPE_D:
|
|
|
|
|
|
return (uint64_t)src.d;
|
|
|
|
|
|
|
|
|
|
|
|
case BRW_REGISTER_TYPE_UD:
|
|
|
|
|
|
return (uint64_t)src.ud;
|
|
|
|
|
|
|
|
|
|
|
|
case BRW_REGISTER_TYPE_Q:
|
|
|
|
|
|
return src.d64;
|
|
|
|
|
|
|
|
|
|
|
|
case BRW_REGISTER_TYPE_UQ:
|
|
|
|
|
|
return src.u64;
|
|
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
|
unreachable("Invalid integer type.");
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Build an immediate register of the given integer type holding \p value.
 *
 * \p value carries the raw 64-bit bit pattern (as produced by
 * src_as_uint()); it is narrowed/reinterpreted to the width and signedness
 * of \p type by the corresponding brw_imm_* constructor.
 */
static fs_reg
brw_imm_for_type(uint64_t value, enum brw_reg_type type)
{
   switch (type) {
   case BRW_REGISTER_TYPE_W:
      return brw_imm_w(value);

   case BRW_REGISTER_TYPE_UW:
      return brw_imm_uw(value);

   case BRW_REGISTER_TYPE_D:
      return brw_imm_d(value);

   case BRW_REGISTER_TYPE_UD:
      return brw_imm_ud(value);

   case BRW_REGISTER_TYPE_Q:
      /* Use the 64-bit signed immediate constructor; brw_imm_d() would
       * truncate the folded constant to 32 bits.
       */
      return brw_imm_q(value);

   case BRW_REGISTER_TYPE_UQ:
      return brw_imm_uq(value);

   default:
      unreachable("Invalid integer type.");
   }
}
|
|
|
|
|
|
|
2011-07-22 16:45:15 -07:00
|
|
|
|
/**
 * Per-instruction algebraic simplification and constant folding
 * (e.g. a*1 -> a, SEL with equal sources -> MOV, folding AND/OR/SHL of
 * immediates), plus lowering of 64-bit MOV/SEL on platforms without
 * native 64-bit float/int support.  Returns true if anything changed.
 */
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         /* Lower a 64-bit float MOV to two 32-bit MOVs of the halves when
          * the hardware has no 64-bit float support.
          */
         if (!devinfo->has_64bit_float &&
             inst->dst.type == BRW_REGISTER_TYPE_DF) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs);
            assert(!inst->src[0].negate);
            const brw::fs_builder ibld(this, block, inst);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

            ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 1),
                     subscript(inst->src[0], BRW_REGISTER_TYPE_F, 1));
            ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_F, 0),
                     subscript(inst->src[0], BRW_REGISTER_TYPE_F, 0));

            inst->remove(block);
            progress = true;
         }

         /* Likewise for 64-bit integer MOVs without 64-bit int support. */
         if (!devinfo->has_64bit_int &&
             (inst->dst.type == BRW_REGISTER_TYPE_UQ ||
              inst->dst.type == BRW_REGISTER_TYPE_Q)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs);
            assert(!inst->src[0].negate);
            const brw::fs_builder ibld(this, block, inst);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

            ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
                     subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1));
            ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
                     subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0));

            inst->remove(block);
            progress = true;
         }

         /* A MOV to the null register that only sets Z/NZ flags ignores
          * abs/negate on the source, so drop the modifiers.
          */
         if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
              inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
             inst->dst.is_null() &&
             (inst->src[0].abs || inst->src[0].negate)) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            progress = true;
            break;
         }

         if (inst->src[0].file != IMM)
            break;

         if (inst->saturate) {
            /* Full mixed-type saturates don't happen.  However, we can end up
             * with things like:
             *
             *    mov.sat(8) g21<1>DF -1F
             *
             * Other mixed-size-but-same-base-type cases may also be possible.
             */
            if (inst->dst.type != inst->src[0].type &&
                inst->dst.type != BRW_REGISTER_TYPE_DF &&
                inst->src[0].type != BRW_REGISTER_TYPE_F)
               assert(!"unimplemented: saturate mixed types");

            /* Pre-saturate the immediate so the .sat modifier can go away. */
            if (brw_saturate_immediate(inst->src[0].type,
                                       &inst->src[0].as_brw_reg())) {
               inst->saturate = false;
               progress = true;
            }
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* Only integer multiplies are folded here. */
         if (brw_reg_type_is_floating_point(inst->src[1].type))
            break;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * -1.0 = -a */
         if (inst->src[1].is_negative_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            inst->src[0].negate = !inst->src[0].negate;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0 = a (integer only; floats must preserve -0/NaN rules). */
         if (brw_reg_type_is_integer(inst->src[1].type) &&
             inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* Fold imm + imm into a single float immediate. */
         if (inst->src[0].file == IMM) {
            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            inst->src[0].f += inst->src[1].f;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;

      case BRW_OPCODE_AND:
         /* Fold imm & imm. */
         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
            const uint64_t src0 = src_as_uint(inst->src[0]);
            const uint64_t src1 = src_as_uint(inst->src[1]);

            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            inst->src[0] = brw_imm_for_type(src0 & src1, inst->dst.type);
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;

      case BRW_OPCODE_OR:
         /* Fold imm | imm. */
         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
            const uint64_t src0 = src_as_uint(inst->src[0]);
            const uint64_t src1 = src_as_uint(inst->src[1]);

            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            inst->src[0] = brw_imm_for_type(src0 | src1, inst->dst.type);
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a | a = a, a | 0 = a */
         if (inst->src[0].equals(inst->src[1]) ||
             inst->src[1].is_zero()) {
            /* On Gfx8+, the OR instruction can have a source modifier that
             * performs logical not on the operand.  Cases of 'OR r0, ~r1, 0'
             * or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
             */
            if (inst->src[0].negate) {
               inst->opcode = BRW_OPCODE_NOT;
               inst->sources = 1;
               inst->src[0].negate = false;
            } else {
               inst->opcode = BRW_OPCODE_MOV;
               inst->sources = 1;
            }
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_CMP:
         /* Comparing against zero for Z/NZ ignores abs/negate on src0. */
         if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
              inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
             inst->src[1].is_zero() &&
             (inst->src[0].abs || inst->src[0].negate)) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_SEL:
         /* Lower 64-bit SEL to two 32-bit SELs of the halves when the
          * hardware supports neither 64-bit floats nor 64-bit ints.
          */
         if (!devinfo->has_64bit_float &&
             !devinfo->has_64bit_int &&
             (inst->dst.type == BRW_REGISTER_TYPE_DF ||
              inst->dst.type == BRW_REGISTER_TYPE_UQ ||
              inst->dst.type == BRW_REGISTER_TYPE_Q)) {
            assert(inst->dst.type == inst->src[0].type);
            assert(!inst->saturate);
            assert(!inst->src[0].abs && !inst->src[0].negate);
            assert(!inst->src[1].abs && !inst->src[1].negate);
            const brw::fs_builder ibld(this, block, inst);

            if (!inst->is_partial_write())
               ibld.emit_undef_for_dst(inst);

            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
                                   subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
                                   subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)));
            set_predicate(inst->predicate,
                          ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
                                   subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
                                   subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1)));

            inst->remove(block);
            progress = true;
         }
         /* sel(a, a) = a: drop the predicate entirely. */
         if (inst->src[0].equals(inst->src[1])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            inst->src[1] = reg_undef;
            inst->predicate = BRW_PREDICATE_NONE;
            inst->predicate_inverse = false;
            progress = true;
         } else if (inst->saturate && inst->src[1].file == IMM) {
            /* When SEL clamps against a float immediate that the saturate
             * would clamp anyway, the conditional is redundant.
             */
            switch (inst->conditional_mod) {
            case BRW_CONDITIONAL_LE:
            case BRW_CONDITIONAL_L:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  /* min(a, x>=1) followed by sat clamps to [0,1] anyway. */
                  if (inst->src[1].f >= 1.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->sources = 1;
                     inst->src[1] = reg_undef;
                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
               break;
            case BRW_CONDITIONAL_GE:
            case BRW_CONDITIONAL_G:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  /* max(a, x<=0) followed by sat clamps to [0,1] anyway. */
                  if (inst->src[1].f <= 0.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->sources = 1;
                     inst->src[1] = reg_undef;
                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
               /* Falls through to default (no-op). */
            default:
               break;
            }
         }
         break;
      case BRW_OPCODE_MAD:
         if (inst->src[0].type != BRW_REGISTER_TYPE_F ||
             inst->src[1].type != BRW_REGISTER_TYPE_F ||
             inst->src[2].type != BRW_REGISTER_TYPE_F)
            break;
         /* a*1 + c = a + c */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_ADD;
            inst->sources = 2;
            inst->src[1] = inst->src[2];
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[2].is_one()) {
            /* a*b + 1*... = a + b */
            inst->opcode = BRW_OPCODE_ADD;
            inst->sources = 2;
            inst->src[2] = reg_undef;
            progress = true;
         }
         break;
      case BRW_OPCODE_SHL:
         /* Fold imm << imm, masking the shift count as the hardware does. */
         if (inst->src[0].file == IMM && inst->src[1].file == IMM) {
            /* It's not currently possible to generate this, and this constant
             * folding does not handle it.
             */
            assert(!inst->saturate);

            fs_reg result;

            switch (type_sz(inst->src[0].type)) {
            case 2:
               result = brw_imm_uw(0x0ffff & (inst->src[0].ud << (inst->src[1].ud & 0x1f)));
               break;
            case 4:
               result = brw_imm_ud(inst->src[0].ud << (inst->src[1].ud & 0x1f));
               break;
            case 8:
               result = brw_imm_uq(inst->src[0].u64 << (inst->src[1].ud & 0x3f));
               break;
            default:
               /* Just in case a future platform re-enables B or UB types. */
               unreachable("Invalid source size.");
            }

            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = retype(result, inst->dst.type);
            inst->src[1] = reg_undef;
            inst->sources = 1;

            progress = true;
         }
         break;

      case SHADER_OPCODE_BROADCAST:
         /* Broadcasting an already-uniform value is just a MOV. */
         if (is_uniform(inst->src[0])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         } else if (inst->src[1].file == IMM) {
            inst->opcode = BRW_OPCODE_MOV;
            /* It's possible that the selected component will be too large and
             * overflow the register.  This can happen if someone does a
             * readInvocation() from GLSL or SPIR-V and provides an OOB
             * invocationIndex.  If this happens and we some how manage
             * to constant fold it in and get here, then component() may cause
             * us to start reading outside of the VGRF which will lead to an
             * assert later.  Instead, just let it wrap around if it goes over
             * exec_size.
             */
            const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
            inst->src[0] = component(inst->src[0], comp);
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      case SHADER_OPCODE_SHUFFLE:
         /* Shuffling a uniform value is just a MOV. */
         if (is_uniform(inst->src[0])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            progress = true;
         } else if (inst->src[1].file == IMM) {
            /* A constant index selects a single component. */
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = component(inst->src[0],
                                     inst->src[1].ud);
            inst->sources = 1;
            progress = true;
         }
         break;

      default:
         break;
      }

      /* Ensure that the correct source has the immediate value. 2-source
       * instructions must have the immediate in src[1]. On Gfx12 and later,
       * some 3-source instructions can have the immediate in src[0] or
       * src[2]. It's complicated, so don't mess with 3-source instructions
       * here.
       */
      if (progress && inst->sources == 2 && inst->is_commutative()) {
         if (inst->src[0].file == IMM) {
            fs_reg tmp = inst->src[1];
            inst->src[1] = inst->src[0];
            inst->src[0] = tmp;
         }
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                          DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}
|
|
|
|
|
|
|
2023-10-25 18:43:40 -07:00
|
|
|
|
static unsigned
|
|
|
|
|
|
load_payload_sources_read_for_size(fs_inst *lp, unsigned size_read)
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(lp->opcode == SHADER_OPCODE_LOAD_PAYLOAD);
|
|
|
|
|
|
assert(size_read >= lp->header_size * REG_SIZE);
|
|
|
|
|
|
|
|
|
|
|
|
unsigned i;
|
|
|
|
|
|
unsigned size = lp->header_size * REG_SIZE;
|
|
|
|
|
|
for (i = lp->header_size; size < size_read && i < lp->sources; i++)
|
|
|
|
|
|
size += lp->exec_size * type_sz(lp->src[i].type);
|
|
|
|
|
|
|
|
|
|
|
|
/* Size read must cover exactly a subset of sources. */
|
|
|
|
|
|
assert(size == size_read);
|
|
|
|
|
|
return i;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-04-23 16:56:53 -07:00
|
|
|
|
/**
 * Optimize sample messages that have constant zero values for the trailing
 * parameters. We can just reduce the message length for these
 * instructions instead of reserving a register for it. Trailing parameters
 * that aren't sent default to zero anyway. This will cause the dead code
 * eliminator to remove the MOV instruction that would otherwise be emitted to
 * set up the zero value.
 */
bool
fs_visitor::opt_zero_samples()
{
   /* Implementation supports only SENDs, so applicable to Gfx7+ only. */
   assert(devinfo->ver >= 7);

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, send, cfg) {
      /* Only sampler SEND messages are candidates for trimming. */
      if (send->opcode != SHADER_OPCODE_SEND ||
          send->sfid != BRW_SFID_SAMPLER)
         continue;

      /* Wa_14012688258:
       *
       * Don't trim zeros at the end of payload for sample operations
       * in cube and cube arrays.
       */
      if (send->keep_payload_trailing_zeros)
         continue;

      /* This pass works on SENDs before splitting. */
      if (send->ex_mlen > 0)
         continue;

      /* The payload must be built by the instruction immediately preceding
       * the SEND; otherwise we cannot inspect its individual sources.
       */
      fs_inst *lp = (fs_inst *) send->prev;

      if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* How much of the payload are actually read by this SEND. */
      const unsigned params =
         load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       * "Parameter 0 is required except for the sampleinfo message, which
       * has no parameter 0"
       */
      const unsigned first_param_idx = lp->header_size;

      /* Walk backwards from the last parameter the SEND reads, summing the
       * byte size of trailing sources that are either unset (BAD_FILE) or
       * known-zero; stop at the first source carrying real data.  The loop
       * never visits index first_param_idx, preserving parameter 0 per the
       * PRM quote above.
       *
       * NOTE(review): the lp->dst.stride factor scales each source's
       * footprint by the payload destination's stride — assumes the
       * LOAD_PAYLOAD dst stride applies uniformly to every source; confirm
       * against LOAD_PAYLOAD lowering.
       */
      unsigned zero_size = 0;
      for (unsigned i = params - 1; i > first_param_idx; i--) {
         if (lp->src[i].file != BAD_FILE && !lp->src[i].is_zero())
            break;
         zero_size += lp->exec_size * type_sz(lp->src[i].type) * lp->dst.stride;
      }

      /* Integer division truncates, so only whole physical registers' worth
       * of trailing zeros can be dropped from the message length.
       */
      const unsigned zero_len = zero_size / (reg_unit(devinfo) * REG_SIZE);
      if (zero_len > 0) {
         send->mlen -= zero_len;
         progress = true;
      }
   }

   /* Only instruction details changed (mlen); data flow is untouched. */
   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}
|
|
|
|
|
|
|
intel/fs: Opportunistically split SEND message payloads
While we've taken advantage of split-sends in select situations, there
are many other cases (such as sampler messages, framebuffer writes, and
URB writes) that have never received that treatment, and continued to
use monolithic send payloads.
This commit introduces a new optimization pass which detects SEND
messages with a single payload, finds an adjacent LOAD_PAYLOAD that
produces that payload, splits it in two, and updates the SEND to use both
of the new smaller payloads.
In places where we manually used split SENDS, we rely on underlying
knowledge of the message to determine a natural split point. For
example, header and data, or address and value.
In this pass, we instead infer a natural split point by looking at the
source registers. Often times, consecutive LOAD_PAYLOAD sources may
already be grouped together in a contiguous block, such as a texture
coordinate. Then, there is another bit of data, such as a LOD, that
may come from elsewhere. We look for the point where the source list
switches VGRFs, and split it there. (If there is a message header, we
choose to split there, as it will naturally come from elsewhere.)
This not only reduces the payload sizes, alleviating register pressure,
but it means that we may be able to eliminate some payload construction
altogether, if we have a contiguous block already and some extra data
being tacked on to one side or the other.
shader-db results for Icelake are:
total instructions in shared programs: 19602513 -> 19369255 (-1.19%)
instructions in affected programs: 6085404 -> 5852146 (-3.83%)
helped: 23650 / HURT: 15
helped stats (abs) min: 1 max: 1344 x̄: 9.87 x̃: 3
helped stats (rel) min: 0.03% max: 35.71% x̄: 3.78% x̃: 2.15%
HURT stats (abs) min: 1 max: 44 x̄: 7.20 x̃: 2
HURT stats (rel) min: 1.04% max: 20.00% x̄: 4.13% x̃: 2.00%
95% mean confidence interval for instructions value: -10.16 -9.55
95% mean confidence interval for instructions %-change: -3.84% -3.72%
Instructions are helped.
total cycles in shared programs: 848180368 -> 842208063 (-0.70%)
cycles in affected programs: 599931746 -> 593959441 (-1.00%)
helped: 22114 / HURT: 13053
helped stats (abs) min: 1 max: 482486 x̄: 580.94 x̃: 22
helped stats (rel) min: <.01% max: 78.92% x̄: 4.76% x̃: 0.75%
HURT stats (abs) min: 1 max: 94022 x̄: 526.67 x̃: 22
HURT stats (rel) min: <.01% max: 188.99% x̄: 4.52% x̃: 0.61%
95% mean confidence interval for cycles value: -222.87 -116.79
95% mean confidence interval for cycles %-change: -1.44% -1.20%
Cycles are helped.
total spills in shared programs: 8387 -> 6569 (-21.68%)
spills in affected programs: 5110 -> 3292 (-35.58%)
helped: 359 / HURT: 3
total fills in shared programs: 11833 -> 8218 (-30.55%)
fills in affected programs: 8635 -> 5020 (-41.86%)
helped: 358 / HURT: 3
LOST: 1 SIMD16 shader, 659 SIMD32 shaders
GAINED: 65 SIMD16 shaders, 959 SIMD32 shaders
Total CPU time (seconds): 1505.48 -> 1474.08 (-2.09%)
Examining these results: the few shaders where spills/fills increased
were already spilling significantly, and were only slightly hurt. The
applications affected were also helped in countless other shaders, and
other shaders stopped spilling altogether or had 50% reductions. Many
SIMD16 shaders were gained, and overall we gain more SIMD32, though many
close to the register pressure line go back and forth.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17018>
2022-06-13 02:21:49 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Opportunistically split SEND message payloads.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Gfx9+ supports "split" SEND messages, which take two payloads that are
|
|
|
|
|
|
* implicitly concatenated. If we find a SEND message with a single payload,
|
|
|
|
|
|
* we can split that payload in two. This results in smaller contiguous
|
|
|
|
|
|
* register blocks for us to allocate. But it can help beyond that, too.
|
|
|
|
|
|
*
|
|
|
|
|
|
* We try and split a LOAD_PAYLOAD between sources which change registers.
|
|
|
|
|
|
* For example, a sampler message often contains a x/y/z coordinate that may
|
|
|
|
|
|
 * already be in a contiguous VGRF, combined with an LOD, shadow comparator,
|
|
|
|
|
|
* or array index, which comes from elsewhere. In this case, the first few
|
|
|
|
|
|
* sources will be different offsets of the same VGRF, then a later source
|
|
|
|
|
|
* will be a different VGRF. So we split there, possibly eliminating the
|
|
|
|
|
|
* payload concatenation altogether.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::opt_split_sends()
|
|
|
|
|
|
{
|
|
|
|
|
|
if (devinfo->ver < 9)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
2023-10-25 18:43:40 -07:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, send, cfg) {
|
intel/fs: Opportunistically split SEND message payloads
While we've taken advantage of split-sends in select situations, there
are many other cases (such as sampler messages, framebuffer writes, and
URB writes) that have never received that treatment, and continued to
use monolithic send payloads.
This commit introduces a new optimization pass which detects SEND
messages with a single payload, finds an adjacent LOAD_PAYLOAD that
produces that payload, splits it two, and updates the SEND to use both
of the new smaller payloads.
In places where we manually used split SENDS, we rely on underlying
knowledge of the message to determine a natural split point. For
example, header and data, or address and value.
In this pass, we instead infer a natural split point by looking at the
source registers. Often times, consecutive LOAD_PAYLOAD sources may
already be grouped together in a contiguous block, such as a texture
coordinate. Then, there is another bit of data, such as a LOD, that
may come from elsewhere. We look for the point where the source list
switches VGRFs, and split it there. (If there is a message header, we
choose to split there, as it will naturally come from elsewhere.)
This not only reduces the payload sizes, alleviating register pressure,
but it means that we may be able to eliminate some payload construction
altogether, if we have a contiguous block already and some extra data
being tacked on to one side or the other.
shader-db results for Icelake are:
total instructions in shared programs: 19602513 -> 19369255 (-1.19%)
instructions in affected programs: 6085404 -> 5852146 (-3.83%)
helped: 23650 / HURT: 15
helped stats (abs) min: 1 max: 1344 x̄: 9.87 x̃: 3
helped stats (rel) min: 0.03% max: 35.71% x̄: 3.78% x̃: 2.15%
HURT stats (abs) min: 1 max: 44 x̄: 7.20 x̃: 2
HURT stats (rel) min: 1.04% max: 20.00% x̄: 4.13% x̃: 2.00%
95% mean confidence interval for instructions value: -10.16 -9.55
95% mean confidence interval for instructions %-change: -3.84% -3.72%
Instructions are helped.
total cycles in shared programs: 848180368 -> 842208063 (-0.70%)
cycles in affected programs: 599931746 -> 593959441 (-1.00%)
helped: 22114 / HURT: 13053
helped stats (abs) min: 1 max: 482486 x̄: 580.94 x̃: 22
helped stats (rel) min: <.01% max: 78.92% x̄: 4.76% x̃: 0.75%
HURT stats (abs) min: 1 max: 94022 x̄: 526.67 x̃: 22
HURT stats (rel) min: <.01% max: 188.99% x̄: 4.52% x̃: 0.61%
95% mean confidence interval for cycles value: -222.87 -116.79
95% mean confidence interval for cycles %-change: -1.44% -1.20%
Cycles are helped.
total spills in shared programs: 8387 -> 6569 (-21.68%)
spills in affected programs: 5110 -> 3292 (-35.58%)
helped: 359 / HURT: 3
total fills in shared programs: 11833 -> 8218 (-30.55%)
fills in affected programs: 8635 -> 5020 (-41.86%)
helped: 358 / HURT: 3
LOST: 1 SIMD16 shader, 659 SIMD32 shaders
GAINED: 65 SIMD16 shaders, 959 SIMD32 shaders
Total CPU time (seconds): 1505.48 -> 1474.08 (-2.09%)
Examining these results: the few shaders where spills/fills increased
were already spilling significantly, and were only slightly hurt. The
applications affected were also helped in countless other shaders, and
other shaders stopped spilling altogether or had 50% reductions. Many
SIMD16 shaders were gained, and overall we gain more SIMD32, though many
close to the register pressure line go back and forth.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17018>
2022-06-13 02:21:49 -07:00
|
|
|
|
if (send->opcode != SHADER_OPCODE_SEND ||
|
2023-09-19 11:09:09 -07:00
|
|
|
|
send->mlen <= reg_unit(devinfo) || send->ex_mlen > 0)
|
intel/fs: Opportunistically split SEND message payloads
While we've taken advantage of split-sends in select situations, there
are many other cases (such as sampler messages, framebuffer writes, and
URB writes) that have never received that treatment, and continued to
use monolithic send payloads.
This commit introduces a new optimization pass which detects SEND
messages with a single payload, finds an adjacent LOAD_PAYLOAD that
produces that payload, splits it two, and updates the SEND to use both
of the new smaller payloads.
In places where we manually used split SENDS, we rely on underlying
knowledge of the message to determine a natural split point. For
example, header and data, or address and value.
In this pass, we instead infer a natural split point by looking at the
source registers. Often times, consecutive LOAD_PAYLOAD sources may
already be grouped together in a contiguous block, such as a texture
coordinate. Then, there is another bit of data, such as a LOD, that
may come from elsewhere. We look for the point where the source list
switches VGRFs, and split it there. (If there is a message header, we
choose to split there, as it will naturally come from elsewhere.)
This not only reduces the payload sizes, alleviating register pressure,
but it means that we may be able to eliminate some payload construction
altogether, if we have a contiguous block already and some extra data
being tacked on to one side or the other.
shader-db results for Icelake are:
total instructions in shared programs: 19602513 -> 19369255 (-1.19%)
instructions in affected programs: 6085404 -> 5852146 (-3.83%)
helped: 23650 / HURT: 15
helped stats (abs) min: 1 max: 1344 x̄: 9.87 x̃: 3
helped stats (rel) min: 0.03% max: 35.71% x̄: 3.78% x̃: 2.15%
HURT stats (abs) min: 1 max: 44 x̄: 7.20 x̃: 2
HURT stats (rel) min: 1.04% max: 20.00% x̄: 4.13% x̃: 2.00%
95% mean confidence interval for instructions value: -10.16 -9.55
95% mean confidence interval for instructions %-change: -3.84% -3.72%
Instructions are helped.
total cycles in shared programs: 848180368 -> 842208063 (-0.70%)
cycles in affected programs: 599931746 -> 593959441 (-1.00%)
helped: 22114 / HURT: 13053
helped stats (abs) min: 1 max: 482486 x̄: 580.94 x̃: 22
helped stats (rel) min: <.01% max: 78.92% x̄: 4.76% x̃: 0.75%
HURT stats (abs) min: 1 max: 94022 x̄: 526.67 x̃: 22
HURT stats (rel) min: <.01% max: 188.99% x̄: 4.52% x̃: 0.61%
95% mean confidence interval for cycles value: -222.87 -116.79
95% mean confidence interval for cycles %-change: -1.44% -1.20%
Cycles are helped.
total spills in shared programs: 8387 -> 6569 (-21.68%)
spills in affected programs: 5110 -> 3292 (-35.58%)
helped: 359 / HURT: 3
total fills in shared programs: 11833 -> 8218 (-30.55%)
fills in affected programs: 8635 -> 5020 (-41.86%)
helped: 358 / HURT: 3
LOST: 1 SIMD16 shader, 659 SIMD32 shaders
GAINED: 65 SIMD16 shaders, 959 SIMD32 shaders
Total CPU time (seconds): 1505.48 -> 1474.08 (-2.09%)
Examining these results: the few shaders where spills/fills increased
were already spilling significantly, and were only slightly hurt. The
applications affected were also helped in countless other shaders, and
other shaders stopped spilling altogether or had 50% reductions. Many
SIMD16 shaders were gained, and overall we gain more SIMD32, though many
close to the register pressure line go back and forth.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17018>
2022-06-13 02:21:49 -07:00
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
assert(send->src[2].file == VGRF);
|
|
|
|
|
|
|
2023-10-25 18:43:40 -07:00
|
|
|
|
/* Currently don't split sends that reuse a previously used payload. */
|
intel/fs: Opportunistically split SEND message payloads
While we've taken advantage of split-sends in select situations, there
are many other cases (such as sampler messages, framebuffer writes, and
URB writes) that have never received that treatment, and continued to
use monolithic send payloads.
This commit introduces a new optimization pass which detects SEND
messages with a single payload, finds an adjacent LOAD_PAYLOAD that
produces that payload, splits it two, and updates the SEND to use both
of the new smaller payloads.
In places where we manually used split SENDS, we rely on underlying
knowledge of the message to determine a natural split point. For
example, header and data, or address and value.
In this pass, we instead infer a natural split point by looking at the
source registers. Often times, consecutive LOAD_PAYLOAD sources may
already be grouped together in a contiguous block, such as a texture
coordinate. Then, there is another bit of data, such as a LOD, that
may come from elsewhere. We look for the point where the source list
switches VGRFs, and split it there. (If there is a message header, we
choose to split there, as it will naturally come from elsewhere.)
This not only reduces the payload sizes, alleviating register pressure,
but it means that we may be able to eliminate some payload construction
altogether, if we have a contiguous block already and some extra data
being tacked on to one side or the other.
shader-db results for Icelake are:
total instructions in shared programs: 19602513 -> 19369255 (-1.19%)
instructions in affected programs: 6085404 -> 5852146 (-3.83%)
helped: 23650 / HURT: 15
helped stats (abs) min: 1 max: 1344 x̄: 9.87 x̃: 3
helped stats (rel) min: 0.03% max: 35.71% x̄: 3.78% x̃: 2.15%
HURT stats (abs) min: 1 max: 44 x̄: 7.20 x̃: 2
HURT stats (rel) min: 1.04% max: 20.00% x̄: 4.13% x̃: 2.00%
95% mean confidence interval for instructions value: -10.16 -9.55
95% mean confidence interval for instructions %-change: -3.84% -3.72%
Instructions are helped.
total cycles in shared programs: 848180368 -> 842208063 (-0.70%)
cycles in affected programs: 599931746 -> 593959441 (-1.00%)
helped: 22114 / HURT: 13053
helped stats (abs) min: 1 max: 482486 x̄: 580.94 x̃: 22
helped stats (rel) min: <.01% max: 78.92% x̄: 4.76% x̃: 0.75%
HURT stats (abs) min: 1 max: 94022 x̄: 526.67 x̃: 22
HURT stats (rel) min: <.01% max: 188.99% x̄: 4.52% x̃: 0.61%
95% mean confidence interval for cycles value: -222.87 -116.79
95% mean confidence interval for cycles %-change: -1.44% -1.20%
Cycles are helped.
total spills in shared programs: 8387 -> 6569 (-21.68%)
spills in affected programs: 5110 -> 3292 (-35.58%)
helped: 359 / HURT: 3
total fills in shared programs: 11833 -> 8218 (-30.55%)
fills in affected programs: 8635 -> 5020 (-41.86%)
helped: 358 / HURT: 3
LOST: 1 SIMD16 shader, 659 SIMD32 shaders
GAINED: 65 SIMD16 shaders, 959 SIMD32 shaders
Total CPU time (seconds): 1505.48 -> 1474.08 (-2.09%)
Examining these results: the few shaders where spills/fills increased
were already spilling significantly, and were only slightly hurt. The
applications affected were also helped in countless other shaders, and
other shaders stopped spilling altogether or had 50% reductions. Many
SIMD16 shaders were gained, and overall we gain more SIMD32, though many
close to the register pressure line go back and forth.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17018>
2022-06-13 02:21:49 -07:00
|
|
|
|
fs_inst *lp = (fs_inst *) send->prev;
|
|
|
|
|
|
|
|
|
|
|
|
if (lp->is_head_sentinel() || lp->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
if (lp->dst.file != send->src[2].file || lp->dst.nr != send->src[2].nr)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
/* Split either after the header (if present), or when consecutive
|
|
|
|
|
|
* sources switch from one VGRF to a different one.
|
|
|
|
|
|
*/
|
2023-10-25 18:43:40 -07:00
|
|
|
|
unsigned mid = lp->header_size;
|
|
|
|
|
|
if (mid == 0) {
|
|
|
|
|
|
for (mid = 1; mid < lp->sources; mid++) {
|
|
|
|
|
|
if (lp->src[mid].file == BAD_FILE)
|
intel/fs: Opportunistically split SEND message payloads
While we've taken advantage of split-sends in select situations, there
are many other cases (such as sampler messages, framebuffer writes, and
URB writes) that have never received that treatment, and continued to
use monolithic send payloads.
This commit introduces a new optimization pass which detects SEND
messages with a single payload, finds an adjacent LOAD_PAYLOAD that
produces that payload, splits it two, and updates the SEND to use both
of the new smaller payloads.
In places where we manually used split SENDS, we rely on underlying
knowledge of the message to determine a natural split point. For
example, header and data, or address and value.
In this pass, we instead infer a natural split point by looking at the
source registers. Often times, consecutive LOAD_PAYLOAD sources may
already be grouped together in a contiguous block, such as a texture
coordinate. Then, there is another bit of data, such as a LOD, that
may come from elsewhere. We look for the point where the source list
switches VGRFs, and split it there. (If there is a message header, we
choose to split there, as it will naturally come from elsewhere.)
This not only reduces the payload sizes, alleviating register pressure,
but it means that we may be able to eliminate some payload construction
altogether, if we have a contiguous block already and some extra data
being tacked on to one side or the other.
shader-db results for Icelake are:
total instructions in shared programs: 19602513 -> 19369255 (-1.19%)
instructions in affected programs: 6085404 -> 5852146 (-3.83%)
helped: 23650 / HURT: 15
helped stats (abs) min: 1 max: 1344 x̄: 9.87 x̃: 3
helped stats (rel) min: 0.03% max: 35.71% x̄: 3.78% x̃: 2.15%
HURT stats (abs) min: 1 max: 44 x̄: 7.20 x̃: 2
HURT stats (rel) min: 1.04% max: 20.00% x̄: 4.13% x̃: 2.00%
95% mean confidence interval for instructions value: -10.16 -9.55
95% mean confidence interval for instructions %-change: -3.84% -3.72%
Instructions are helped.
total cycles in shared programs: 848180368 -> 842208063 (-0.70%)
cycles in affected programs: 599931746 -> 593959441 (-1.00%)
helped: 22114 / HURT: 13053
helped stats (abs) min: 1 max: 482486 x̄: 580.94 x̃: 22
helped stats (rel) min: <.01% max: 78.92% x̄: 4.76% x̃: 0.75%
HURT stats (abs) min: 1 max: 94022 x̄: 526.67 x̃: 22
HURT stats (rel) min: <.01% max: 188.99% x̄: 4.52% x̃: 0.61%
95% mean confidence interval for cycles value: -222.87 -116.79
95% mean confidence interval for cycles %-change: -1.44% -1.20%
Cycles are helped.
total spills in shared programs: 8387 -> 6569 (-21.68%)
spills in affected programs: 5110 -> 3292 (-35.58%)
helped: 359 / HURT: 3
total fills in shared programs: 11833 -> 8218 (-30.55%)
fills in affected programs: 8635 -> 5020 (-41.86%)
helped: 358 / HURT: 3
LOST: 1 SIMD16 shader, 659 SIMD32 shaders
GAINED: 65 SIMD16 shaders, 959 SIMD32 shaders
Total CPU time (seconds): 1505.48 -> 1474.08 (-2.09%)
Examining these results: the few shaders where spills/fills increased
were already spilling significantly, and were only slightly hurt. The
applications affected were also helped in countless other shaders, and
other shaders stopped spilling altogether or had 50% reductions. Many
SIMD16 shaders were gained, and overall we gain more SIMD32, though many
close to the register pressure line go back and forth.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17018>
2022-06-13 02:21:49 -07:00
|
|
|
|
continue;
|
|
|
|
|
|
|
2023-10-25 18:43:40 -07:00
|
|
|
|
if (lp->src[0].file != lp->src[mid].file ||
|
|
|
|
|
|
lp->src[0].nr != lp->src[mid].nr)
|
intel/fs: Opportunistically split SEND message payloads
While we've taken advantage of split-sends in select situations, there
are many other cases (such as sampler messages, framebuffer writes, and
URB writes) that have never received that treatment, and continued to
use monolithic send payloads.
This commit introduces a new optimization pass which detects SEND
messages with a single payload, finds an adjacent LOAD_PAYLOAD that
produces that payload, splits it two, and updates the SEND to use both
of the new smaller payloads.
In places where we manually used split SENDS, we rely on underlying
knowledge of the message to determine a natural split point. For
example, header and data, or address and value.
In this pass, we instead infer a natural split point by looking at the
source registers. Often times, consecutive LOAD_PAYLOAD sources may
already be grouped together in a contiguous block, such as a texture
coordinate. Then, there is another bit of data, such as a LOD, that
may come from elsewhere. We look for the point where the source list
switches VGRFs, and split it there. (If there is a message header, we
choose to split there, as it will naturally come from elsewhere.)
This not only reduces the payload sizes, alleviating register pressure,
but it means that we may be able to eliminate some payload construction
altogether, if we have a contiguous block already and some extra data
being tacked on to one side or the other.
shader-db results for Icelake are:
total instructions in shared programs: 19602513 -> 19369255 (-1.19%)
instructions in affected programs: 6085404 -> 5852146 (-3.83%)
helped: 23650 / HURT: 15
helped stats (abs) min: 1 max: 1344 x̄: 9.87 x̃: 3
helped stats (rel) min: 0.03% max: 35.71% x̄: 3.78% x̃: 2.15%
HURT stats (abs) min: 1 max: 44 x̄: 7.20 x̃: 2
HURT stats (rel) min: 1.04% max: 20.00% x̄: 4.13% x̃: 2.00%
95% mean confidence interval for instructions value: -10.16 -9.55
95% mean confidence interval for instructions %-change: -3.84% -3.72%
Instructions are helped.
total cycles in shared programs: 848180368 -> 842208063 (-0.70%)
cycles in affected programs: 599931746 -> 593959441 (-1.00%)
helped: 22114 / HURT: 13053
helped stats (abs) min: 1 max: 482486 x̄: 580.94 x̃: 22
helped stats (rel) min: <.01% max: 78.92% x̄: 4.76% x̃: 0.75%
HURT stats (abs) min: 1 max: 94022 x̄: 526.67 x̃: 22
HURT stats (rel) min: <.01% max: 188.99% x̄: 4.52% x̃: 0.61%
95% mean confidence interval for cycles value: -222.87 -116.79
95% mean confidence interval for cycles %-change: -1.44% -1.20%
Cycles are helped.
total spills in shared programs: 8387 -> 6569 (-21.68%)
spills in affected programs: 5110 -> 3292 (-35.58%)
helped: 359 / HURT: 3
total fills in shared programs: 11833 -> 8218 (-30.55%)
fills in affected programs: 8635 -> 5020 (-41.86%)
helped: 358 / HURT: 3
LOST: 1 SIMD16 shader, 659 SIMD32 shaders
GAINED: 65 SIMD16 shaders, 959 SIMD32 shaders
Total CPU time (seconds): 1505.48 -> 1474.08 (-2.09%)
Examining these results: the few shaders where spills/fills increased
were already spilling significantly, and were only slightly hurt. The
applications affected were also helped in countless other shaders, and
other shaders stopped spilling altogether or had 50% reductions. Many
SIMD16 shaders were gained, and overall we gain more SIMD32, though many
close to the register pressure line go back and forth.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17018>
2022-06-13 02:21:49 -07:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-10-25 18:43:40 -07:00
|
|
|
|
/* SEND mlen might be smaller than what LOAD_PAYLOAD provides, so
|
|
|
|
|
|
* find out how many sources from the payload does it really need.
|
|
|
|
|
|
*/
|
|
|
|
|
|
const unsigned end =
|
|
|
|
|
|
load_payload_sources_read_for_size(lp, send->mlen * REG_SIZE);
|
intel/fs: Opportunistically split SEND message payloads
While we've taken advantage of split-sends in select situations, there
are many other cases (such as sampler messages, framebuffer writes, and
URB writes) that have never received that treatment, and continued to
use monolithic send payloads.
This commit introduces a new optimization pass which detects SEND
messages with a single payload, finds an adjacent LOAD_PAYLOAD that
produces that payload, splits it two, and updates the SEND to use both
of the new smaller payloads.
In places where we manually used split SENDS, we rely on underlying
knowledge of the message to determine a natural split point. For
example, header and data, or address and value.
In this pass, we instead infer a natural split point by looking at the
source registers. Often times, consecutive LOAD_PAYLOAD sources may
already be grouped together in a contiguous block, such as a texture
coordinate. Then, there is another bit of data, such as a LOD, that
may come from elsewhere. We look for the point where the source list
switches VGRFs, and split it there. (If there is a message header, we
choose to split there, as it will naturally come from elsewhere.)
This not only reduces the payload sizes, alleviating register pressure,
but it means that we may be able to eliminate some payload construction
altogether, if we have a contiguous block already and some extra data
being tacked on to one side or the other.
shader-db results for Icelake are:
total instructions in shared programs: 19602513 -> 19369255 (-1.19%)
instructions in affected programs: 6085404 -> 5852146 (-3.83%)
helped: 23650 / HURT: 15
helped stats (abs) min: 1 max: 1344 x̄: 9.87 x̃: 3
helped stats (rel) min: 0.03% max: 35.71% x̄: 3.78% x̃: 2.15%
HURT stats (abs) min: 1 max: 44 x̄: 7.20 x̃: 2
HURT stats (rel) min: 1.04% max: 20.00% x̄: 4.13% x̃: 2.00%
95% mean confidence interval for instructions value: -10.16 -9.55
95% mean confidence interval for instructions %-change: -3.84% -3.72%
Instructions are helped.
total cycles in shared programs: 848180368 -> 842208063 (-0.70%)
cycles in affected programs: 599931746 -> 593959441 (-1.00%)
helped: 22114 / HURT: 13053
helped stats (abs) min: 1 max: 482486 x̄: 580.94 x̃: 22
helped stats (rel) min: <.01% max: 78.92% x̄: 4.76% x̃: 0.75%
HURT stats (abs) min: 1 max: 94022 x̄: 526.67 x̃: 22
HURT stats (rel) min: <.01% max: 188.99% x̄: 4.52% x̃: 0.61%
95% mean confidence interval for cycles value: -222.87 -116.79
95% mean confidence interval for cycles %-change: -1.44% -1.20%
Cycles are helped.
total spills in shared programs: 8387 -> 6569 (-21.68%)
spills in affected programs: 5110 -> 3292 (-35.58%)
helped: 359 / HURT: 3
total fills in shared programs: 11833 -> 8218 (-30.55%)
fills in affected programs: 8635 -> 5020 (-41.86%)
helped: 358 / HURT: 3
LOST: 1 SIMD16 shader, 659 SIMD32 shaders
GAINED: 65 SIMD16 shaders, 959 SIMD32 shaders
Total CPU time (seconds): 1505.48 -> 1474.08 (-2.09%)
Examining these results: the few shaders where spills/fills increased
were already spilling significantly, and were only slightly hurt. The
applications affected were also helped in countless other shaders, and
other shaders stopped spilling altogether or had 50% reductions. Many
SIMD16 shaders were gained, and overall we gain more SIMD32, though many
close to the register pressure line go back and forth.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17018>
2022-06-13 02:21:49 -07:00
|
|
|
|
|
2023-10-25 18:43:40 -07:00
|
|
|
|
/* Nothing to split. */
|
|
|
|
|
|
if (end <= mid)
|
|
|
|
|
|
continue;
|
intel/fs: Opportunistically split SEND message payloads
While we've taken advantage of split-sends in select situations, there
are many other cases (such as sampler messages, framebuffer writes, and
URB writes) that have never received that treatment, and continued to
use monolithic send payloads.
This commit introduces a new optimization pass which detects SEND
messages with a single payload, finds an adjacent LOAD_PAYLOAD that
produces that payload, splits it two, and updates the SEND to use both
of the new smaller payloads.
In places where we manually used split SENDS, we rely on underlying
knowledge of the message to determine a natural split point. For
example, header and data, or address and value.
In this pass, we instead infer a natural split point by looking at the
source registers. Often times, consecutive LOAD_PAYLOAD sources may
already be grouped together in a contiguous block, such as a texture
coordinate. Then, there is another bit of data, such as a LOD, that
may come from elsewhere. We look for the point where the source list
switches VGRFs, and split it there. (If there is a message header, we
choose to split there, as it will naturally come from elsewhere.)
This not only reduces the payload sizes, alleviating register pressure,
but it means that we may be able to eliminate some payload construction
altogether, if we have a contiguous block already and some extra data
being tacked on to one side or the other.
shader-db results for Icelake are:
total instructions in shared programs: 19602513 -> 19369255 (-1.19%)
instructions in affected programs: 6085404 -> 5852146 (-3.83%)
helped: 23650 / HURT: 15
helped stats (abs) min: 1 max: 1344 x̄: 9.87 x̃: 3
helped stats (rel) min: 0.03% max: 35.71% x̄: 3.78% x̃: 2.15%
HURT stats (abs) min: 1 max: 44 x̄: 7.20 x̃: 2
HURT stats (rel) min: 1.04% max: 20.00% x̄: 4.13% x̃: 2.00%
95% mean confidence interval for instructions value: -10.16 -9.55
95% mean confidence interval for instructions %-change: -3.84% -3.72%
Instructions are helped.
total cycles in shared programs: 848180368 -> 842208063 (-0.70%)
cycles in affected programs: 599931746 -> 593959441 (-1.00%)
helped: 22114 / HURT: 13053
helped stats (abs) min: 1 max: 482486 x̄: 580.94 x̃: 22
helped stats (rel) min: <.01% max: 78.92% x̄: 4.76% x̃: 0.75%
HURT stats (abs) min: 1 max: 94022 x̄: 526.67 x̃: 22
HURT stats (rel) min: <.01% max: 188.99% x̄: 4.52% x̃: 0.61%
95% mean confidence interval for cycles value: -222.87 -116.79
95% mean confidence interval for cycles %-change: -1.44% -1.20%
Cycles are helped.
total spills in shared programs: 8387 -> 6569 (-21.68%)
spills in affected programs: 5110 -> 3292 (-35.58%)
helped: 359 / HURT: 3
total fills in shared programs: 11833 -> 8218 (-30.55%)
fills in affected programs: 8635 -> 5020 (-41.86%)
helped: 358 / HURT: 3
LOST: 1 SIMD16 shader, 659 SIMD32 shaders
GAINED: 65 SIMD16 shaders, 959 SIMD32 shaders
Total CPU time (seconds): 1505.48 -> 1474.08 (-2.09%)
Examining these results: the few shaders where spills/fills increased
were already spilling significantly, and were only slightly hurt. The
applications affected were also helped in countless other shaders, and
other shaders stopped spilling altogether or had 50% reductions. Many
SIMD16 shaders were gained, and overall we gain more SIMD32, though many
close to the register pressure line go back and forth.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17018>
2022-06-13 02:21:49 -07:00
|
|
|
|
|
2023-10-25 18:43:40 -07:00
|
|
|
|
const fs_builder ibld(this, block, lp);
|
|
|
|
|
|
fs_inst *lp1 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[0], mid, lp->header_size);
|
|
|
|
|
|
fs_inst *lp2 = ibld.LOAD_PAYLOAD(lp->dst, &lp->src[mid], end - mid, 0);
|
intel/fs: Opportunistically split SEND message payloads
While we've taken advantage of split-sends in select situations, there
are many other cases (such as sampler messages, framebuffer writes, and
URB writes) that have never received that treatment, and continued to
use monolithic send payloads.
This commit introduces a new optimization pass which detects SEND
messages with a single payload, finds an adjacent LOAD_PAYLOAD that
produces that payload, splits it two, and updates the SEND to use both
of the new smaller payloads.
In places where we manually used split SENDS, we rely on underlying
knowledge of the message to determine a natural split point. For
example, header and data, or address and value.
In this pass, we instead infer a natural split point by looking at the
source registers. Often times, consecutive LOAD_PAYLOAD sources may
already be grouped together in a contiguous block, such as a texture
coordinate. Then, there is another bit of data, such as a LOD, that
may come from elsewhere. We look for the point where the source list
switches VGRFs, and split it there. (If there is a message header, we
choose to split there, as it will naturally come from elsewhere.)
This not only reduces the payload sizes, alleviating register pressure,
but it means that we may be able to eliminate some payload construction
altogether, if we have a contiguous block already and some extra data
being tacked on to one side or the other.
shader-db results for Icelake are:
total instructions in shared programs: 19602513 -> 19369255 (-1.19%)
instructions in affected programs: 6085404 -> 5852146 (-3.83%)
helped: 23650 / HURT: 15
helped stats (abs) min: 1 max: 1344 x̄: 9.87 x̃: 3
helped stats (rel) min: 0.03% max: 35.71% x̄: 3.78% x̃: 2.15%
HURT stats (abs) min: 1 max: 44 x̄: 7.20 x̃: 2
HURT stats (rel) min: 1.04% max: 20.00% x̄: 4.13% x̃: 2.00%
95% mean confidence interval for instructions value: -10.16 -9.55
95% mean confidence interval for instructions %-change: -3.84% -3.72%
Instructions are helped.
total cycles in shared programs: 848180368 -> 842208063 (-0.70%)
cycles in affected programs: 599931746 -> 593959441 (-1.00%)
helped: 22114 / HURT: 13053
helped stats (abs) min: 1 max: 482486 x̄: 580.94 x̃: 22
helped stats (rel) min: <.01% max: 78.92% x̄: 4.76% x̃: 0.75%
HURT stats (abs) min: 1 max: 94022 x̄: 526.67 x̃: 22
HURT stats (rel) min: <.01% max: 188.99% x̄: 4.52% x̃: 0.61%
95% mean confidence interval for cycles value: -222.87 -116.79
95% mean confidence interval for cycles %-change: -1.44% -1.20%
Cycles are helped.
total spills in shared programs: 8387 -> 6569 (-21.68%)
spills in affected programs: 5110 -> 3292 (-35.58%)
helped: 359 / HURT: 3
total fills in shared programs: 11833 -> 8218 (-30.55%)
fills in affected programs: 8635 -> 5020 (-41.86%)
helped: 358 / HURT: 3
LOST: 1 SIMD16 shader, 659 SIMD32 shaders
GAINED: 65 SIMD16 shaders, 959 SIMD32 shaders
Total CPU time (seconds): 1505.48 -> 1474.08 (-2.09%)
Examining these results: the few shaders where spills/fills increased
were already spilling significantly, and were only slightly hurt. The
applications affected were also helped in countless other shaders, and
other shaders stopped spilling altogether or had 50% reductions. Many
SIMD16 shaders were gained, and overall we gain more SIMD32, though many
close to the register pressure line go back and forth.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17018>
2022-06-13 02:21:49 -07:00
|
|
|
|
|
2023-10-25 18:43:40 -07:00
|
|
|
|
assert(lp1->size_written % REG_SIZE == 0);
|
|
|
|
|
|
assert(lp2->size_written % REG_SIZE == 0);
|
|
|
|
|
|
assert((lp1->size_written + lp2->size_written) / REG_SIZE == send->mlen);
|
intel/fs: Opportunistically split SEND message payloads
While we've taken advantage of split-sends in select situations, there
are many other cases (such as sampler messages, framebuffer writes, and
URB writes) that have never received that treatment, and continued to
use monolithic send payloads.
This commit introduces a new optimization pass which detects SEND
messages with a single payload, finds an adjacent LOAD_PAYLOAD that
produces that payload, splits it two, and updates the SEND to use both
of the new smaller payloads.
In places where we manually used split SENDS, we rely on underlying
knowledge of the message to determine a natural split point. For
example, header and data, or address and value.
In this pass, we instead infer a natural split point by looking at the
source registers. Often times, consecutive LOAD_PAYLOAD sources may
already be grouped together in a contiguous block, such as a texture
coordinate. Then, there is another bit of data, such as a LOD, that
may come from elsewhere. We look for the point where the source list
switches VGRFs, and split it there. (If there is a message header, we
choose to split there, as it will naturally come from elsewhere.)
This not only reduces the payload sizes, alleviating register pressure,
but it means that we may be able to eliminate some payload construction
altogether, if we have a contiguous block already and some extra data
being tacked on to one side or the other.
shader-db results for Icelake are:
total instructions in shared programs: 19602513 -> 19369255 (-1.19%)
instructions in affected programs: 6085404 -> 5852146 (-3.83%)
helped: 23650 / HURT: 15
helped stats (abs) min: 1 max: 1344 x̄: 9.87 x̃: 3
helped stats (rel) min: 0.03% max: 35.71% x̄: 3.78% x̃: 2.15%
HURT stats (abs) min: 1 max: 44 x̄: 7.20 x̃: 2
HURT stats (rel) min: 1.04% max: 20.00% x̄: 4.13% x̃: 2.00%
95% mean confidence interval for instructions value: -10.16 -9.55
95% mean confidence interval for instructions %-change: -3.84% -3.72%
Instructions are helped.
total cycles in shared programs: 848180368 -> 842208063 (-0.70%)
cycles in affected programs: 599931746 -> 593959441 (-1.00%)
helped: 22114 / HURT: 13053
helped stats (abs) min: 1 max: 482486 x̄: 580.94 x̃: 22
helped stats (rel) min: <.01% max: 78.92% x̄: 4.76% x̃: 0.75%
HURT stats (abs) min: 1 max: 94022 x̄: 526.67 x̃: 22
HURT stats (rel) min: <.01% max: 188.99% x̄: 4.52% x̃: 0.61%
95% mean confidence interval for cycles value: -222.87 -116.79
95% mean confidence interval for cycles %-change: -1.44% -1.20%
Cycles are helped.
total spills in shared programs: 8387 -> 6569 (-21.68%)
spills in affected programs: 5110 -> 3292 (-35.58%)
helped: 359 / HURT: 3
total fills in shared programs: 11833 -> 8218 (-30.55%)
fills in affected programs: 8635 -> 5020 (-41.86%)
helped: 358 / HURT: 3
LOST: 1 SIMD16 shader, 659 SIMD32 shaders
GAINED: 65 SIMD16 shaders, 959 SIMD32 shaders
Total CPU time (seconds): 1505.48 -> 1474.08 (-2.09%)
Examining these results: the few shaders where spills/fills increased
were already spilling significantly, and were only slightly hurt. The
applications affected were also helped in countless other shaders, and
other shaders stopped spilling altogether or had 50% reductions. Many
SIMD16 shaders were gained, and overall we gain more SIMD32, though many
close to the register pressure line go back and forth.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17018>
2022-06-13 02:21:49 -07:00
|
|
|
|
|
2023-10-25 18:43:40 -07:00
|
|
|
|
lp1->dst = fs_reg(VGRF, alloc.allocate(lp1->size_written / REG_SIZE), lp1->dst.type);
|
|
|
|
|
|
lp2->dst = fs_reg(VGRF, alloc.allocate(lp2->size_written / REG_SIZE), lp2->dst.type);
|
|
|
|
|
|
|
|
|
|
|
|
send->resize_sources(4);
|
|
|
|
|
|
send->src[2] = lp1->dst;
|
|
|
|
|
|
send->src[3] = lp2->dst;
|
|
|
|
|
|
send->ex_mlen = lp2->size_written / REG_SIZE;
|
|
|
|
|
|
send->mlen -= send->ex_mlen;
|
|
|
|
|
|
|
|
|
|
|
|
progress = true;
|
intel/fs: Opportunistically split SEND message payloads
While we've taken advantage of split-sends in select situations, there
are many other cases (such as sampler messages, framebuffer writes, and
URB writes) that have never received that treatment, and continued to
use monolithic send payloads.
This commit introduces a new optimization pass which detects SEND
messages with a single payload, finds an adjacent LOAD_PAYLOAD that
produces that payload, splits it two, and updates the SEND to use both
of the new smaller payloads.
In places where we manually used split SENDS, we rely on underlying
knowledge of the message to determine a natural split point. For
example, header and data, or address and value.
In this pass, we instead infer a natural split point by looking at the
source registers. Often times, consecutive LOAD_PAYLOAD sources may
already be grouped together in a contiguous block, such as a texture
coordinate. Then, there is another bit of data, such as a LOD, that
may come from elsewhere. We look for the point where the source list
switches VGRFs, and split it there. (If there is a message header, we
choose to split there, as it will naturally come from elsewhere.)
This not only reduces the payload sizes, alleviating register pressure,
but it means that we may be able to eliminate some payload construction
altogether, if we have a contiguous block already and some extra data
being tacked on to one side or the other.
shader-db results for Icelake are:
total instructions in shared programs: 19602513 -> 19369255 (-1.19%)
instructions in affected programs: 6085404 -> 5852146 (-3.83%)
helped: 23650 / HURT: 15
helped stats (abs) min: 1 max: 1344 x̄: 9.87 x̃: 3
helped stats (rel) min: 0.03% max: 35.71% x̄: 3.78% x̃: 2.15%
HURT stats (abs) min: 1 max: 44 x̄: 7.20 x̃: 2
HURT stats (rel) min: 1.04% max: 20.00% x̄: 4.13% x̃: 2.00%
95% mean confidence interval for instructions value: -10.16 -9.55
95% mean confidence interval for instructions %-change: -3.84% -3.72%
Instructions are helped.
total cycles in shared programs: 848180368 -> 842208063 (-0.70%)
cycles in affected programs: 599931746 -> 593959441 (-1.00%)
helped: 22114 / HURT: 13053
helped stats (abs) min: 1 max: 482486 x̄: 580.94 x̃: 22
helped stats (rel) min: <.01% max: 78.92% x̄: 4.76% x̃: 0.75%
HURT stats (abs) min: 1 max: 94022 x̄: 526.67 x̃: 22
HURT stats (rel) min: <.01% max: 188.99% x̄: 4.52% x̃: 0.61%
95% mean confidence interval for cycles value: -222.87 -116.79
95% mean confidence interval for cycles %-change: -1.44% -1.20%
Cycles are helped.
total spills in shared programs: 8387 -> 6569 (-21.68%)
spills in affected programs: 5110 -> 3292 (-35.58%)
helped: 359 / HURT: 3
total fills in shared programs: 11833 -> 8218 (-30.55%)
fills in affected programs: 8635 -> 5020 (-41.86%)
helped: 358 / HURT: 3
LOST: 1 SIMD16 shader, 659 SIMD32 shaders
GAINED: 65 SIMD16 shaders, 959 SIMD32 shaders
Total CPU time (seconds): 1505.48 -> 1474.08 (-2.09%)
Examining these results: the few shaders where spills/fills increased
were already spilling significantly, and were only slightly hurt. The
applications affected were also helped in countless other shaders, and
other shaders stopped spilling altogether or had 50% reductions. Many
SIMD16 shaders were gained, and overall we gain more SIMD32, though many
close to the register pressure line go back and forth.
Reviewed-by: Francisco Jerez <currojerez@riseup.net>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17018>
2022-06-13 02:21:49 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
2014-04-14 15:01:37 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::opt_register_renaming()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
int depth = 0;
|
|
|
|
|
|
|
2018-12-10 14:49:49 -08:00
|
|
|
|
unsigned remap[alloc.count];
|
|
|
|
|
|
memset(remap, ~0u, sizeof(unsigned) * alloc.count);
|
2014-04-14 15:01:37 -07:00
|
|
|
|
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2014-04-14 15:01:37 -07:00
|
|
|
|
if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
|
|
|
|
|
|
depth++;
|
|
|
|
|
|
} else if (inst->opcode == BRW_OPCODE_ENDIF ||
|
|
|
|
|
|
inst->opcode == BRW_OPCODE_WHILE) {
|
|
|
|
|
|
depth--;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Rewrite instruction sources. */
|
|
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (inst->src[i].file == VGRF &&
|
2018-12-10 14:49:49 -08:00
|
|
|
|
remap[inst->src[i].nr] != ~0u &&
|
2015-10-26 04:35:14 -07:00
|
|
|
|
remap[inst->src[i].nr] != inst->src[i].nr) {
|
|
|
|
|
|
inst->src[i].nr = remap[inst->src[i].nr];
|
2014-04-14 15:01:37 -07:00
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2018-12-10 14:49:49 -08:00
|
|
|
|
const unsigned dst = inst->dst.nr;
|
2014-04-14 15:01:37 -07:00
|
|
|
|
|
|
|
|
|
|
if (depth == 0 &&
|
2015-10-26 17:09:25 -07:00
|
|
|
|
inst->dst.file == VGRF &&
|
2016-09-07 13:38:20 -07:00
|
|
|
|
alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
|
2019-04-24 12:38:28 +02:00
|
|
|
|
!inst->is_partial_write()) {
|
2018-12-10 14:49:49 -08:00
|
|
|
|
if (remap[dst] == ~0u) {
|
2014-04-14 15:01:37 -07:00
|
|
|
|
remap[dst] = dst;
|
|
|
|
|
|
} else {
|
2016-09-07 16:59:35 -07:00
|
|
|
|
remap[dst] = alloc.allocate(regs_written(inst));
|
2015-10-26 04:35:14 -07:00
|
|
|
|
inst->dst.nr = remap[dst];
|
2014-04-14 15:01:37 -07:00
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
2015-10-26 17:09:25 -07:00
|
|
|
|
} else if (inst->dst.file == VGRF &&
|
2018-12-10 14:49:49 -08:00
|
|
|
|
remap[dst] != ~0u &&
|
2014-04-14 15:01:37 -07:00
|
|
|
|
remap[dst] != dst) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
inst->dst.nr = remap[dst];
|
2014-04-14 15:01:37 -07:00
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress) {
|
2016-03-13 19:26:37 -07:00
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
|
|
|
|
|
|
DEPENDENCY_VARIABLES);
|
2014-04-14 15:01:37 -07:00
|
|
|
|
|
2015-04-06 17:44:40 -07:00
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
|
2018-12-10 14:49:49 -08:00
|
|
|
|
if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != ~0u) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
delta_xy[i].nr = remap[delta_xy[i].nr];
|
2014-04-14 15:01:37 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-07-05 22:10:41 -07:00
|
|
|
|
/**
|
2020-11-30 17:24:51 -06:00
|
|
|
|
* Remove redundant or useless halts.
|
2014-07-05 22:10:41 -07:00
|
|
|
|
*
|
2020-11-30 17:24:51 -06:00
|
|
|
|
* For example, we can eliminate halts in the following sequence:
|
2014-07-05 22:10:41 -07:00
|
|
|
|
*
|
2020-11-30 17:24:51 -06:00
|
|
|
|
* halt (redundant with the next halt)
|
|
|
|
|
|
* halt (useless; jumps to the next instruction)
|
|
|
|
|
|
* halt-target
|
2014-07-05 22:10:41 -07:00
|
|
|
|
*/
|
|
|
|
|
|
bool
|
2020-11-30 17:24:51 -06:00
|
|
|
|
fs_visitor::opt_redundant_halt()
|
2014-07-05 22:10:41 -07:00
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
2020-12-01 09:48:30 -06:00
|
|
|
|
unsigned halt_count = 0;
|
2020-11-19 09:32:27 -06:00
|
|
|
|
fs_inst *halt_target = NULL;
|
2020-12-01 09:48:30 -06:00
|
|
|
|
bblock_t *halt_target_block = NULL;
|
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
|
|
|
|
|
if (inst->opcode == BRW_OPCODE_HALT)
|
|
|
|
|
|
halt_count++;
|
|
|
|
|
|
|
2020-11-19 09:32:27 -06:00
|
|
|
|
if (inst->opcode == SHADER_OPCODE_HALT_TARGET) {
|
|
|
|
|
|
halt_target = inst;
|
2020-12-01 09:48:30 -06:00
|
|
|
|
halt_target_block = block;
|
2014-07-05 22:10:41 -07:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2020-12-01 09:48:30 -06:00
|
|
|
|
if (!halt_target) {
|
|
|
|
|
|
assert(halt_count == 0);
|
2014-07-05 22:10:41 -07:00
|
|
|
|
return false;
|
2020-12-01 09:48:30 -06:00
|
|
|
|
}
|
2014-07-05 22:10:41 -07:00
|
|
|
|
|
2020-11-19 09:32:27 -06:00
|
|
|
|
/* Delete any HALTs immediately before the halt target. */
|
|
|
|
|
|
for (fs_inst *prev = (fs_inst *) halt_target->prev;
|
2020-11-30 17:24:51 -06:00
|
|
|
|
!prev->is_head_sentinel() && prev->opcode == BRW_OPCODE_HALT;
|
2020-11-19 09:32:27 -06:00
|
|
|
|
prev = (fs_inst *) halt_target->prev) {
|
2020-12-01 09:48:30 -06:00
|
|
|
|
prev->remove(halt_target_block);
|
|
|
|
|
|
halt_count--;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (halt_count == 0) {
|
|
|
|
|
|
halt_target->remove(halt_target_block);
|
2014-07-05 22:10:41 -07:00
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2016-03-13 19:26:37 -07:00
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
2014-07-05 22:10:41 -07:00
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-27 16:03:34 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Compute a bitmask with GRF granularity with a bit set for each GRF starting
|
2016-09-02 15:21:26 -07:00
|
|
|
|
* from \p r.offset which overlaps the region starting at \p s.offset and
|
|
|
|
|
|
* spanning \p ds bytes.
|
2016-05-27 16:03:34 -07:00
|
|
|
|
*/
|
|
|
|
|
|
static inline unsigned
|
2016-09-02 15:21:26 -07:00
|
|
|
|
mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
|
2016-05-27 16:03:34 -07:00
|
|
|
|
{
|
2016-09-02 15:21:26 -07:00
|
|
|
|
const int rel_offset = reg_offset(s) - reg_offset(r);
|
|
|
|
|
|
const int shift = rel_offset / REG_SIZE;
|
|
|
|
|
|
const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
|
2016-05-27 16:03:34 -07:00
|
|
|
|
assert(reg_space(r) == reg_space(s) &&
|
2016-09-02 15:21:26 -07:00
|
|
|
|
shift >= 0 && shift < int(8 * sizeof(unsigned)));
|
|
|
|
|
|
return ((1 << n) - 1) << shift;
|
2016-05-27 16:03:34 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2010-10-08 14:00:14 -07:00
|
|
|
|
/**
 * Attempt to turn "compute into GRF, then MOV into MRF" sequences into
 * direct computation into the MRF, eliminating the copy.
 *
 * For each MOV of a full, contiguous VGRF region into an MRF whose source
 * dies at the MOV, walk backwards through the block to find every
 * instruction that generated part of the source region.  If all of them
 * can legally write an MRF (and nothing in between interferes), rewrite
 * their destinations to the MRF and delete the MOV.
 *
 * Only relevant on hardware that still has an MRF (Gen < 7).
 */
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   /* No MRFs on Gen >= 7. */
   if (devinfo->ver >= 7)
      return false;

   const fs_live_variables &live = live_analysis.require();

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      /* Only consider a plain, full-width GRF->MRF MOV with no source
       * modifiers, matching types, and a register-aligned contiguous
       * source region.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->dst.file != MRF || inst->src[0].file != VGRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate ||
          !inst->src[0].is_contiguous() ||
          inst->src[0].offset % REG_SIZE != 0)
         continue;

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (live.vgrf_end[inst->src[0].nr] > ip)
         continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go rewrite the
       * things that computed the value of all GRFs of the source region.  The
       * regs_left bitset keeps track of the registers we haven't yet found a
       * generating instruction for.
       */
      unsigned regs_left = (1 << regs_read(inst, 0)) - 1;

      /* First scan: verify that every generating instruction is eligible
       * and that nothing between it and the MOV interferes.
       */
      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->src[0], inst->size_read(0))) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* If this one instruction didn't populate all the
             * channels, bail.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking.
             */
            if (scan_inst->is_partial_write())
               break;

            /* Handling things not fully contained in the source of the copy
             * would need us to understand coalescing out more than one MOV at
             * a time.
             */
            if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
                                     inst->src[0], inst->size_read(0)))
               break;

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (devinfo->ver == 6) {
               /* gfx6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            /* Clear the bits for any registers this instruction overwrites. */
            regs_left &= ~mask_relative_to(
               inst->src[0], scan_inst->dst, scan_inst->size_written);
            if (!regs_left)
               break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (block->start() == scan_inst)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < scan_inst->sources; i++) {
            if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
                                inst->src[0], inst->size_read(0))) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->dst, inst->size_written)) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            break;
         }

         if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
             regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
                             inst->dst, inst->size_written)) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            break;
         }
      }

      /* Bits still set means some part of the source had no eligible
       * generating instruction before a blocking condition was hit.
       */
      if (regs_left)
         continue;

      /* Found all generating instructions of our MRF's source value, so it
       * should be safe to rewrite them to point to the MRF directly.
       */
      regs_left = (1 << regs_read(inst, 0)) - 1;

      /* Second scan: actually retarget the generating instructions. */
      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
                             inst->src[0], inst->size_read(0))) {
            /* Clear the bits for any registers this instruction overwrites. */
            regs_left &= ~mask_relative_to(
               inst->src[0], scan_inst->dst, scan_inst->size_written);

            /* Byte offset of this instruction's destination within the
             * copied source region.
             */
            const unsigned rel_offset = reg_offset(scan_inst->dst) -
                                        reg_offset(inst->src[0]);

            if (inst->dst.nr & BRW_MRF_COMPR4) {
               /* Apply the same address transformation done by the hardware
                * for COMPR4 MRF writes.
                */
               assert(rel_offset < 2 * REG_SIZE);
               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;

               /* Clear the COMPR4 bit if the generating instruction is not
                * compressed.
                */
               if (scan_inst->size_written < 2 * REG_SIZE)
                  scan_inst->dst.nr &= ~BRW_MRF_COMPR4;

            } else {
               /* Calculate the MRF number the result of this instruction is
                * ultimately written to.
                */
               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
            }

            scan_inst->dst.file = MRF;
            scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
            /* Fold the MOV's saturate modifier into the generator. */
            scan_inst->saturate |= inst->saturate;
            if (!regs_left)
               break;
         }
      }

      assert(!regs_left);
      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);

   return progress;
}
|
|
|
|
|
|
|
2015-02-20 20:25:04 +02:00
|
|
|
|
/**
|
|
|
|
|
|
* Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
|
|
|
|
|
|
* flow. We could probably do better here with some form of divergence
|
|
|
|
|
|
* analysis.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::eliminate_find_live_channel()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
unsigned depth = 0;
|
|
|
|
|
|
|
2016-09-15 17:20:23 -07:00
|
|
|
|
if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
|
|
|
|
|
|
/* The optimization below assumes that channel zero is live on thread
|
|
|
|
|
|
* dispatch, which may not be the case if the fixed function dispatches
|
|
|
|
|
|
* threads sparsely.
|
|
|
|
|
|
*/
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-02-20 20:25:04 +02:00
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
|
|
|
|
|
switch (inst->opcode) {
|
|
|
|
|
|
case BRW_OPCODE_IF:
|
|
|
|
|
|
case BRW_OPCODE_DO:
|
|
|
|
|
|
depth++;
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_ENDIF:
|
|
|
|
|
|
case BRW_OPCODE_WHILE:
|
|
|
|
|
|
depth--;
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
2020-11-30 17:24:51 -06:00
|
|
|
|
case BRW_OPCODE_HALT:
|
2015-02-20 20:25:04 +02:00
|
|
|
|
/* This can potentially make control flow non-uniform until the end
|
|
|
|
|
|
* of the program.
|
|
|
|
|
|
*/
|
2022-05-23 12:54:22 -07:00
|
|
|
|
goto out;
|
2015-02-20 20:25:04 +02:00
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
|
|
|
|
|
|
if (depth == 0) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
2015-11-02 11:26:16 -08:00
|
|
|
|
inst->src[0] = brw_imm_ud(0u);
|
2015-02-20 20:25:04 +02:00
|
|
|
|
inst->sources = 1;
|
|
|
|
|
|
inst->force_writemask_all = true;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2022-05-23 12:54:22 -07:00
|
|
|
|
out:
|
2016-03-12 18:50:24 -08:00
|
|
|
|
if (progress)
|
2016-03-13 19:26:37 -07:00
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL);
|
2016-03-12 18:50:24 -08:00
|
|
|
|
|
2015-02-20 20:25:04 +02:00
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-07-07 15:27:17 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
|
|
|
|
|
|
* instructions to FS_OPCODE_REP_FB_WRITE.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
2014-09-26 14:47:03 -07:00
|
|
|
|
fs_visitor::emit_repclear_shader()
|
2014-07-07 15:27:17 -07:00
|
|
|
|
{
|
2014-08-19 13:57:11 -07:00
|
|
|
|
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
|
2022-11-23 01:47:55 -08:00
|
|
|
|
fs_inst *write = NULL;
|
2014-07-07 15:27:17 -07:00
|
|
|
|
|
2022-11-23 00:38:02 -08:00
|
|
|
|
assert(uniforms == 0);
|
2022-11-23 00:55:19 -08:00
|
|
|
|
assume(key->nr_color_regions > 0);
|
2016-04-04 14:38:42 -07:00
|
|
|
|
|
2022-11-23 01:47:55 -08:00
|
|
|
|
fs_reg color_output, header;
|
|
|
|
|
|
if (devinfo->ver >= 7) {
|
|
|
|
|
|
color_output = retype(brw_vec4_grf(127, 0), BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
header = retype(brw_vec8_grf(125, 0), BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
color_output = retype(brw_vec4_reg(MRF, 2, 0), BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
header = retype(brw_vec8_reg(MRF, 0, 0), BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* We pass the clear color as a flat input. Copy it to the output. */
|
|
|
|
|
|
fs_reg color_input =
|
2022-11-23 00:38:02 -08:00
|
|
|
|
brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_UD,
|
|
|
|
|
|
BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
|
|
|
|
|
|
BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
|
|
|
|
|
|
|
2023-11-21 09:47:18 -08:00
|
|
|
|
const fs_builder bld = fs_builder(this, dispatch_width).at_end();
|
2022-11-23 01:47:55 -08:00
|
|
|
|
bld.exec_all().group(4, 0).MOV(color_output, color_input);
|
2018-05-17 08:46:03 -07:00
|
|
|
|
|
2022-11-23 00:55:19 -08:00
|
|
|
|
if (key->nr_color_regions > 1) {
|
2022-11-23 01:47:55 -08:00
|
|
|
|
/* Copy g0..g1 as the message header */
|
2018-05-17 08:46:03 -07:00
|
|
|
|
bld.exec_all().group(16, 0)
|
|
|
|
|
|
.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
|
2022-11-23 00:55:19 -08:00
|
|
|
|
}
|
2018-05-17 08:46:03 -07:00
|
|
|
|
|
2022-11-23 00:55:19 -08:00
|
|
|
|
for (int i = 0; i < key->nr_color_regions; ++i) {
|
|
|
|
|
|
if (i > 0)
|
|
|
|
|
|
bld.exec_all().group(1, 0).MOV(component(header, 2), brw_imm_ud(i));
|
2018-05-17 08:46:03 -07:00
|
|
|
|
|
2022-11-23 01:47:55 -08:00
|
|
|
|
if (devinfo->ver >= 7) {
|
|
|
|
|
|
write = bld.emit(SHADER_OPCODE_SEND);
|
|
|
|
|
|
write->resize_sources(3);
|
|
|
|
|
|
write->sfid = GFX6_SFID_DATAPORT_RENDER_CACHE;
|
|
|
|
|
|
write->src[0] = brw_imm_ud(0);
|
|
|
|
|
|
write->src[1] = brw_imm_ud(0);
|
|
|
|
|
|
write->src[2] = i == 0 ? color_output : header;
|
|
|
|
|
|
write->check_tdr = true;
|
|
|
|
|
|
write->send_has_side_effects = true;
|
|
|
|
|
|
write->desc = brw_fb_write_desc(devinfo, i,
|
|
|
|
|
|
BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED,
|
|
|
|
|
|
i == key->nr_color_regions - 1, false);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
write = bld.emit(FS_OPCODE_REP_FB_WRITE);
|
|
|
|
|
|
write->target = i;
|
|
|
|
|
|
write->base_mrf = i == 0 ? color_output.nr : header.nr;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2022-11-23 00:55:19 -08:00
|
|
|
|
/* We can use a headerless message for the first render target */
|
|
|
|
|
|
write->header_size = i == 0 ? 0 : 2;
|
|
|
|
|
|
write->mlen = 1 + write->header_size;
|
2014-09-26 14:47:03 -07:00
|
|
|
|
}
|
|
|
|
|
|
write->eot = true;
|
2017-01-13 14:01:45 -08:00
|
|
|
|
write->last_rt = true;
|
2014-07-07 15:27:17 -07:00
|
|
|
|
|
2014-09-26 14:47:03 -07:00
|
|
|
|
calculate_cfg();
|
2014-07-07 15:27:17 -07:00
|
|
|
|
|
2022-11-23 00:38:02 -08:00
|
|
|
|
this->first_non_payload_grf = payload().num_regs;
|
2018-11-09 14:13:37 -08:00
|
|
|
|
|
|
|
|
|
|
lower_scoreboard();
|
2014-07-07 15:27:17 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2010-11-19 15:57:05 +08:00
|
|
|
|
/**
|
2012-01-27 11:06:49 -08:00
|
|
|
|
* Walks through basic blocks, looking for repeated MRF writes and
|
2010-11-19 15:57:05 +08:00
|
|
|
|
* removing the later ones.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::remove_duplicate_mrf_writes()
|
|
|
|
|
|
{
|
2021-03-29 14:41:58 -07:00
|
|
|
|
fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->ver)];
|
2010-11-19 15:57:05 +08:00
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
2011-03-23 14:00:01 -07:00
|
|
|
|
/* Need to update the MRF tracking for compressed instructions. */
|
2016-04-25 17:09:39 -07:00
|
|
|
|
if (dispatch_width >= 16)
|
2011-03-23 14:00:01 -07:00
|
|
|
|
return false;
|
|
|
|
|
|
|
2010-11-19 15:57:05 +08:00
|
|
|
|
memset(last_mrf_move, 0, sizeof(last_mrf_move));
|
|
|
|
|
|
|
2014-07-12 21:18:39 -07:00
|
|
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
|
2013-02-05 15:36:18 -08:00
|
|
|
|
if (inst->is_control_flow()) {
|
2010-11-19 15:57:05 +08:00
|
|
|
|
memset(last_mrf_move, 0, sizeof(last_mrf_move));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->opcode == BRW_OPCODE_MOV &&
|
|
|
|
|
|
inst->dst.file == MRF) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
|
2019-01-25 13:30:36 -06:00
|
|
|
|
if (prev_inst && prev_inst->opcode == BRW_OPCODE_MOV &&
|
|
|
|
|
|
inst->dst.equals(prev_inst->dst) &&
|
|
|
|
|
|
inst->src[0].equals(prev_inst->src[0]) &&
|
|
|
|
|
|
inst->saturate == prev_inst->saturate &&
|
|
|
|
|
|
inst->predicate == prev_inst->predicate &&
|
|
|
|
|
|
inst->conditional_mod == prev_inst->conditional_mod &&
|
|
|
|
|
|
inst->exec_size == prev_inst->exec_size) {
|
2014-07-12 21:18:39 -07:00
|
|
|
|
inst->remove(block);
|
2010-11-19 15:57:05 +08:00
|
|
|
|
progress = true;
|
|
|
|
|
|
continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Clear out the last-write records for MRFs that were overwritten. */
|
|
|
|
|
|
if (inst->dst.file == MRF) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
last_mrf_move[inst->dst.nr] = NULL;
|
2010-11-19 15:57:05 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
i965/fs: Convert gen7 to using GRFs for texture messages.
Looking at Lightsmark's shaders, the way we used MRFs (or in gen7's
case, GRFs) was bad in a couple of ways. One was that it prevented
compute-to-MRF for the common case of a texcoord that gets used
exactly once, but where the texcoord setup all gets emitted before the
texture calls (such as when it's a bare fragment shader input, which
gets interpolated before processing main()). Another was that it
introduced a bunch of dependencies that constrained scheduling, and
forced waits for texture operations to be done before they are
required. For example, we can now move the compute-to-MRF
interpolation for the second texture send down after the first send.
The downside is that this generally prevents
remove_duplicate_mrf_writes() from doing anything, whereas previously
it avoided work for the case of sampling from the same texcoord twice.
However, I suspect that most of the win that originally justified that
code was in avoiding the WAR stall on the first send, which this patch
also avoids, rather than the small cost of the extra instruction. We
see instruction count regressions in shaders in unigine, yofrankie,
savage2, hon, and gstreamer.
Improves GLB2.7 performance by 0.633628% +/- 0.491809% (n=121/125, avg of
~66fps, outliers below 61 dropped).
Improves openarena performance by 1.01092% +/- 0.66897% (n=425).
No significant difference on Lightsmark (n=44).
v2: Squash in the fix for register unspilling for send-from-GRF, fixing a
segfault in lightsmark.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Acked-by: Matt Turner <mattst88@gmail.com>
2013-10-09 17:17:59 -07:00
|
|
|
|
if (inst->mlen > 0 && inst->base_mrf != -1) {
|
2011-01-18 13:28:32 -08:00
|
|
|
|
/* Found a SEND instruction, which will include two or fewer
|
2010-11-19 15:57:05 +08:00
|
|
|
|
* implied MRF writes. We could do better here.
|
|
|
|
|
|
*/
|
2019-12-27 16:38:26 -08:00
|
|
|
|
for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
|
2010-11-19 15:57:05 +08:00
|
|
|
|
last_mrf_move[inst->base_mrf + i] = NULL;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Clear out any MRF move records whose sources got overwritten. */
|
2016-05-25 13:17:41 -07:00
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
|
|
|
|
|
|
if (last_mrf_move[i] &&
|
2016-09-07 13:38:20 -07:00
|
|
|
|
regions_overlap(inst->dst, inst->size_written,
|
2016-05-25 13:17:41 -07:00
|
|
|
|
last_mrf_move[i]->src[0],
|
2016-09-07 17:00:07 -07:00
|
|
|
|
last_mrf_move[i]->size_read(0))) {
|
2016-05-25 13:17:41 -07:00
|
|
|
|
last_mrf_move[i] = NULL;
|
|
|
|
|
|
}
|
2010-11-19 15:57:05 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->opcode == BRW_OPCODE_MOV &&
|
|
|
|
|
|
inst->dst.file == MRF &&
|
2016-05-25 13:17:41 -07:00
|
|
|
|
inst->src[0].file != ARF &&
|
2019-04-24 12:38:28 +02:00
|
|
|
|
!inst->is_partial_write()) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
last_mrf_move[inst->dst.nr] = inst;
|
2010-11-19 15:57:05 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2012-06-05 13:13:33 -07:00
|
|
|
|
if (progress)
|
2016-03-13 19:26:37 -07:00
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
2012-06-05 13:13:33 -07:00
|
|
|
|
|
2010-11-19 15:57:05 +08:00
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2017-07-01 08:14:56 +02:00
|
|
|
|
/**
|
|
|
|
|
|
* Rounding modes for conversion instructions are included for each
|
|
|
|
|
|
* conversion, but right now it is a state. So once it is set,
|
|
|
|
|
|
* we don't need to call it again for subsequent calls.
|
|
|
|
|
|
*
|
|
|
|
|
|
* This is useful for vector/matrices conversions, as setting the
|
|
|
|
|
|
* mode once is enough for the full vector/matrix
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::remove_extra_rounding_modes()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
2018-11-19 12:38:10 +01:00
|
|
|
|
unsigned execution_mode = this->nir->info.float_controls_execution_mode;
|
|
|
|
|
|
|
|
|
|
|
|
brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
|
|
|
|
|
|
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
|
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
|
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
|
|
|
|
|
|
execution_mode)
|
|
|
|
|
|
base_mode = BRW_RND_MODE_RTNE;
|
|
|
|
|
|
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
|
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
|
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
|
|
|
|
|
|
execution_mode)
|
|
|
|
|
|
base_mode = BRW_RND_MODE_RTZ;
|
2017-07-01 08:14:56 +02:00
|
|
|
|
|
|
|
|
|
|
foreach_block (block, cfg) {
|
2018-11-19 12:38:10 +01:00
|
|
|
|
brw_rnd_mode prev_mode = base_mode;
|
2017-07-01 08:14:56 +02:00
|
|
|
|
|
|
|
|
|
|
foreach_inst_in_block_safe (fs_inst, inst, block) {
|
|
|
|
|
|
if (inst->opcode == SHADER_OPCODE_RND_MODE) {
|
|
|
|
|
|
assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
|
|
|
|
|
|
const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
|
|
|
|
|
|
if (mode == prev_mode) {
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
prev_mode = mode;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2016-03-13 19:26:37 -07:00
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
2017-07-01 08:14:56 +02:00
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2013-02-05 15:46:22 -08:00
|
|
|
|
static void
|
2015-02-27 18:06:25 -08:00
|
|
|
|
clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
|
2013-02-05 15:46:22 -08:00
|
|
|
|
{
|
|
|
|
|
|
/* Clear the flag for registers that actually got read (as expected). */
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2013-02-05 15:46:22 -08:00
|
|
|
|
int grf;
|
2015-10-26 17:52:57 -07:00
|
|
|
|
if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
|
2015-10-24 15:29:03 -07:00
|
|
|
|
grf = inst->src[i].nr;
|
2013-02-05 15:46:22 -08:00
|
|
|
|
} else {
|
|
|
|
|
|
continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (grf >= first_grf &&
|
|
|
|
|
|
grf < first_grf + grf_len) {
|
|
|
|
|
|
deps[grf - first_grf] = false;
|
2014-08-16 11:34:56 -07:00
|
|
|
|
if (inst->exec_size == 16)
|
2013-02-05 15:46:22 -08:00
|
|
|
|
deps[grf - first_grf + 1] = false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Implements this workaround for the original 965:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
|
|
|
|
|
|
* check for post destination dependencies on this instruction, software
|
|
|
|
|
|
* must ensure that there is no destination hazard for the case of ‘write
|
|
|
|
|
|
* followed by a posted write’ shown in the following example.
|
|
|
|
|
|
*
|
|
|
|
|
|
* 1. mov r3 0
|
|
|
|
|
|
* 2. send r3.xy <rest of send instruction>
|
|
|
|
|
|
* 3. mov r2 r3
|
|
|
|
|
|
*
|
|
|
|
|
|
* Due to no post-destination dependency check on the ‘send’, the above
|
|
|
|
|
|
* code sequence could have two instructions (1 and 2) in flight at the
|
|
|
|
|
|
* same time that both consider ‘r3’ as the target of their final writes.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
2021-03-29 15:40:04 -07:00
|
|
|
|
fs_visitor::insert_gfx4_pre_send_dependency_workarounds(bblock_t *block,
|
2014-08-24 19:07:01 -07:00
|
|
|
|
fs_inst *inst)
|
2013-02-05 15:46:22 -08:00
|
|
|
|
{
|
2016-09-07 16:59:35 -07:00
|
|
|
|
int write_len = regs_written(inst);
|
2015-10-26 04:35:14 -07:00
|
|
|
|
int first_write_grf = inst->dst.nr;
|
2021-03-29 14:41:58 -07:00
|
|
|
|
bool needs_dep[BRW_MAX_MRF(devinfo->ver)];
|
2013-02-05 15:46:22 -08:00
|
|
|
|
assert(write_len < (int)sizeof(needs_dep) - 1);
|
|
|
|
|
|
|
|
|
|
|
|
memset(needs_dep, false, sizeof(needs_dep));
|
|
|
|
|
|
memset(needs_dep, true, write_len);
|
|
|
|
|
|
|
2015-02-27 18:06:25 -08:00
|
|
|
|
clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
|
2013-02-05 15:46:22 -08:00
|
|
|
|
|
|
|
|
|
|
/* Walk backwards looking for writes to registers we're writing which
|
|
|
|
|
|
* aren't read since being written. If we hit the start of the program,
|
|
|
|
|
|
* we assume that there are no outstanding dependencies on entry to the
|
|
|
|
|
|
* program.
|
|
|
|
|
|
*/
|
2015-10-20 11:16:00 +02:00
|
|
|
|
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
|
2013-02-05 15:46:22 -08:00
|
|
|
|
/* If we hit control flow, assume that there *are* outstanding
|
|
|
|
|
|
* dependencies, and force their cleanup before our instruction.
|
|
|
|
|
|
*/
|
2016-05-25 14:21:49 -07:00
|
|
|
|
if (block->start() == scan_inst && block->num != 0) {
|
2013-02-05 15:46:22 -08:00
|
|
|
|
for (int i = 0; i < write_len; i++) {
|
2015-06-03 22:22:10 +03:00
|
|
|
|
if (needs_dep[i])
|
2015-07-27 18:28:39 +03:00
|
|
|
|
DEP_RESOLVE_MOV(fs_builder(this, block, inst),
|
|
|
|
|
|
first_write_grf + i);
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
2013-03-19 17:36:10 -07:00
|
|
|
|
return;
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* We insert our reads as late as possible on the assumption that any
|
|
|
|
|
|
* instruction but a MOV that might have left us an outstanding
|
|
|
|
|
|
* dependency has more latency than a MOV.
|
|
|
|
|
|
*/
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (scan_inst->dst.file == VGRF) {
|
2016-09-07 16:59:35 -07:00
|
|
|
|
for (unsigned i = 0; i < regs_written(scan_inst); i++) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
int reg = scan_inst->dst.nr + i;
|
2013-03-06 17:50:50 -08:00
|
|
|
|
|
|
|
|
|
|
if (reg >= first_write_grf &&
|
|
|
|
|
|
reg < first_write_grf + write_len &&
|
|
|
|
|
|
needs_dep[reg - first_write_grf]) {
|
2015-07-27 18:28:39 +03:00
|
|
|
|
DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
|
2013-03-06 17:50:50 -08:00
|
|
|
|
needs_dep[reg - first_write_grf] = false;
|
2014-08-16 11:34:56 -07:00
|
|
|
|
if (scan_inst->exec_size == 16)
|
2013-03-06 17:50:50 -08:00
|
|
|
|
needs_dep[reg - first_write_grf + 1] = false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Clear the flag for registers that actually got read (as expected). */
|
2015-02-27 18:06:25 -08:00
|
|
|
|
clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
|
2013-02-05 15:46:22 -08:00
|
|
|
|
|
|
|
|
|
|
/* Continue the loop only if we haven't resolved all the dependencies */
|
|
|
|
|
|
int i;
|
|
|
|
|
|
for (i = 0; i < write_len; i++) {
|
|
|
|
|
|
if (needs_dep[i])
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
if (i == write_len)
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Implements this workaround for the original 965:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "[DevBW, DevCL] Errata: A destination register from a send can not be
|
|
|
|
|
|
* used as a destination register until after it has been sourced by an
|
|
|
|
|
|
* instruction with a different destination register.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
2021-03-29 15:40:04 -07:00
|
|
|
|
fs_visitor::insert_gfx4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
|
2013-02-05 15:46:22 -08:00
|
|
|
|
{
|
2016-09-07 16:59:35 -07:00
|
|
|
|
int write_len = regs_written(inst);
|
2018-12-10 14:49:49 -08:00
|
|
|
|
unsigned first_write_grf = inst->dst.nr;
|
2021-03-29 14:41:58 -07:00
|
|
|
|
bool needs_dep[BRW_MAX_MRF(devinfo->ver)];
|
2013-02-05 15:46:22 -08:00
|
|
|
|
assert(write_len < (int)sizeof(needs_dep) - 1);
|
|
|
|
|
|
|
|
|
|
|
|
memset(needs_dep, false, sizeof(needs_dep));
|
|
|
|
|
|
memset(needs_dep, true, write_len);
|
|
|
|
|
|
/* Walk forwards looking for writes to registers we're writing which aren't
|
|
|
|
|
|
* read before being written.
|
|
|
|
|
|
*/
|
2015-10-20 11:16:00 +02:00
|
|
|
|
foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
|
2013-02-05 15:46:22 -08:00
|
|
|
|
/* If we hit control flow, force resolve all remaining dependencies. */
|
2016-05-25 14:21:49 -07:00
|
|
|
|
if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
|
2013-02-05 15:46:22 -08:00
|
|
|
|
for (int i = 0; i < write_len; i++) {
|
|
|
|
|
|
if (needs_dep[i])
|
2015-07-27 18:28:39 +03:00
|
|
|
|
DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
|
|
|
|
|
|
first_write_grf + i);
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
2013-03-19 17:36:10 -07:00
|
|
|
|
return;
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Clear the flag for registers that actually got read (as expected). */
|
2015-02-27 18:06:25 -08:00
|
|
|
|
clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
|
2013-02-05 15:46:22 -08:00
|
|
|
|
|
|
|
|
|
|
/* We insert our reads as late as possible since they're reading the
|
|
|
|
|
|
* result of a SEND, which has massive latency.
|
|
|
|
|
|
*/
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (scan_inst->dst.file == VGRF &&
|
2015-10-26 04:35:14 -07:00
|
|
|
|
scan_inst->dst.nr >= first_write_grf &&
|
|
|
|
|
|
scan_inst->dst.nr < first_write_grf + write_len &&
|
|
|
|
|
|
needs_dep[scan_inst->dst.nr - first_write_grf]) {
|
2015-07-27 18:28:39 +03:00
|
|
|
|
DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
|
2015-10-26 04:35:14 -07:00
|
|
|
|
scan_inst->dst.nr);
|
|
|
|
|
|
needs_dep[scan_inst->dst.nr - first_write_grf] = false;
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Continue the loop only if we haven't resolved all the dependencies */
|
|
|
|
|
|
int i;
|
|
|
|
|
|
for (i = 0; i < write_len; i++) {
|
|
|
|
|
|
if (needs_dep[i])
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
if (i == write_len)
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void
|
2021-03-29 15:40:04 -07:00
|
|
|
|
fs_visitor::insert_gfx4_send_dependency_workarounds()
|
2013-02-05 15:46:22 -08:00
|
|
|
|
{
|
2021-09-22 15:06:58 +03:00
|
|
|
|
if (devinfo->ver != 4 || devinfo->platform == INTEL_PLATFORM_G4X)
|
2013-02-05 15:46:22 -08:00
|
|
|
|
return;
|
|
|
|
|
|
|
2014-06-09 02:59:22 -07:00
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
2014-08-24 19:07:01 -07:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (inst->mlen != 0 && inst->dst.file == VGRF) {
|
2021-03-29 15:40:04 -07:00
|
|
|
|
insert_gfx4_pre_send_dependency_workarounds(block, inst);
|
|
|
|
|
|
insert_gfx4_post_send_dependency_workarounds(block, inst);
|
2014-06-09 02:59:22 -07:00
|
|
|
|
progress = true;
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2014-06-09 02:59:22 -07:00
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2016-03-13 19:26:37 -07:00
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-04-18 11:56:46 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::lower_load_payload()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by a the
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that correspond's
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropreate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
|
|
|
|
|
|
if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
|
|
|
|
|
|
continue;
|
2014-08-18 14:27:55 -07:00
|
|
|
|
|
2015-10-26 17:09:25 -07:00
|
|
|
|
assert(inst->dst.file == MRF || inst->dst.file == VGRF);
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by a the
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that correspond's
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropreate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
|
assert(inst->saturate == false);
|
|
|
|
|
|
fs_reg dst = inst->dst;
|
|
|
|
|
|
|
|
|
|
|
|
/* Get rid of COMPR4. We'll add it back in if we need it */
|
|
|
|
|
|
if (dst.file == MRF)
|
2015-10-26 04:35:14 -07:00
|
|
|
|
dst.nr = dst.nr & ~BRW_MRF_COMPR4;
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by a the
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that correspond's
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropreate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
|
|
2015-07-27 18:34:43 +03:00
|
|
|
|
const fs_builder ibld(this, block, inst);
|
2019-12-29 18:17:10 -08:00
|
|
|
|
const fs_builder ubld = ibld.exec_all();
|
2015-06-18 12:07:27 -07:00
|
|
|
|
|
2019-12-29 18:17:10 -08:00
|
|
|
|
for (uint8_t i = 0; i < inst->header_size;) {
|
|
|
|
|
|
/* Number of header GRFs to initialize at once with a single MOV
|
|
|
|
|
|
* instruction.
|
|
|
|
|
|
*/
|
|
|
|
|
|
const unsigned n =
|
|
|
|
|
|
(i + 1 < inst->header_size && inst->src[i].stride == 1 &&
|
|
|
|
|
|
inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ?
|
|
|
|
|
|
2 : 1;
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->src[i].file != BAD_FILE)
|
|
|
|
|
|
ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD),
|
|
|
|
|
|
retype(inst->src[i], BRW_REGISTER_TYPE_UD));
|
|
|
|
|
|
|
|
|
|
|
|
dst = byte_offset(dst, n * REG_SIZE);
|
|
|
|
|
|
i += n;
|
2014-08-18 14:27:55 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2015-10-26 04:35:14 -07:00
|
|
|
|
if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by the
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that corresponds
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropreate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
|
inst->exec_size > 8) {
|
|
|
|
|
|
/* In this case, the payload portion of the LOAD_PAYLOAD isn't
|
|
|
|
|
|
* a straightforward copy. Instead, the result of the
|
|
|
|
|
|
* LOAD_PAYLOAD is treated as interleaved and the first four
|
|
|
|
|
|
* non-header sources are unpacked as:
|
|
|
|
|
|
*
|
|
|
|
|
|
* m + 0: r0
|
|
|
|
|
|
* m + 1: g0
|
|
|
|
|
|
* m + 2: b0
|
|
|
|
|
|
* m + 3: a0
|
|
|
|
|
|
* m + 4: r1
|
|
|
|
|
|
* m + 5: g1
|
|
|
|
|
|
* m + 6: b1
|
|
|
|
|
|
* m + 7: a1
|
|
|
|
|
|
*
|
|
|
|
|
|
* This is used for gen <= 5 fb writes.
|
|
|
|
|
|
*/
|
|
|
|
|
|
assert(inst->exec_size == 16);
|
|
|
|
|
|
assert(inst->header_size + 4 <= inst->sources);
|
|
|
|
|
|
for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
|
|
|
|
|
|
if (inst->src[i].file != BAD_FILE) {
|
|
|
|
|
|
if (devinfo->has_compr4) {
|
|
|
|
|
|
fs_reg compr4_dst = retype(dst, inst->src[i].type);
|
2015-10-26 04:35:14 -07:00
|
|
|
|
compr4_dst.nr |= BRW_MRF_COMPR4;
|
2015-06-03 20:36:47 +03:00
|
|
|
|
ibld.MOV(compr4_dst, inst->src[i]);
|
2015-02-05 12:20:03 +02:00
|
|
|
|
} else {
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by a the
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that correspond's
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropreate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
|
/* Platform doesn't have COMPR4. We have to fake it */
|
|
|
|
|
|
fs_reg mov_dst = retype(dst, inst->src[i].type);
|
2020-04-03 13:04:43 -07:00
|
|
|
|
ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0));
|
2015-10-26 04:35:14 -07:00
|
|
|
|
mov_dst.nr += 4;
|
2020-04-03 13:04:43 -07:00
|
|
|
|
ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1));
|
2015-02-04 19:49:32 +02:00
|
|
|
|
}
|
2014-08-18 14:27:55 -07:00
|
|
|
|
}
|
2014-04-18 11:56:46 -07:00
|
|
|
|
|
2015-10-26 04:35:14 -07:00
|
|
|
|
dst.nr++;
|
2014-04-18 11:56:46 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by a the
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that correspond's
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropreate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
|
/* The loop above only ever incremented us through the first set
|
|
|
|
|
|
* of 4 registers. However, thanks to the magic of COMPR4, we
|
|
|
|
|
|
* actually wrote to the first 8 registers, so we need to take
|
|
|
|
|
|
* that into account now.
|
|
|
|
|
|
*/
|
2015-10-26 04:35:14 -07:00
|
|
|
|
dst.nr += 4;
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by a the
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that correspond's
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropreate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
|
|
|
|
|
|
|
/* The COMPR4 code took care of the first 4 sources. We'll let
|
|
|
|
|
|
* the regular path handle any remaining sources. Yes, we are
|
|
|
|
|
|
* modifying the instruction but we're about to delete it so
|
|
|
|
|
|
* this really doesn't hurt anything.
|
|
|
|
|
|
*/
|
|
|
|
|
|
inst->header_size += 4;
|
2014-04-18 11:56:46 -07:00
|
|
|
|
}
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by a the
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that correspond's
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropreate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
|
|
|
|
|
|
|
for (uint8_t i = inst->header_size; i < inst->sources; i++) {
|
2021-09-15 19:18:34 -07:00
|
|
|
|
dst.type = inst->src[i].type;
|
2018-11-14 22:38:23 -06:00
|
|
|
|
if (inst->src[i].file != BAD_FILE) {
|
|
|
|
|
|
ibld.MOV(dst, inst->src[i]);
|
|
|
|
|
|
}
|
2015-06-18 12:07:27 -07:00
|
|
|
|
dst = offset(dst, ibld, 1);
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by a the
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that correspond's
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropreate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
progress = true;
|
2014-04-18 11:56:46 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2016-03-13 19:26:37 -07:00
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
2014-04-18 11:56:46 -07:00
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
intel/fs: Optimize integer multiplication of large constants by factoring
Many Intel platforms can only perform 32x16 bit multiplication. The
straightforward way to implement 32x32 bit multiplications is by
splitting one of the operands into high and low parts called H and L,
respectively. The full multiplication can be implemented as:
((A * H) << 16) + (A * L)
On Intel platforms, special register accesses can be used to eliminate
the shift operation. This results in three instructions and a temporary
register for most values.
If H or L is 1, then one (or both) of the multiplications will later be
eliminated. On some platforms it may be possible to eliminate the
multiplication when H is 256.
If L is zero (note that H cannot be zero), one of the multiplications
will also be eliminated.
Instead of splitting the operand into high and low parts, it may be
possible to factor the operand into two 16-bit factors X and Y. The
original multiplication can be replaced with (A * (X * Y)) = ((A * X) *
Y). This requires two instructions without a temporary register.
I may have gone a bit overboard with optimizing the factorization
routine. It was a fun brainteaser, and I couldn't put it down. :) On my
1.3GHz Ice Lake, a standalone test could chug through 1,000,000 randomly
selected values in about 5.7 seconds. This is about 9x the performance
of the obvious, straightforward implementation that I started with.
v2: Drop an unnecessary return. Rearrange logic slightly and rename
variables in factor_uint32 to better match the names used in the large
comment. Both suggested by Caio. Rearrange logic to avoid possibly
using `a` uninitialized. Noticed by Marcin.
v3: Use DIV_ROUND_UP instead of open coding it. Noticed by Caio.
Tiger Lake, Ice Lake, Haswell, and Ivy Bridge had similar results. (Ice Lake shown)
total instructions in shared programs: 19912558 -> 19912526 (<.01%)
instructions in affected programs: 3432 -> 3400 (-0.93%)
helped: 10 / HURT: 0
total cycles in shared programs: 856413218 -> 856412810 (<.01%)
cycles in affected programs: 122032 -> 121624 (-0.33%)
helped: 9 / HURT: 0
No shader-db changes on any other Intel platforms.
Tiger Lake and Ice Lake had similar results. (Ice Lake shown)
Instructions in all programs: 141997227 -> 141996923 (-0.0%)
Instructions helped: 71
Cycles in all programs: 9162524757 -> 9162523886 (-0.0%)
Cycles helped: 63
Cycles hurt: 5
No fossil-db changes on any other Intel platforms.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17718>
2021-02-07 12:12:29 -08:00
|
|
|
|
/**
|
|
|
|
|
|
* Factor an unsigned 32-bit integer.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Attempts to factor \c x into two values that are at most 0xFFFF. If no
|
|
|
|
|
|
* such factorization is possible, either because the value is too large or is
|
|
|
|
|
|
* prime, both \c result_a and \c result_b will be zero.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static void
|
|
|
|
|
|
factor_uint32(uint32_t x, unsigned *result_a, unsigned *result_b)
|
|
|
|
|
|
{
|
|
|
|
|
|
/* This is necessary to prevent various opportunities for division by zero
|
|
|
|
|
|
* below.
|
|
|
|
|
|
*/
|
|
|
|
|
|
assert(x > 0xffff);
|
|
|
|
|
|
|
|
|
|
|
|
/* This represents the actual expected constraints on the input. Namely,
|
|
|
|
|
|
* both the upper and lower words should be > 1.
|
|
|
|
|
|
*/
|
|
|
|
|
|
assert(x >= 0x00020002);
|
|
|
|
|
|
|
|
|
|
|
|
*result_a = 0;
|
|
|
|
|
|
*result_b = 0;
|
|
|
|
|
|
|
|
|
|
|
|
/* The value is too large to factor with the constraints. */
|
|
|
|
|
|
if (x > (0xffffu * 0xffffu))
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
|
|
/* A non-prime number will have the form p*q*d where p is some prime
|
|
|
|
|
|
* number, q > 1, and 1 <= d <= q. To meet the constraints of this
|
|
|
|
|
|
* function, (p*d) < 0x10000. This implies d <= floor(0xffff / p).
|
|
|
|
|
|
* Furthermore, since q < 0x10000, d >= floor(x / (0xffff * p)). Finally,
|
|
|
|
|
|
* floor(x / (0xffff * p)) <= d <= floor(0xffff / p).
|
|
|
|
|
|
*
|
|
|
|
|
|
* The observation is finding the largest possible value of p reduces the
|
|
|
|
|
|
* possible range of d. After selecting p, all values of d in this range
|
|
|
|
|
|
* are tested until a factorization is found. The size of the range of
|
|
|
|
|
|
* possible values of d sets an upper bound on the run time of the
|
|
|
|
|
|
* function.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static const uint16_t primes[256] = {
|
|
|
|
|
|
2, 3, 5, 7, 11, 13, 17, 19,
|
|
|
|
|
|
23, 29, 31, 37, 41, 43, 47, 53,
|
|
|
|
|
|
59, 61, 67, 71, 73, 79, 83, 89,
|
|
|
|
|
|
97, 101, 103, 107, 109, 113, 127, 131, /* 32 */
|
|
|
|
|
|
137, 139, 149, 151, 157, 163, 167, 173,
|
|
|
|
|
|
179, 181, 191, 193, 197, 199, 211, 223,
|
|
|
|
|
|
227, 229, 233, 239, 241, 251, 257, 263,
|
|
|
|
|
|
269, 271, 277, 281, 283, 293, 307, 311, /* 64 */
|
|
|
|
|
|
313, 317, 331, 337, 347, 349, 353, 359,
|
|
|
|
|
|
367, 373, 379, 383, 389, 397, 401, 409,
|
|
|
|
|
|
419, 421, 431, 433, 439, 443, 449, 457,
|
|
|
|
|
|
461, 463, 467, 479, 487, 491, 499, 503, /* 96 */
|
|
|
|
|
|
509, 521, 523, 541, 547, 557, 563, 569,
|
|
|
|
|
|
571, 577, 587, 593, 599, 601, 607, 613,
|
|
|
|
|
|
617, 619, 631, 641, 643, 647, 653, 659,
|
|
|
|
|
|
661, 673, 677, 683, 691, 701, 709, 719, /* 128 */
|
|
|
|
|
|
727, 733, 739, 743, 751, 757, 761, 769,
|
|
|
|
|
|
773, 787, 797, 809, 811, 821, 823, 827,
|
|
|
|
|
|
829, 839, 853, 857, 859, 863, 877, 881,
|
|
|
|
|
|
883, 887, 907, 911, 919, 929, 937, 941, /* 160 */
|
|
|
|
|
|
947, 953, 967, 971, 977, 983, 991, 997,
|
|
|
|
|
|
1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049,
|
|
|
|
|
|
1051, 1061, 1063, 1069, 1087, 1091, 1093, 1097,
|
|
|
|
|
|
1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, /* 192 */
|
|
|
|
|
|
1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223,
|
|
|
|
|
|
1229, 1231, 1237, 1249, 1259, 1277, 1279, 1283,
|
|
|
|
|
|
1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321,
|
|
|
|
|
|
1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, /* 224 */
|
|
|
|
|
|
1427, 1429, 1433, 1439, 1447, 1451, 1453, 1459,
|
|
|
|
|
|
1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511,
|
|
|
|
|
|
1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571,
|
|
|
|
|
|
1579, 1583, 1597, 1601, 1607, 1609, 1613, 1619, /* 256 */
|
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
|
unsigned p;
|
|
|
|
|
|
unsigned x_div_p;
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = ARRAY_SIZE(primes) - 1; i >= 0; i--) {
|
|
|
|
|
|
p = primes[i];
|
|
|
|
|
|
x_div_p = x / p;
|
|
|
|
|
|
|
|
|
|
|
|
if ((x_div_p * p) == x)
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* A prime factor was not found. */
|
|
|
|
|
|
if (x_div_p * p != x)
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
|
|
/* Terminate early if d=1 is a solution. */
|
|
|
|
|
|
if (x_div_p < 0x10000) {
|
|
|
|
|
|
*result_a = x_div_p;
|
|
|
|
|
|
*result_b = p;
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Pick the maximum possible value for 'd'. It's important that the loop
|
|
|
|
|
|
* below execute while d <= max_d because max_d is a valid value. Having
|
|
|
|
|
|
* the wrong loop bound would cause 1627*1367*47 (0x063b0c83) to be
|
|
|
|
|
|
* incorrectly reported as not being factorable. The problem would occur
|
|
|
|
|
|
* with any value that is a factor of two primes in the table and one prime
|
|
|
|
|
|
* not in the table.
|
|
|
|
|
|
*/
|
|
|
|
|
|
const unsigned max_d = 0xffff / p;
|
|
|
|
|
|
|
|
|
|
|
|
/* Pick an initial value of 'd' that (combined with rejecting too large
|
|
|
|
|
|
* values above) guarantees that 'q' will always be small enough.
|
|
|
|
|
|
* DIV_ROUND_UP is used to prevent 'd' from being zero.
|
|
|
|
|
|
*/
|
|
|
|
|
|
for (unsigned d = DIV_ROUND_UP(x_div_p, 0xffff); d <= max_d; d++) {
|
|
|
|
|
|
unsigned q = x_div_p / d;
|
|
|
|
|
|
|
|
|
|
|
|
if ((q * d) == x_div_p) {
|
|
|
|
|
|
assert(p * d * q == x);
|
|
|
|
|
|
assert((p * d) < 0x10000);
|
|
|
|
|
|
|
|
|
|
|
|
*result_a = q;
|
|
|
|
|
|
*result_b = p * d;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Since every value of 'd' is tried, as soon as 'd' is larger
|
|
|
|
|
|
* than 'q', we're just re-testing combinations that have
|
|
|
|
|
|
* already been tested.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (d > q)
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-07-11 16:56:05 -07:00
|
|
|
|
void
|
2019-07-10 16:48:01 -07:00
|
|
|
|
fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
|
2019-07-11 16:56:05 -07:00
|
|
|
|
{
|
2019-07-10 16:48:01 -07:00
|
|
|
|
const fs_builder ibld(this, block, inst);
|
|
|
|
|
|
|
intel/fs: Fix bounds checking for integer multiplication lowering
The previous bounds checking would cause
mul(8) g121<1>D g120<8,8,1>D 0xec4dD
to be lowered to
mul(8) g121<1>D g120<8,8,1>D 0xec4dUW
mul(8) g41<1>D g120<8,8,1>D 0x0000UW
add(8) g121.1<2>UW g121.1<16,8,2>UW g41<16,8,2>UW
Instead of picking the bounds (and the new type) based on the old type,
pick the new type based on the value only.
This helps a few fossil-db shaders in Witcher 3 and Geekbench5. No
changes on any other Intel platforms.
Tiger Lake
Instructions in all programs: 157581069 -> 157580768 (-0.0%)
Instructions helped: 24
Cycles in all programs: 7566979620 -> 7566977172 (-0.0%)
Cycles helped: 22
Cycles hurt: 4
Ice Lake
Instructions in all programs: 141998965 -> 141998667 (-0.0%)
Instructions helped: 26
Cycles in all programs: 9162568666 -> 9162565297 (-0.0%)
Cycles helped: 24
Cycles hurt: 2
Skylake
No changes.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17718>
2022-02-03 15:48:28 -08:00
|
|
|
|
/* It is correct to use inst->src[1].d in both end of the comparison.
|
|
|
|
|
|
* Using .ud in the UINT16_MAX comparison would cause any negative value to
|
|
|
|
|
|
* fail the check.
|
|
|
|
|
|
*/
|
2019-12-16 13:37:41 -08:00
|
|
|
|
if (inst->src[1].file == IMM &&
|
intel/fs: Fix bounds checking for integer multiplication lowering
The previous bounds checking would cause
mul(8) g121<1>D g120<8,8,1>D 0xec4dD
to be lowered to
mul(8) g121<1>D g120<8,8,1>D 0xec4dUW
mul(8) g41<1>D g120<8,8,1>D 0x0000UW
add(8) g121.1<2>UW g121.1<16,8,2>UW g41<16,8,2>UW
Instead of picking the bounds (and the new type) based on the old type,
pick the new type based on the value only.
This helps a few fossil-db shaders in Witcher 3 and Geekbench5. No
changes on any other Intel platforms.
Tiger Lake
Instructions in all programs: 157581069 -> 157580768 (-0.0%)
Instructions helped: 24
Cycles in all programs: 7566979620 -> 7566977172 (-0.0%)
Cycles helped: 22
Cycles hurt: 4
Ice Lake
Instructions in all programs: 141998965 -> 141998667 (-0.0%)
Instructions helped: 26
Cycles in all programs: 9162568666 -> 9162565297 (-0.0%)
Cycles helped: 24
Cycles hurt: 2
Skylake
No changes.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17718>
2022-02-03 15:48:28 -08:00
|
|
|
|
(inst->src[1].d >= INT16_MIN && inst->src[1].d <= UINT16_MAX)) {
|
2019-07-11 16:56:05 -07:00
|
|
|
|
/* The MUL instruction isn't commutative. On Gen <= 6, only the low
|
|
|
|
|
|
* 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
|
|
|
|
|
|
* src1 are used.
|
|
|
|
|
|
*
|
|
|
|
|
|
* If multiplying by an immediate value that fits in 16-bits, do a
|
|
|
|
|
|
* single MUL instruction with that value in the proper location.
|
|
|
|
|
|
*/
|
intel/fs: Fix bounds checking for integer multiplication lowering
The previous bounds checking would cause
mul(8) g121<1>D g120<8,8,1>D 0xec4dD
to be lowered to
mul(8) g121<1>D g120<8,8,1>D 0xec4dUW
mul(8) g41<1>D g120<8,8,1>D 0x0000UW
add(8) g121.1<2>UW g121.1<16,8,2>UW g41<16,8,2>UW
Instead of picking the bounds (and the new type) based on the old type,
pick the new type based on the value only.
This helps a few fossil-db shaders in Witcher 3 and Geekbench5. No
changes on any other Intel platforms.
Tiger Lake
Instructions in all programs: 157581069 -> 157580768 (-0.0%)
Instructions helped: 24
Cycles in all programs: 7566979620 -> 7566977172 (-0.0%)
Cycles helped: 22
Cycles hurt: 4
Ice Lake
Instructions in all programs: 141998965 -> 141998667 (-0.0%)
Instructions helped: 26
Cycles in all programs: 9162568666 -> 9162565297 (-0.0%)
Cycles helped: 24
Cycles hurt: 2
Skylake
No changes.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17718>
2022-02-03 15:48:28 -08:00
|
|
|
|
const bool ud = (inst->src[1].d >= 0);
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver < 7) {
|
2019-07-11 16:56:05 -07:00
|
|
|
|
fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type);
|
|
|
|
|
|
ibld.MOV(imm, inst->src[1]);
|
|
|
|
|
|
ibld.MUL(inst->dst, imm, inst->src[0]);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
ibld.MUL(inst->dst, inst->src[0],
|
|
|
|
|
|
ud ? brw_imm_uw(inst->src[1].ud)
|
|
|
|
|
|
: brw_imm_w(inst->src[1].d));
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
2021-03-29 15:46:12 -07:00
|
|
|
|
/* Gen < 8 (and some Gfx8+ low-power parts like Cherryview) cannot
|
2019-07-11 16:56:05 -07:00
|
|
|
|
* do 32-bit integer multiplication in one instruction, but instead
|
|
|
|
|
|
* must do a sequence (which actually calculates a 64-bit result):
|
|
|
|
|
|
*
|
|
|
|
|
|
* mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
|
|
|
|
|
|
* mach(8) null g3<8,8,1>D g4<8,8,1>D
|
|
|
|
|
|
* mov(8) g2<1>D acc0<8,8,1>D
|
|
|
|
|
|
*
|
|
|
|
|
|
* But on Gen > 6, the ability to use second accumulator register
|
|
|
|
|
|
* (acc1) for non-float data types was removed, preventing a simple
|
|
|
|
|
|
* implementation in SIMD16. A 16-channel result can be calculated by
|
|
|
|
|
|
* executing the three instructions twice in SIMD8, once with quarter
|
|
|
|
|
|
* control of 1Q for the first eight channels and again with 2Q for
|
|
|
|
|
|
* the second eight channels.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Which accumulator register is implicitly accessed (by AccWrEnable
|
|
|
|
|
|
* for instance) is determined by the quarter control. Unfortunately
|
|
|
|
|
|
* Ivybridge (and presumably Baytrail) has a hardware bug in which an
|
|
|
|
|
|
* implicit accumulator access by an instruction with 2Q will access
|
|
|
|
|
|
* acc1 regardless of whether the data type is usable in acc1.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Specifically, the 2Q mach(8) writes acc1 which does not exist for
|
|
|
|
|
|
* integer data types.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Since we only want the low 32-bits of the result, we can do two
|
|
|
|
|
|
* 32-bit x 16-bit multiplies (like the mul and mach are doing), and
|
|
|
|
|
|
* adjust the high result and add them (like the mach is doing):
|
|
|
|
|
|
*
|
|
|
|
|
|
* mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
|
|
|
|
|
|
* mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
|
|
|
|
|
|
* shl(8) g9<1>D g8<8,8,1>D 16D
|
|
|
|
|
|
* add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
|
|
|
|
|
|
*
|
|
|
|
|
|
* We avoid the shl instruction by realizing that we only want to add
|
|
|
|
|
|
* the low 16-bits of the "high" result to the high 16-bits of the
|
|
|
|
|
|
* "low" result and using proper regioning on the add:
|
|
|
|
|
|
*
|
|
|
|
|
|
* mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
|
|
|
|
|
|
* mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
|
|
|
|
|
|
* add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
|
|
|
|
|
|
*
|
|
|
|
|
|
* Since it does not use the (single) accumulator register, we can
|
|
|
|
|
|
* schedule multi-component multiplications much better.
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
bool needs_mov = false;
|
|
|
|
|
|
fs_reg orig_dst = inst->dst;
|
|
|
|
|
|
|
|
|
|
|
|
/* Get a new VGRF for the "low" 32x16-bit multiplication result if
|
|
|
|
|
|
* reusing the original destination is impossible due to hardware
|
|
|
|
|
|
* restrictions, source/destination overlap, or it being the null
|
|
|
|
|
|
* register.
|
|
|
|
|
|
*/
|
|
|
|
|
|
fs_reg low = inst->dst;
|
|
|
|
|
|
if (orig_dst.is_null() || orig_dst.file == MRF ||
|
|
|
|
|
|
regions_overlap(inst->dst, inst->size_written,
|
|
|
|
|
|
inst->src[0], inst->size_read(0)) ||
|
|
|
|
|
|
regions_overlap(inst->dst, inst->size_written,
|
|
|
|
|
|
inst->src[1], inst->size_read(1)) ||
|
|
|
|
|
|
inst->dst.stride >= 4) {
|
|
|
|
|
|
needs_mov = true;
|
|
|
|
|
|
low = fs_reg(VGRF, alloc.allocate(regs_written(inst)),
|
|
|
|
|
|
inst->dst.type);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Get a new VGRF but keep the same stride as inst->dst */
|
|
|
|
|
|
fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type);
|
|
|
|
|
|
high.stride = inst->dst.stride;
|
|
|
|
|
|
high.offset = inst->dst.offset % REG_SIZE;
|
|
|
|
|
|
|
intel/fs: Optimize integer multiplication of large constants by factoring
Many Intel platforms can only perform 32x16 bit multiplication. The
straightforward way to implement 32x32 bit multiplications is by
splitting one of the operands into high and low parts called H and L,
respectively. The full multiplication can be implemented as:
((A * H) << 16) + (A * L)
On Intel platforms, special register accesses can be used to eliminate
the shift operation. This results in three instructions and a temporary
register for most values.
If H or L is 1, then one (or both) of the multiplications will later be
eliminated. On some platforms it may be possible to eliminate the
multiplication when H is 256.
If L is zero (note that H cannot be zero), one of the multiplications
will also be eliminated.
Instead of splitting the operand into high and low parts, it may
possible to factor the operand into two 16-bit factors X and Y. The
original multiplication can be replaced with (A * (X * Y)) = ((A * X) *
Y). This requires two instructions without a temporary register.
I may have gone a bit overboard with optimizing the factorization
routine. It was a fun brainteaser, and I couldn't put it down. :) On my
1.3GHz Ice Lake, a standalone test could chug through 1,000,000 randomly
selected values in about 5.7 seconds. This is about 9x the performance
of the obvious, straightforward implementation that I started with.
v2: Drop an unnecessary return. Rearrange logic slightly and rename
variables in factor_uint32 to better match the names used in the large
comment. Both suggested by Caio. Rearrange logic to avoid possibly
using `a` uninitialized. Noticed by Marcin.
v3: Use DIV_ROUND_UP instead of open coding it. Noticed by Caio.
Tiger Lake, Ice Lake, Haswell, and Ivy Bridge had similar results. (Ice Lake shown)
total instructions in shared programs: 19912558 -> 19912526 (<.01%)
instructions in affected programs: 3432 -> 3400 (-0.93%)
helped: 10 / HURT: 0
total cycles in shared programs: 856413218 -> 856412810 (<.01%)
cycles in affected programs: 122032 -> 121624 (-0.33%)
helped: 9 / HURT: 0
No shader-db changes on any other Intel platforms.
Tiger Lake and Ice Lake had similar results. (Ice Lake shown)
Instructions in all programs: 141997227 -> 141996923 (-0.0%)
Instructions helped: 71
Cycles in all programs: 9162524757 -> 9162523886 (-0.0%)
Cycles helped: 63
Cycles hurt: 5
No fossil-db changes on any other Intel platforms.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17718>
2021-02-07 12:12:29 -08:00
|
|
|
|
bool do_addition = true;
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver >= 7) {
|
2021-03-29 17:15:41 -07:00
|
|
|
|
/* From Wa_1604601757:
|
2020-08-07 13:11:08 -07:00
|
|
|
|
*
|
|
|
|
|
|
* "When multiplying a DW and any lower precision integer, source modifier
|
|
|
|
|
|
* is not supported."
|
|
|
|
|
|
*
|
|
|
|
|
|
* An unsupported negate modifier on src[1] would ordinarily be
|
|
|
|
|
|
* lowered by the subsequent lower_regioning pass. In this case that
|
|
|
|
|
|
* pass would spawn another dword multiply. Instead, lower the
|
|
|
|
|
|
* modifier first.
|
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
const bool source_mods_unsupported = (devinfo->ver >= 12);
|
2020-08-07 13:11:08 -07:00
|
|
|
|
|
|
|
|
|
|
if (inst->src[1].abs || (inst->src[1].negate &&
|
|
|
|
|
|
source_mods_unsupported))
|
2019-07-11 16:56:05 -07:00
|
|
|
|
lower_src_modifiers(this, block, inst, 1);
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->src[1].file == IMM) {
|
intel/fs: Optimize integer multiplication of large constants by factoring
Many Intel platforms can only perform 32x16 bit multiplication. The
straightforward way to implement 32x32 bit multiplications is by
splitting one of the operands into high and low parts called H and L,
repsectively. The full multiplication can be implemented as:
((A * H) << 16) + (A * L)
On Intel platforms, special register accesses can be used to eliminate
the shift operation. This results in three instructions and a temporary
register for most values.
If H or L is 1, then one (or both) of the multiplications will later be
eliminated. On some platforms it may be possible to eliminate the
multiplication when H is 256.
If L is zero (note that H cannot be zero), one of the multiplications
will also be eliminated.
Instead of splitting the operand into high and low parts, it may
possible to factor the operand into two 16-bit factors X and Y. The
original multiplication can be replaced with (A * (X * Y)) = ((A * X) *
Y). This requires two instructions without a temporary register.
I may have gone a bit overboard with optimizing the factorization
routine. It was a fun brainteaser, and I couldn't put it down. :) On my
1.3GHz Ice Lake, a standalone test could chug through 1,000,000 randomly
selected values in about 5.7 seconds. This is about 9x the performance
of the obvious, straightforward implementation that I started with.
v2: Drop an unnecessary return. Rearrange logic slightly and rename
variables in factor_uint32 to better match the names used in the large
comment. Both suggested by Caio. Rearrange logic to avoid possibly
using `a` uninitialized. Noticed by Marcin.
v3: Use DIV_ROUND_UP instead of open coding it. Noticed by Caio.
Tiger Lake, Ice Lake, Haswell, and Ivy Bridge had similar results. (Ice Lake shown)
total instructions in shared programs: 19912558 -> 19912526 (<.01%)
instructions in affected programs: 3432 -> 3400 (-0.93%)
helped: 10 / HURT: 0
total cycles in shared programs: 856413218 -> 856412810 (<.01%)
cycles in affected programs: 122032 -> 121624 (-0.33%)
helped: 9 / HURT: 0
No shader-db changes on any other Intel platforms.
Tiger Lake and Ice Lake had similar results. (Ice Lake shown)
Instructions in all programs: 141997227 -> 141996923 (-0.0%)
Instructions helped: 71
Cycles in all programs: 9162524757 -> 9162523886 (-0.0%)
Cycles helped: 63
Cycles hurt: 5
No fossil-db changes on any other Intel platforms.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17718>
2021-02-07 12:12:29 -08:00
|
|
|
|
unsigned a;
|
|
|
|
|
|
unsigned b;
|
|
|
|
|
|
|
|
|
|
|
|
/* If the immediate value can be factored into two values, A and
|
|
|
|
|
|
* B, that each fit in 16-bits, the multiplication result can
|
|
|
|
|
|
* instead be calculated as (src1 * (A * B)) = ((src1 * A) * B).
|
|
|
|
|
|
* This saves an operation (the addition) and a temporary register
|
|
|
|
|
|
* (high).
|
|
|
|
|
|
*
|
|
|
|
|
|
* Skip the optimization if either the high word or the low word
|
|
|
|
|
|
* is 0 or 1. In these conditions, at least one of the
|
|
|
|
|
|
* multiplications generated by the straightforward method will be
|
|
|
|
|
|
* eliminated anyway.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (inst->src[1].ud > 0x0001ffff &&
|
|
|
|
|
|
(inst->src[1].ud & 0xffff) > 1) {
|
|
|
|
|
|
factor_uint32(inst->src[1].ud, &a, &b);
|
|
|
|
|
|
|
|
|
|
|
|
if (a != 0) {
|
|
|
|
|
|
ibld.MUL(low, inst->src[0], brw_imm_uw(a));
|
|
|
|
|
|
ibld.MUL(low, low, brw_imm_uw(b));
|
|
|
|
|
|
do_addition = false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (do_addition) {
|
|
|
|
|
|
ibld.MUL(low, inst->src[0],
|
|
|
|
|
|
brw_imm_uw(inst->src[1].ud & 0xffff));
|
|
|
|
|
|
ibld.MUL(high, inst->src[0],
|
|
|
|
|
|
brw_imm_uw(inst->src[1].ud >> 16));
|
|
|
|
|
|
}
|
2019-07-11 16:56:05 -07:00
|
|
|
|
} else {
|
|
|
|
|
|
ibld.MUL(low, inst->src[0],
|
|
|
|
|
|
subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
|
|
|
|
|
|
ibld.MUL(high, inst->src[0],
|
|
|
|
|
|
subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
if (inst->src[0].abs)
|
|
|
|
|
|
lower_src_modifiers(this, block, inst, 0);
|
|
|
|
|
|
|
|
|
|
|
|
ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
|
|
|
|
|
|
inst->src[1]);
|
|
|
|
|
|
ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
|
|
|
|
|
|
inst->src[1]);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
intel/fs: Optimize integer multiplication of large constants by factoring
Many Intel platforms can only perform 32x16 bit multiplication. The
straightforward way to implement 32x32 bit multiplications is by
splitting one of the operands into high and low parts called H and L,
repsectively. The full multiplication can be implemented as:
((A * H) << 16) + (A * L)
On Intel platforms, special register accesses can be used to eliminate
the shift operation. This results in three instructions and a temporary
register for most values.
If H or L is 1, then one (or both) of the multiplications will later be
eliminated. On some platforms it may be possible to eliminate the
multiplication when H is 256.
If L is zero (note that H cannot be zero), one of the multiplications
will also be eliminated.
Instead of splitting the operand into high and low parts, it may
possible to factor the operand into two 16-bit factors X and Y. The
original multiplication can be replaced with (A * (X * Y)) = ((A * X) *
Y). This requires two instructions without a temporary register.
I may have gone a bit overboard with optimizing the factorization
routine. It was a fun brainteaser, and I couldn't put it down. :) On my
1.3GHz Ice Lake, a standalone test could chug through 1,000,000 randomly
selected values in about 5.7 seconds. This is about 9x the performance
of the obvious, straightforward implementation that I started with.
v2: Drop an unnecessary return. Rearrange logic slightly and rename
variables in factor_uint32 to better match the names used in the large
comment. Both suggested by Caio. Rearrange logic to avoid possibly
using `a` uninitialized. Noticed by Marcin.
v3: Use DIV_ROUND_UP instead of open coding it. Noticed by Caio.
Tiger Lake, Ice Lake, Haswell, and Ivy Bridge had similar results. (Ice Lake shown)
total instructions in shared programs: 19912558 -> 19912526 (<.01%)
instructions in affected programs: 3432 -> 3400 (-0.93%)
helped: 10 / HURT: 0
total cycles in shared programs: 856413218 -> 856412810 (<.01%)
cycles in affected programs: 122032 -> 121624 (-0.33%)
helped: 9 / HURT: 0
No shader-db changes on any other Intel platforms.
Tiger Lake and Ice Lake had similar results. (Ice Lake shown)
Instructions in all programs: 141997227 -> 141996923 (-0.0%)
Instructions helped: 71
Cycles in all programs: 9162524757 -> 9162523886 (-0.0%)
Cycles helped: 63
Cycles hurt: 5
No fossil-db changes on any other Intel platforms.
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17718>
2021-02-07 12:12:29 -08:00
|
|
|
|
if (do_addition) {
|
|
|
|
|
|
ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1),
|
|
|
|
|
|
subscript(low, BRW_REGISTER_TYPE_UW, 1),
|
|
|
|
|
|
subscript(high, BRW_REGISTER_TYPE_UW, 0));
|
|
|
|
|
|
}
|
2019-07-11 16:56:05 -07:00
|
|
|
|
|
|
|
|
|
|
if (needs_mov || inst->conditional_mod)
|
|
|
|
|
|
set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-07-11 15:08:03 -07:00
|
|
|
|
/**
 * Lower a 64-bit (Q/UQ) integer MUL into a sequence of 32-bit operations.
 *
 * The original instruction is not modified here; the caller
 * (lower_integer_multiplication) removes it after this emits the
 * replacement sequence before it.
 */
void
fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
{
   /* Builder positioned at the instruction being lowered, so all emitted
    * instructions are inserted before it.
    */
   const fs_builder ibld(this, block, inst);

   /* Considering two 64-bit integers ab and cd where each letter        ab
    * corresponds to 32 bits, we get a 128-bit result WXYZ. We         * cd
    * only need to provide the YZ part of the result.               -------
    *                                                                    BD
    *  Only BD needs to be 64 bits. For AD and BC we only care       +  AD
    *  about the lower 32 bits (since they are part of the upper     +  BC
    *  32 bits of our result). AC is not needed since it starts      + AC
    *  on the 65th bit of the result.                               -------
    *                                                                  WXYZ
    */
   unsigned int q_regs = regs_written(inst);
   /* A 32-bit (D-sized) temporary needs half as many registers as the
    * 64-bit destination; round up for odd register counts.
    */
   unsigned int d_regs = (q_regs + 1) / 2;

   fs_reg bd(VGRF, alloc.allocate(q_regs), BRW_REGISTER_TYPE_UQ);
   fs_reg ad(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
   fs_reg bc(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);

   /* Here we need the full 64 bit result for 32b * 32b. */
   if (devinfo->has_integer_dword_mul) {
      /* Hardware can produce the full 64-bit product of the two low
       * dwords directly.
       */
      ibld.MUL(bd, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
               subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
   } else {
      /* Emulate the 32x32 -> 64-bit multiply with MUL-into-accumulator
       * (low half) plus MACH (high half), then assemble the two dwords
       * into the UQ temporary.
       */
      fs_reg bd_high(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
      fs_reg bd_low(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
      /* Suboffset the accumulator so second-half (group > 0) slots land
       * in the correct accumulator channels.
       */
      const unsigned acc_width = reg_unit(devinfo) * 8;
      fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD),
                             inst->group % acc_width);

      fs_inst *mul = ibld.MUL(acc,
                            subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
                            subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
      /* Mark the implicit accumulator write so later passes know acc is
       * live between the MUL and the MACH/MOV below.
       */
      mul->writes_accumulator = true;

      ibld.MACH(bd_high, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
                subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
      ibld.MOV(bd_low, acc);

      /* The two partial MOVs below never write bd as a whole; UNDEF keeps
       * liveness analysis from treating bd as live-in.
       */
      ibld.UNDEF(bd);
      ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low);
      ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high);
   }

   /* Cross products AD and BC: only their low 32 bits matter, since they
    * contribute to the upper dword of the 64-bit result.
    */
   ibld.MUL(ad, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
            subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
   ibld.MUL(bc, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
            subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1));

   ibld.ADD(ad, ad, bc);
   ibld.ADD(subscript(bd, BRW_REGISTER_TYPE_UD, 1),
            subscript(bd, BRW_REGISTER_TYPE_UD, 1), ad);

   if (devinfo->has_64bit_int) {
      ibld.MOV(inst->dst, bd);
   } else {
      /* No 64-bit integer MOV available: copy the result one dword at a
       * time.
       */
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);
      ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
               subscript(bd, BRW_REGISTER_TYPE_UD, 0));
      ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
               subscript(bd, BRW_REGISTER_TYPE_UD, 1));
   }
}
|
|
|
|
|
|
|
2019-07-11 16:56:05 -07:00
|
|
|
|
/**
 * Lower SHADER_OPCODE_MULH ("Multiply Accumulate High") into a MUL that
 * writes the accumulator followed by a MACH into the destination, then
 * patch the emitted instructions for per-generation restrictions.
 *
 * The original instruction is removed by the caller
 * (lower_integer_multiplication).
 */
void
fs_visitor::lower_mulh_inst(fs_inst *inst, bblock_t *block)
{
   const fs_builder ibld(this, block, inst);

   /* According to the BDW+ BSpec page for the "Multiply Accumulate
    * High" instruction:
    *
    *  "An added preliminary mov is required for source modification on
    *   src1:
    *      mov (8) r3.0<1>:d -r3<8;8,1>:d
    *      mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
    *      mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
    */
   if (devinfo->ver >= 8 && (inst->src[1].negate || inst->src[1].abs))
      lower_src_modifiers(this, block, inst, 1);

   /* Should have been lowered to 8-wide. */
   assert(inst->exec_size <= get_lowered_simd_width(compiler, inst));
   /* Suboffset the accumulator so a second-half (group > 0) instruction
    * addresses the right accumulator channels.
    */
   const unsigned acc_width = reg_unit(devinfo) * 8;
   const fs_reg acc = suboffset(retype(brw_acc_reg(inst->exec_size), inst->dst.type),
                                inst->group % acc_width);
   fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
   fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);

   if (devinfo->ver >= 8) {
      /* Until Gfx8, integer multiplies read 32-bits from one source,
       * and 16-bits from the other, and relying on the MACH instruction
       * to generate the high bits of the result.
       *
       * On Gfx8, the multiply instruction does a full 32x32-bit
       * multiply, but in order to do a 64-bit multiply we can simulate
       * the previous behavior and then use a MACH instruction.
       */
      assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
             mul->src[1].type == BRW_REGISTER_TYPE_UD);
      /* Narrow src1 of the MUL to the low 16 bits by retyping to UW and
       * doubling the stride (reads every other word of the D/UD data).
       */
      mul->src[1].type = BRW_REGISTER_TYPE_UW;
      mul->src[1].stride *= 2;

      if (mul->src[1].file == IMM) {
         /* For an immediate, rebuild it as a proper UW immediate instead
          * of relying on the retype above.
          */
         mul->src[1] = brw_imm_uw(mul->src[1].ud);
      }
   } else if (devinfo->verx10 == 70 &&
              inst->group > 0) {
      /* Among other things the quarter control bits influence which
       * accumulator register is used by the hardware for instructions
       * that access the accumulator implicitly (e.g. MACH). A
       * second-half instruction would normally map to acc1, which
       * doesn't exist on Gfx7 and up (the hardware does emulate it for
       * floating-point instructions *only* by taking advantage of the
       * extra precision of acc0 not normally used for floating point
       * arithmetic).
       *
       * HSW and up are careful enough not to try to access an
       * accumulator register that doesn't exist, but on earlier Gfx7
       * hardware we need to make sure that the quarter control bits are
       * zero to avoid non-deterministic behaviour and emit an extra MOV
       * to get the result masked correctly according to the current
       * channel enables.
       */
      mach->group = 0;
      mach->force_writemask_all = true;
      mach->dst = ibld.vgrf(inst->dst.type);
      ibld.MOV(inst->dst, mach->dst);
   }
}
|
|
|
|
|
|
|
2015-05-11 09:29:56 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::lower_integer_multiplication()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
2015-08-05 16:47:18 +03:00
|
|
|
|
if (inst->opcode == BRW_OPCODE_MUL) {
|
2019-12-05 08:25:34 -08:00
|
|
|
|
/* If the instruction is already in a form that does not need lowering,
|
|
|
|
|
|
* return early.
|
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver >= 7) {
|
2019-12-05 08:25:34 -08:00
|
|
|
|
if (type_sz(inst->src[1].type) < 4 && type_sz(inst->src[0].type) <= 4)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
if (type_sz(inst->src[0].type) < 4 && type_sz(inst->src[1].type) <= 4)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-07-11 15:08:03 -07:00
|
|
|
|
if ((inst->dst.type == BRW_REGISTER_TYPE_Q ||
|
|
|
|
|
|
inst->dst.type == BRW_REGISTER_TYPE_UQ) &&
|
|
|
|
|
|
(inst->src[0].type == BRW_REGISTER_TYPE_Q ||
|
|
|
|
|
|
inst->src[0].type == BRW_REGISTER_TYPE_UQ) &&
|
|
|
|
|
|
(inst->src[1].type == BRW_REGISTER_TYPE_Q ||
|
|
|
|
|
|
inst->src[1].type == BRW_REGISTER_TYPE_UQ)) {
|
|
|
|
|
|
lower_mul_qword_inst(inst, block);
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
} else if (!inst->dst.is_accumulator() &&
|
|
|
|
|
|
(inst->dst.type == BRW_REGISTER_TYPE_D ||
|
|
|
|
|
|
inst->dst.type == BRW_REGISTER_TYPE_UD) &&
|
2019-01-29 16:34:30 -08:00
|
|
|
|
(!devinfo->has_integer_dword_mul ||
|
|
|
|
|
|
devinfo->verx10 >= 125)) {
|
2019-07-10 17:03:48 -07:00
|
|
|
|
lower_mul_dword_inst(inst, block);
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
2015-08-06 14:04:00 +03:00
|
|
|
|
} else if (inst->opcode == SHADER_OPCODE_MULH) {
|
2019-07-10 16:48:01 -07:00
|
|
|
|
lower_mulh_inst(inst, block);
|
2019-07-10 17:03:48 -07:00
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
progress = true;
|
2015-05-11 09:29:56 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2016-03-13 19:26:37 -07:00
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
2015-05-11 09:29:56 -07:00
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-02-11 12:27:02 -08:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::lower_minmax()
|
|
|
|
|
|
{
|
2021-03-29 14:41:58 -07:00
|
|
|
|
assert(devinfo->ver < 6);
|
2016-02-11 12:27:02 -08:00
|
|
|
|
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
|
|
|
|
|
const fs_builder ibld(this, block, inst);
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->opcode == BRW_OPCODE_SEL &&
|
|
|
|
|
|
inst->predicate == BRW_PREDICATE_NONE) {
|
intel/compiler: Use CMPN for min / max on Gen4 and Gen5
On Intel platforms before Gen6, there is no min or max instruction.
Instead, a comparison instruction (*more on this below) and a SEL
instruction are used. Per other IEEE rules, the regular comparison
instruction, CMP, will always return false if either source is NaN. A
sequence like
cmp.l.f0.0(16) null<1>F g30<8,8,1>F g22<8,8,1>F
(+f0.0) sel(16) g8<1>F g30<8,8,1>F g22<8,8,1>F
will generate the wrong result for min if g22 is NaN. The CMP will
return false, and the SEL will pick g22.
To account for this, the hardware has a special comparison instruction
CMPN. This instruction behaves just like CMP, except if the second
source is NaN, it will return true. The intention is to use it for min
and max. This sequence will always generate the correct result:
cmpn.l.f0.0(16) null<1>F g30<8,8,1>F g22<8,8,1>F
(+f0.0) sel(16) g8<1>F g30<8,8,1>F g22<8,8,1>F
The problem is... for whatever reason, we don't emit CMPN. There was
even a comment in lower_minmax that calls out this very issue! The bug
is actually older than the "Fixes" below even implies. That's just when
the comment was added. That we know of, we never observed a failure
until #4254.
If src1 is known to be a number, either because it's not float or it's
an immediate number, use CMP. This allows cmod propagation to still do
its thing. Without this slight optimization, about 8,300 shaders from
shader-db are hurt on Iron Lake.
Fixes the following piglit tests (from piglit!475):
tests/spec/glsl-1.20/execution/fs-nan-builtin-max.shader_test
tests/spec/glsl-1.20/execution/fs-nan-builtin-min.shader_test
tests/spec/glsl-1.20/execution/vs-nan-builtin-max.shader_test
tests/spec/glsl-1.20/execution/vs-nan-builtin-min.shader_test
Closes: #4254
Fixes: 2f2c00c7279 ("i965: Lower min/max after optimization on Gen4/5.")
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Iron Lake and GM45 had similar results. (Iron Lake shown)
total instructions in shared programs: 8115134 -> 8115135 (<.01%)
instructions in affected programs: 229 -> 230 (0.44%)
helped: 0
HURT: 1
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9027>
2021-02-13 14:11:58 -08:00
|
|
|
|
/* If src1 is an immediate value that is not NaN, then it can't be
|
|
|
|
|
|
* NaN. In that case, emit CMP because it is much better for cmod
|
2021-03-29 15:46:12 -07:00
|
|
|
|
* propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't
|
intel/compiler: Use CMPN for min / max on Gen4 and Gen5
On Intel platforms before Gen6, there is no min or max instruction.
Instead, a comparison instruction (*more on this below) and a SEL
instruction are used. Per other IEEE rules, the regular comparison
instruction, CMP, will always return false if either source is NaN. A
sequence like
cmp.l.f0.0(16) null<1>F g30<8,8,1>F g22<8,8,1>F
(+f0.0) sel(16) g8<1>F g30<8,8,1>F g22<8,8,1>F
will generate the wrong result for min if g22 is NaN. The CMP will
return false, and the SEL will pick g22.
To account for this, the hardware has a special comparison instruction
CMPN. This instruction behaves just like CMP, except if the second
source is NaN, it will return true. The intention is to use it for min
and max. This sequence will always generate the correct result:
cmpn.l.f0.0(16) null<1>F g30<8,8,1>F g22<8,8,1>F
(+f0.0) sel(16) g8<1>F g30<8,8,1>F g22<8,8,1>F
The problem is... for whatever reason, we don't emit CMPN. There was
even a comment in lower_minmax that calls out this very issue! The bug
is actually older than the "Fixes" below even implies. That's just when
the comment was added. That we know of, we never observed a failure
until #4254.
If src1 is known to be a number, either because it's not float or it's
an immediate number, use CMP. This allows cmod propagation to still do
its thing. Without this slight optimization, about 8,300 shaders from
shader-db are hurt on Iron Lake.
Fixes the following piglit tests (from piglit!475):
tests/spec/glsl-1.20/execution/fs-nan-builtin-max.shader_test
tests/spec/glsl-1.20/execution/fs-nan-builtin-min.shader_test
tests/spec/glsl-1.20/execution/vs-nan-builtin-max.shader_test
tests/spec/glsl-1.20/execution/vs-nan-builtin-min.shader_test
Closes: #4254
Fixes: 2f2c00c7279 ("i965: Lower min/max after optimization on Gen4/5.")
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Iron Lake and GM45 had similar results. (Iron Lake shown)
total instructions in shared programs: 8115134 -> 8115135 (<.01%)
instructions in affected programs: 229 -> 230 (0.44%)
helped: 0
HURT: 1
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9027>
2021-02-13 14:11:58 -08:00
|
|
|
|
* support HF or DF, so it is not necessary to check for those.
|
2016-02-11 12:27:02 -08:00
|
|
|
|
*/
|
intel/compiler: Use CMPN for min / max on Gen4 and Gen5
On Intel platforms before Gen6, there is no min or max instruction.
Instead, a comparison instruction (*more on this below) and a SEL
instruction are used. Per other IEEE rules, the regular comparison
instruction, CMP, will always return false if either source is NaN. A
sequence like
cmp.l.f0.0(16) null<1>F g30<8,8,1>F g22<8,8,1>F
(+f0.0) sel(16) g8<1>F g30<8,8,1>F g22<8,8,1>F
will generate the wrong result for min if g22 is NaN. The CMP will
return false, and the SEL will pick g22.
To account for this, the hardware has a special comparison instruction
CMPN. This instruction behaves just like CMP, except if the second
source is NaN, it will return true. The intention is to use it for min
and max. This sequence will always generate the correct result:
cmpn.l.f0.0(16) null<1>F g30<8,8,1>F g22<8,8,1>F
(+f0.0) sel(16) g8<1>F g30<8,8,1>F g22<8,8,1>F
The problem is... for whatever reason, we don't emit CMPN. There was
even a comment in lower_minmax that calls out this very issue! The bug
is actually older than the "Fixes" below even implies. That's just when
the comment was added. That we know of, we never observed a failure
until #4254.
If src1 is known to be a number, either because it's not float or it's
an immediate number, use CMP. This allows cmod propagation to still do
its thing. Without this slight optimization, about 8,300 shaders from
shader-db are hurt on Iron Lake.
Fixes the following piglit tests (from piglit!475):
tests/spec/glsl-1.20/execution/fs-nan-builtin-max.shader_test
tests/spec/glsl-1.20/execution/fs-nan-builtin-min.shader_test
tests/spec/glsl-1.20/execution/vs-nan-builtin-max.shader_test
tests/spec/glsl-1.20/execution/vs-nan-builtin-min.shader_test
Closes: #4254
Fixes: 2f2c00c7279 ("i965: Lower min/max after optimization on Gen4/5.")
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Iron Lake and GM45 had similar results. (Iron Lake shown)
total instructions in shared programs: 8115134 -> 8115135 (<.01%)
instructions in affected programs: 229 -> 230 (0.44%)
helped: 0
HURT: 1
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9027>
2021-02-13 14:11:58 -08:00
|
|
|
|
if (inst->src[1].type != BRW_REGISTER_TYPE_F ||
|
|
|
|
|
|
(inst->src[1].file == IMM && !isnan(inst->src[1].f))) {
|
|
|
|
|
|
ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
|
|
|
|
|
|
inst->conditional_mod);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1],
|
|
|
|
|
|
inst->conditional_mod);
|
|
|
|
|
|
}
|
2016-02-11 12:27:02 -08:00
|
|
|
|
inst->predicate = BRW_PREDICATE_NORMAL;
|
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NONE;
|
|
|
|
|
|
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2016-03-13 19:26:37 -07:00
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
2016-02-11 12:27:02 -08:00
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2018-09-19 01:28:06 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::lower_sub_sat()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
|
|
|
|
|
const fs_builder ibld(this, block, inst);
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->opcode == SHADER_OPCODE_USUB_SAT ||
|
|
|
|
|
|
inst->opcode == SHADER_OPCODE_ISUB_SAT) {
|
|
|
|
|
|
/* The fundamental problem is the hardware performs source negation
|
|
|
|
|
|
* at the bit width of the source. If the source is 0x80000000D, the
|
|
|
|
|
|
* negation is 0x80000000D. As a result, subtractSaturate(0,
|
|
|
|
|
|
* 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There
|
|
|
|
|
|
* are at least three ways to resolve this:
|
|
|
|
|
|
*
|
|
|
|
|
|
* 1. Use the accumulator for the negated source. The accumulator is
|
|
|
|
|
|
* 33 bits, so our source 0x80000000 is sign-extended to
|
|
|
|
|
|
* 0x1800000000. The negation of which is 0x080000000. This
|
|
|
|
|
|
* doesn't help for 64-bit integers (which are already bigger than
|
|
|
|
|
|
* 33 bits). There are also only 8 accumulators, so SIMD16 or
|
|
|
|
|
|
* SIMD32 instructions would have to be split into multiple SIMD8
|
|
|
|
|
|
* instructions.
|
|
|
|
|
|
*
|
|
|
|
|
|
* 2. Use slightly different math. For any n-bit value x, we know (x
|
|
|
|
|
|
* >> 1) != -(x >> 1). We can use this fact to only do
|
|
|
|
|
|
* subtractions involving (x >> 1). subtractSaturate(a, b) ==
|
|
|
|
|
|
* subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)).
|
|
|
|
|
|
*
|
|
|
|
|
|
* 3. For unsigned sources, it is sufficient to replace the
|
|
|
|
|
|
* subtractSaturate with (a > b) ? a - b : 0.
|
|
|
|
|
|
*
|
|
|
|
|
|
* It may also be possible to use the SUBB instruction. This
|
|
|
|
|
|
* implicitly writes the accumulator, so it could only be used in the
|
|
|
|
|
|
* same situations as #1 above. It is further limited by only
|
|
|
|
|
|
* allowing UD sources.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q &&
|
|
|
|
|
|
inst->src[0].type != BRW_REGISTER_TYPE_UQ) {
|
|
|
|
|
|
fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type);
|
|
|
|
|
|
|
|
|
|
|
|
ibld.MOV(acc, inst->src[1]);
|
|
|
|
|
|
fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]);
|
|
|
|
|
|
add->saturate = true;
|
|
|
|
|
|
add->src[0].negate = true;
|
|
|
|
|
|
} else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) {
|
|
|
|
|
|
/* tmp = src1 >> 1;
|
|
|
|
|
|
* dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp));
|
|
|
|
|
|
*/
|
|
|
|
|
|
fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
|
|
|
|
|
|
fs_reg tmp2 = ibld.vgrf(inst->src[0].type);
|
|
|
|
|
|
fs_reg tmp3 = ibld.vgrf(inst->src[0].type);
|
|
|
|
|
|
fs_inst *add;
|
|
|
|
|
|
|
|
|
|
|
|
ibld.SHR(tmp1, inst->src[1], brw_imm_d(1));
|
|
|
|
|
|
|
|
|
|
|
|
add = ibld.ADD(tmp2, inst->src[1], tmp1);
|
|
|
|
|
|
add->src[1].negate = true;
|
|
|
|
|
|
|
|
|
|
|
|
add = ibld.ADD(tmp3, inst->src[0], tmp1);
|
|
|
|
|
|
add->src[1].negate = true;
|
|
|
|
|
|
add->saturate = true;
|
|
|
|
|
|
|
|
|
|
|
|
add = ibld.ADD(inst->dst, tmp3, tmp2);
|
|
|
|
|
|
add->src[1].negate = true;
|
|
|
|
|
|
add->saturate = true;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* a > b ? a - b : 0 */
|
|
|
|
|
|
ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
|
|
|
|
|
|
BRW_CONDITIONAL_G);
|
|
|
|
|
|
|
|
|
|
|
|
fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]);
|
|
|
|
|
|
add->src[1].negate = !add->src[1].negate;
|
|
|
|
|
|
|
|
|
|
|
|
ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0))
|
|
|
|
|
|
->predicate = BRW_PREDICATE_NORMAL;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2016-03-13 19:26:37 -07:00
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
2018-09-19 01:28:06 -07:00
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2020-01-23 12:50:50 -08:00
|
|
|
|
/**
|
|
|
|
|
|
* Get the mask of SIMD channels enabled during dispatch and not yet disabled
|
|
|
|
|
|
* by discard. Due to the layout of the sample mask in the fragment shader
|
|
|
|
|
|
* thread payload, \p bld is required to have a dispatch_width() not greater
|
|
|
|
|
|
* than 16 for fragment shaders.
|
|
|
|
|
|
*/
|
2022-06-27 12:24:58 -07:00
|
|
|
|
fs_reg
|
|
|
|
|
|
brw_sample_mask_reg(const fs_builder &bld)
|
2020-01-23 12:50:50 -08:00
|
|
|
|
{
|
|
|
|
|
|
const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
|
|
|
|
|
|
|
|
|
|
|
|
if (v->stage != MESA_SHADER_FRAGMENT) {
|
|
|
|
|
|
return brw_imm_ud(0xffffffff);
|
|
|
|
|
|
} else if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) {
|
2020-01-04 16:16:24 -08:00
|
|
|
|
assert(bld.dispatch_width() <= 16);
|
|
|
|
|
|
return brw_flag_subreg(sample_mask_flag_subreg(v) + bld.group() / 16);
|
2020-01-23 12:50:50 -08:00
|
|
|
|
} else {
|
2021-03-29 14:41:58 -07:00
|
|
|
|
assert(v->devinfo->ver >= 6 && bld.dispatch_width() <= 16);
|
2020-01-23 12:50:50 -08:00
|
|
|
|
return retype(brw_vec1_grf((bld.group() >= 16 ? 2 : 1), 7),
|
2020-01-04 16:11:23 -08:00
|
|
|
|
BRW_REGISTER_TYPE_UW);
|
2020-01-23 12:50:50 -08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-08-25 23:59:25 -07:00
|
|
|
|
uint32_t
|
|
|
|
|
|
brw_fb_write_msg_control(const fs_inst *inst,
|
|
|
|
|
|
const struct brw_wm_prog_data *prog_data)
|
|
|
|
|
|
{
|
|
|
|
|
|
uint32_t mctl;
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->opcode == FS_OPCODE_REP_FB_WRITE) {
|
|
|
|
|
|
assert(inst->group == 0 && inst->exec_size == 16);
|
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
|
|
|
|
|
|
} else if (prog_data->dual_src_blend) {
|
|
|
|
|
|
assert(inst->exec_size == 8);
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->group % 16 == 0)
|
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
|
|
|
|
|
|
else if (inst->group % 16 == 8)
|
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
|
|
|
|
|
|
else
|
|
|
|
|
|
unreachable("Invalid dual-source FB write instruction group");
|
|
|
|
|
|
} else {
|
|
|
|
|
|
assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->exec_size == 16)
|
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
|
|
|
|
|
|
else if (inst->exec_size == 8)
|
|
|
|
|
|
mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
|
|
|
|
|
|
else
|
|
|
|
|
|
unreachable("Invalid FB write execution size");
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return mctl;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2022-06-27 12:24:58 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Predicate the specified instruction on the sample mask.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
|
|
|
|
|
brw_emit_predicate_on_sample_mask(const fs_builder &bld, fs_inst *inst)
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(bld.shader->stage == MESA_SHADER_FRAGMENT &&
|
|
|
|
|
|
bld.group() == inst->group &&
|
|
|
|
|
|
bld.dispatch_width() == inst->exec_size);
|
2015-07-13 17:59:34 +03:00
|
|
|
|
|
2022-06-27 12:24:58 -07:00
|
|
|
|
const fs_visitor *v = static_cast<const fs_visitor *>(bld.shader);
|
|
|
|
|
|
const fs_reg sample_mask = brw_sample_mask_reg(bld);
|
|
|
|
|
|
const unsigned subreg = sample_mask_flag_subreg(v);
|
2015-07-13 17:59:34 +03:00
|
|
|
|
|
2022-06-27 12:24:58 -07:00
|
|
|
|
if (brw_wm_prog_data(v->stage_prog_data)->uses_kill) {
|
|
|
|
|
|
assert(sample_mask.file == ARF &&
|
|
|
|
|
|
sample_mask.nr == brw_flag_subreg(subreg).nr &&
|
|
|
|
|
|
sample_mask.subnr == brw_flag_subreg(
|
|
|
|
|
|
subreg + inst->group / 16).subnr);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
bld.group(1, 0).exec_all()
|
|
|
|
|
|
.MOV(brw_flag_subreg(subreg + inst->group / 16), sample_mask);
|
2015-07-13 17:59:34 +03:00
|
|
|
|
}
|
|
|
|
|
|
|
2022-06-27 12:24:58 -07:00
|
|
|
|
if (inst->predicate) {
|
|
|
|
|
|
assert(inst->predicate == BRW_PREDICATE_NORMAL);
|
|
|
|
|
|
assert(!inst->predicate_inverse);
|
|
|
|
|
|
assert(inst->flag_subreg == 0);
|
|
|
|
|
|
/* Combine the sample mask with the existing predicate by using a
|
|
|
|
|
|
* vertical predication mode.
|
2015-11-16 17:23:01 -08:00
|
|
|
|
*/
|
2022-06-27 12:24:58 -07:00
|
|
|
|
inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
|
2015-07-13 17:59:34 +03:00
|
|
|
|
} else {
|
2022-06-27 12:24:58 -07:00
|
|
|
|
inst->flag_subreg = subreg;
|
|
|
|
|
|
inst->predicate = BRW_PREDICATE_NORMAL;
|
|
|
|
|
|
inst->predicate_inverse = false;
|
2015-07-13 17:59:34 +03:00
|
|
|
|
}
|
2015-07-27 16:14:36 +03:00
|
|
|
|
}
|
|
|
|
|
|
|
2019-03-14 10:35:58 +01:00
|
|
|
|
static bool
|
|
|
|
|
|
is_mixed_float_with_fp32_dst(const fs_inst *inst)
|
|
|
|
|
|
{
|
|
|
|
|
|
/* This opcode sometimes uses :W type on the source even if the operand is
|
2021-03-29 15:40:04 -07:00
|
|
|
|
* a :HF, because in gfx7 there is no support for :HF, and thus it uses :W.
|
2019-03-14 10:35:58 +01:00
|
|
|
|
*/
|
|
|
|
|
|
if (inst->opcode == BRW_OPCODE_F16TO32)
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->dst.type != BRW_REGISTER_TYPE_F)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
|
|
|
|
|
if (inst->src[i].type == BRW_REGISTER_TYPE_HF)
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static bool
|
|
|
|
|
|
is_mixed_float_with_packed_fp16_dst(const fs_inst *inst)
|
|
|
|
|
|
{
|
|
|
|
|
|
/* This opcode sometimes uses :W type on the destination even if the
|
2021-03-29 15:40:04 -07:00
|
|
|
|
* destination is a :HF, because in gfx7 there is no support for :HF, and
|
2019-03-14 10:35:58 +01:00
|
|
|
|
* thus it uses :W.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (inst->opcode == BRW_OPCODE_F32TO16 &&
|
|
|
|
|
|
inst->dst.stride == 1)
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->dst.type != BRW_REGISTER_TYPE_HF ||
|
|
|
|
|
|
inst->dst.stride != 1)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
|
|
|
|
|
if (inst->src[i].type == BRW_REGISTER_TYPE_F)
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-20 13:15:49 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Get the closest allowed SIMD width for instruction \p inst accounting for
|
|
|
|
|
|
* some common regioning and execution control restrictions that apply to FPU
|
|
|
|
|
|
* instructions. These restrictions don't necessarily have any relevance to
|
|
|
|
|
|
* instructions not executed by the FPU pipeline like extended math, control
|
|
|
|
|
|
* flow or send message instructions.
|
|
|
|
|
|
*
|
|
|
|
|
|
* For virtual opcodes it's really up to the instruction -- In some cases
|
|
|
|
|
|
* (e.g. where a virtual instruction unrolls into a simple sequence of FPU
|
|
|
|
|
|
* instructions) it may simplify virtual instruction lowering if we can
|
|
|
|
|
|
* enforce FPU-like regioning restrictions already on the virtual instruction,
|
|
|
|
|
|
* in other cases (e.g. virtual send-like instructions) this may be
|
|
|
|
|
|
* excessively restrictive.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static unsigned
|
2022-06-29 14:13:31 -07:00
|
|
|
|
get_fpu_lowered_simd_width(const struct brw_compiler *compiler,
|
2016-05-20 13:15:49 -07:00
|
|
|
|
const fs_inst *inst)
|
|
|
|
|
|
{
|
2022-06-29 14:13:31 -07:00
|
|
|
|
const struct intel_device_info *devinfo = compiler->devinfo;
|
|
|
|
|
|
|
2016-05-20 13:15:49 -07:00
|
|
|
|
/* Maximum execution size representable in the instruction controls. */
|
|
|
|
|
|
unsigned max_width = MIN2(32, inst->exec_size);
|
|
|
|
|
|
|
|
|
|
|
|
/* According to the PRMs:
|
|
|
|
|
|
* "A. In Direct Addressing mode, a source cannot span more than 2
|
|
|
|
|
|
* adjacent GRF registers.
|
|
|
|
|
|
* B. A destination cannot span more than 2 adjacent GRF registers."
|
|
|
|
|
|
*
|
|
|
|
|
|
* Look for the source or destination with the largest register region
|
|
|
|
|
|
* which is the one that is going to limit the overall execution size of
|
|
|
|
|
|
* the instruction due to this rule.
|
|
|
|
|
|
*/
|
2016-09-07 13:38:20 -07:00
|
|
|
|
unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
|
2016-05-20 13:15:49 -07:00
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < inst->sources; i++)
|
2016-09-07 17:00:07 -07:00
|
|
|
|
reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
|
2016-05-20 13:15:49 -07:00
|
|
|
|
|
|
|
|
|
|
/* Calculate the maximum execution size of the instruction based on the
|
|
|
|
|
|
* factor by which it goes over the hardware limit of 2 GRFs.
|
|
|
|
|
|
*/
|
2022-07-22 17:30:30 -07:00
|
|
|
|
const unsigned max_reg_count = 2 * reg_unit(devinfo);
|
|
|
|
|
|
if (reg_count > max_reg_count)
|
|
|
|
|
|
max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, max_reg_count));
|
2016-05-20 13:15:49 -07:00
|
|
|
|
|
|
|
|
|
|
/* According to the IVB PRMs:
|
|
|
|
|
|
* "When destination spans two registers, the source MUST span two
|
|
|
|
|
|
* registers. The exception to the above rule:
|
|
|
|
|
|
*
|
|
|
|
|
|
* - When source is scalar, the source registers are not incremented.
|
|
|
|
|
|
* - When source is packed integer Word and destination is packed
|
|
|
|
|
|
* integer DWord, the source register is not incremented but the
|
|
|
|
|
|
* source sub register is incremented."
|
|
|
|
|
|
*
|
2021-03-29 15:46:12 -07:00
|
|
|
|
* The hardware specs from Gfx4 to Gfx7.5 mention similar regioning
|
2016-05-20 13:15:49 -07:00
|
|
|
|
* restrictions. The code below intentionally doesn't check whether the
|
|
|
|
|
|
* destination type is integer because empirically the hardware doesn't
|
|
|
|
|
|
* seem to care what the actual type is as long as it's dword-aligned.
|
2023-09-25 19:16:50 +03:00
|
|
|
|
*
|
|
|
|
|
|
* HSW PRMs also add a note to the second exception:
|
|
|
|
|
|
* "When lower 8 channels are disabled, the sub register of source1
|
|
|
|
|
|
* operand is not incremented. If the lower 8 channels are expected
|
|
|
|
|
|
* to be disabled, say by predication, the instruction must be split
|
|
|
|
|
|
* into pair of simd8 operations."
|
|
|
|
|
|
*
|
|
|
|
|
|
* We can't reliably know if the channels won't be disabled due to,
|
|
|
|
|
|
* for example, IMASK. So, play it safe and disallow packed-word exception
|
|
|
|
|
|
* for src1.
|
2016-05-20 13:15:49 -07:00
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver < 8) {
|
2016-05-20 13:15:49 -07:00
|
|
|
|
for (unsigned i = 0; i < inst->sources; i++) {
|
2017-01-11 08:17:57 +01:00
|
|
|
|
/* IVB implements DF scalars as <0;2,1> regions. */
|
|
|
|
|
|
const bool is_scalar_exception = is_uniform(inst->src[i]) &&
|
2021-09-22 15:06:58 +03:00
|
|
|
|
(devinfo->platform == INTEL_PLATFORM_HSW || type_sz(inst->src[i].type) != 8);
|
2023-09-25 19:16:50 +03:00
|
|
|
|
const bool is_packed_word_exception = i != 1 &&
|
2017-01-11 08:17:57 +01:00
|
|
|
|
type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
|
|
|
|
|
|
type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;
|
|
|
|
|
|
|
2017-01-13 17:04:23 -08:00
|
|
|
|
/* We check size_read(i) against size_written instead of REG_SIZE
|
|
|
|
|
|
* because we want to properly handle SIMD32. In SIMD32, you can end
|
|
|
|
|
|
* up with writes to 4 registers and a source that reads 2 registers
|
|
|
|
|
|
* and we may still need to lower all the way to SIMD8 in that case.
|
|
|
|
|
|
*/
|
2016-09-07 13:32:25 -07:00
|
|
|
|
if (inst->size_written > REG_SIZE &&
|
2017-01-13 17:04:23 -08:00
|
|
|
|
inst->size_read(i) != 0 &&
|
|
|
|
|
|
inst->size_read(i) < inst->size_written &&
|
2017-01-11 08:17:57 +01:00
|
|
|
|
!is_scalar_exception && !is_packed_word_exception) {
|
2016-09-07 13:38:20 -07:00
|
|
|
|
const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
|
|
|
|
|
|
max_width = MIN2(max_width, inst->exec_size / reg_count);
|
|
|
|
|
|
}
|
2016-05-20 13:15:49 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver < 6) {
|
intel: Fix SIMD16 unaligned payload GRF reads on Gen4-5.
When the SIMD16 Gen4-5 fragment shader payload contains source depth
(g2-3), destination stencil (g4), and destination depth (g5-6), the
single register of stencil makes the destination depth unaligned.
We were generating this instruction in the RT write payload setup:
mov(16) m14<1>F g5<8,8,1>F { align1 compr };
which is illegal, instructions with a source region spanning more than
one register need to be aligned to even registers. This is because the
hardware implicitly does (nr | 1) instead of (nr + 1) when splitting the
compressed instruction into two mov(8)'s.
I believe this would cause the hardware to load g5 twice, replicating
subspan 0-1's destination depth to subspan 2-3. This showed up as 2x2
artifact blocks in both TIS-100 and Reicast.
Normally, we rely on the register allocator to even-align our virtual
GRFs. But we don't control the payload, so we need to lower SIMD widths
to make it work. To fix this, we teach lower_simd_width about the
restriction, and then call it again after lower_load_payload (which is
what generates the offending MOV).
Fixes: 8aee87fe4cce0a883867df3546db0e0a36908086 (i965: Use SIMD16 instead of SIMD8 on Gen4 when possible.)
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107212
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=13728
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Tested-by: Diego Viola <diego.viola@gmail.com>
2018-08-02 15:02:18 -07:00
|
|
|
|
/* From the G45 PRM, Volume 4 Page 361:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "Operand Alignment Rule: With the exceptions listed below, a
|
|
|
|
|
|
* source/destination operand in general should be aligned to even
|
|
|
|
|
|
* 256-bit physical register with a region size equal to two 256-bit
|
|
|
|
|
|
* physical registers."
|
|
|
|
|
|
*
|
|
|
|
|
|
* Normally we enforce this by allocating virtual registers to the
|
|
|
|
|
|
* even-aligned class. But we need to handle payload registers.
|
|
|
|
|
|
*/
|
|
|
|
|
|
for (unsigned i = 0; i < inst->sources; i++) {
|
|
|
|
|
|
if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
|
|
|
|
|
|
inst->size_read(i) > REG_SIZE) {
|
|
|
|
|
|
max_width = MIN2(max_width, 8);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-20 13:15:49 -07:00
|
|
|
|
/* From the IVB PRMs:
|
|
|
|
|
|
* "When an instruction is SIMD32, the low 16 bits of the execution mask
|
|
|
|
|
|
* are applied for both halves of the SIMD32 instruction. If different
|
|
|
|
|
|
* execution mask channels are required, split the instruction into two
|
|
|
|
|
|
* SIMD16 instructions."
|
|
|
|
|
|
*
|
2021-03-29 15:46:12 -07:00
|
|
|
|
* There is similar text in the HSW PRMs. Gfx4-6 don't even implement
|
2016-05-20 13:15:49 -07:00
|
|
|
|
* 32-wide control flow support in hardware and will behave similarly.
|
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver < 8 && !inst->force_writemask_all)
|
2016-05-20 13:15:49 -07:00
|
|
|
|
max_width = MIN2(max_width, 16);
|
|
|
|
|
|
|
|
|
|
|
|
/* From the IVB PRMs (applies to HSW too):
|
|
|
|
|
|
* "Instructions with condition modifiers must not use SIMD32."
|
|
|
|
|
|
*
|
|
|
|
|
|
* From the BDW PRMs (applies to later hardware too):
|
|
|
|
|
|
* "Ternary instruction with condition modifiers must not use SIMD32."
|
|
|
|
|
|
*/
|
2022-07-22 17:30:30 -07:00
|
|
|
|
if (inst->conditional_mod && (devinfo->ver < 8 ||
|
|
|
|
|
|
(inst->is_3src(compiler) && devinfo->ver < 12)))
|
2016-05-20 13:15:49 -07:00
|
|
|
|
max_width = MIN2(max_width, 16);
|
|
|
|
|
|
|
|
|
|
|
|
/* From the IVB PRMs (applies to other devices that don't have the
|
2021-04-05 13:19:39 -07:00
|
|
|
|
* intel_device_info::supports_simd16_3src flag set):
|
2016-05-20 13:15:49 -07:00
|
|
|
|
* "In Align16 access mode, SIMD16 is not allowed for DW operations and
|
|
|
|
|
|
* SIMD8 is not allowed for DF operations."
|
|
|
|
|
|
*/
|
2022-06-29 14:13:31 -07:00
|
|
|
|
if (inst->is_3src(compiler) && !devinfo->supports_simd16_3src)
|
2016-05-20 13:15:49 -07:00
|
|
|
|
max_width = MIN2(max_width, inst->exec_size / reg_count);
|
|
|
|
|
|
|
2021-03-29 15:46:12 -07:00
|
|
|
|
/* Pre-Gfx8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
|
2016-03-30 14:00:31 +02:00
|
|
|
|
* the 8-bit quarter of the execution mask signals specified in the
|
|
|
|
|
|
* instruction control fields) for the second compressed half of any
|
|
|
|
|
|
* single-precision instruction (for double-precision instructions
|
|
|
|
|
|
* it's hardwired to use NibCtrl+1, at least on HSW), which means that
|
|
|
|
|
|
* the EU will apply the wrong execution controls for the second
|
|
|
|
|
|
* sequential GRF write if the number of channels per GRF is not exactly
|
|
|
|
|
|
* eight in single-precision mode (or four in double-float mode).
|
|
|
|
|
|
*
|
|
|
|
|
|
* In this situation we calculate the maximum size of the split
|
|
|
|
|
|
* instructions so they only ever write to a single register.
|
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver < 8 && inst->size_written > REG_SIZE &&
|
2016-03-30 14:00:31 +02:00
|
|
|
|
!inst->force_writemask_all) {
|
2016-09-07 13:38:20 -07:00
|
|
|
|
const unsigned channels_per_grf = inst->exec_size /
|
|
|
|
|
|
DIV_ROUND_UP(inst->size_written, REG_SIZE);
|
2016-07-18 07:17:39 +00:00
|
|
|
|
const unsigned exec_type_size = get_exec_type_size(inst);
|
2016-03-30 14:00:31 +02:00
|
|
|
|
assert(exec_type_size);
|
|
|
|
|
|
|
|
|
|
|
|
/* The hardware shifts exactly 8 channels per compressed half of the
|
|
|
|
|
|
* instruction in single-precision mode and exactly 4 in double-precision.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
|
|
|
|
|
|
max_width = MIN2(max_width, channels_per_grf);
|
2016-08-25 16:05:24 +02:00
|
|
|
|
|
|
|
|
|
|
/* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
|
|
|
|
|
|
* because HW applies the same channel enable signals to both halves of
|
|
|
|
|
|
* the compressed instruction which will be just wrong under
|
|
|
|
|
|
* non-uniform control flow.
|
|
|
|
|
|
*/
|
2021-05-14 18:04:46 +02:00
|
|
|
|
if (devinfo->verx10 == 70 &&
|
2016-08-25 16:05:24 +02:00
|
|
|
|
(exec_type_size == 8 || type_sz(inst->dst.type) == 8))
|
|
|
|
|
|
max_width = MIN2(max_width, 4);
|
2016-03-30 14:00:31 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
2019-03-14 10:35:58 +01:00
|
|
|
|
/* From the SKL PRM, Special Restrictions for Handling Mixed Mode
|
|
|
|
|
|
* Float Operations:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "No SIMD16 in mixed mode when destination is f32. Instruction
|
|
|
|
|
|
* execution size must be no more than 8."
|
|
|
|
|
|
*
|
|
|
|
|
|
* FIXME: the simulator doesn't seem to complain if we don't do this and
|
|
|
|
|
|
* empirical testing with existing CTS tests show that they pass just fine
|
|
|
|
|
|
* without implementing this, however, since our interpretation of the PRM
|
|
|
|
|
|
* is that conversion MOVs between HF and F are still mixed-float
|
|
|
|
|
|
* instructions (and therefore subject to this restriction) we decided to
|
|
|
|
|
|
* split them to be safe. Might be useful to do additional investigation to
|
|
|
|
|
|
* lift the restriction if we can ensure that it is safe though, since these
|
|
|
|
|
|
* conversions are common when half-float types are involved since many
|
|
|
|
|
|
* instructions do not support HF types and conversions from/to F are
|
|
|
|
|
|
* required.
|
|
|
|
|
|
*/
|
2022-07-22 17:30:30 -07:00
|
|
|
|
if (is_mixed_float_with_fp32_dst(inst) && devinfo->ver < 20)
|
2019-03-14 10:35:58 +01:00
|
|
|
|
max_width = MIN2(max_width, 8);
|
|
|
|
|
|
|
|
|
|
|
|
/* From the SKL PRM, Special Restrictions for Handling Mixed Mode
|
|
|
|
|
|
* Float Operations:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "No SIMD16 in mixed mode when destination is packed f16 for both
|
|
|
|
|
|
* Align1 and Align16."
|
|
|
|
|
|
*/
|
2022-07-22 17:30:30 -07:00
|
|
|
|
if (is_mixed_float_with_packed_fp16_dst(inst) && devinfo->ver < 20)
|
2019-03-14 10:35:58 +01:00
|
|
|
|
max_width = MIN2(max_width, 8);
|
|
|
|
|
|
|
2016-05-20 13:15:49 -07:00
|
|
|
|
/* Only power-of-two execution sizes are representable in the instruction
|
|
|
|
|
|
* control fields.
|
|
|
|
|
|
*/
|
2019-12-06 09:20:09 -08:00
|
|
|
|
return 1 << util_logbase2(max_width);
|
2016-05-20 13:15:49 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2016-08-12 14:05:19 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Get the maximum allowed SIMD width for instruction \p inst accounting for
|
|
|
|
|
|
* various payload size restrictions that apply to sampler message
|
|
|
|
|
|
* instructions.
|
|
|
|
|
|
*
|
|
|
|
|
|
* This is only intended to provide a maximum theoretical bound for the
|
|
|
|
|
|
* execution size of the message based on the number of argument components
|
|
|
|
|
|
* alone, which in most cases will determine whether the SIMD8 or SIMD16
|
|
|
|
|
|
* variant of the message can be used, though some messages may have
|
|
|
|
|
|
* additional restrictions not accounted for here (e.g. pre-ILK hardware uses
|
|
|
|
|
|
* the message length to determine the exact SIMD width and argument count,
|
|
|
|
|
|
* which makes a number of sampler message combinations impossible to
|
|
|
|
|
|
* represent).
|
|
|
|
|
|
*/
|
|
|
|
|
|
static unsigned
|
2021-04-05 13:19:39 -07:00
|
|
|
|
get_sampler_lowered_simd_width(const struct intel_device_info *devinfo,
|
2016-08-12 14:05:19 -07:00
|
|
|
|
const fs_inst *inst)
|
|
|
|
|
|
{
|
2018-10-11 15:57:50 -05:00
|
|
|
|
/* If we have a min_lod parameter on anything other than a simple sample
|
|
|
|
|
|
* message, it will push it over 5 arguments and we have to fall back to
|
|
|
|
|
|
* SIMD8.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (inst->opcode != SHADER_OPCODE_TEX &&
|
|
|
|
|
|
inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
|
|
|
|
|
|
return 8;
|
|
|
|
|
|
|
2016-08-12 14:05:19 -07:00
|
|
|
|
/* Calculate the number of coordinate components that have to be present
|
|
|
|
|
|
* assuming that additional arguments follow the texel coordinates in the
|
|
|
|
|
|
* message payload. On IVB+ there is no need for padding, on ILK-SNB we
|
|
|
|
|
|
* need to pad to four or three components depending on the message,
|
|
|
|
|
|
* pre-ILK we need to pad to at most three components.
|
|
|
|
|
|
*/
|
|
|
|
|
|
const unsigned req_coord_components =
|
2021-03-29 14:41:58 -07:00
|
|
|
|
(devinfo->ver >= 7 ||
|
2016-08-12 14:05:19 -07:00
|
|
|
|
!inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
|
2021-03-29 14:41:58 -07:00
|
|
|
|
(devinfo->ver >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
|
2016-08-12 14:05:19 -07:00
|
|
|
|
inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
|
|
|
|
|
|
3;
|
|
|
|
|
|
|
2021-03-29 15:46:12 -07:00
|
|
|
|
/* On Gfx9+ the LOD argument is for free if we're able to use the LZ
|
2016-08-12 14:05:19 -07:00
|
|
|
|
* variant of the TXL or TXF message.
|
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
const bool implicit_lod = devinfo->ver >= 9 &&
|
2016-08-12 14:05:19 -07:00
|
|
|
|
(inst->opcode == SHADER_OPCODE_TXL ||
|
|
|
|
|
|
inst->opcode == SHADER_OPCODE_TXF) &&
|
|
|
|
|
|
inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
|
|
|
|
|
|
|
|
|
|
|
|
/* Calculate the total number of argument components that need to be passed
|
|
|
|
|
|
* to the sampler unit.
|
|
|
|
|
|
*/
|
|
|
|
|
|
const unsigned num_payload_components =
|
|
|
|
|
|
MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
|
|
|
|
|
|
req_coord_components) +
|
|
|
|
|
|
inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
|
|
|
|
|
|
(implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
|
|
|
|
|
|
inst->components_read(TEX_LOGICAL_SRC_LOD2) +
|
|
|
|
|
|
inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
|
|
|
|
|
|
(inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
|
2016-11-28 18:13:02 -08:00
|
|
|
|
inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
|
2016-08-12 14:05:19 -07:00
|
|
|
|
inst->components_read(TEX_LOGICAL_SRC_MCS);
|
|
|
|
|
|
|
|
|
|
|
|
/* SIMD16 messages with more than five arguments exceed the maximum message
|
|
|
|
|
|
* size supported by the sampler, regardless of whether a header is
|
|
|
|
|
|
* provided or not.
|
|
|
|
|
|
*/
|
|
|
|
|
|
return MIN2(inst->exec_size,
|
|
|
|
|
|
num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-07-13 21:15:31 +03:00
|
|
|
|
/**
|
|
|
|
|
|
* Get the closest native SIMD width supported by the hardware for instruction
|
|
|
|
|
|
* \p inst. The instruction will be left untouched by
|
|
|
|
|
|
* fs_visitor::lower_simd_width() if the returned value is equal to the
|
|
|
|
|
|
* original execution size.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static unsigned
|
2022-06-29 14:13:31 -07:00
|
|
|
|
get_lowered_simd_width(const struct brw_compiler *compiler,
|
2015-07-13 21:15:31 +03:00
|
|
|
|
const fs_inst *inst)
|
|
|
|
|
|
{
|
2022-06-29 14:13:31 -07:00
|
|
|
|
const struct intel_device_info *devinfo = compiler->devinfo;
|
|
|
|
|
|
|
2015-07-13 21:15:31 +03:00
|
|
|
|
switch (inst->opcode) {
|
2023-10-04 17:41:08 -07:00
|
|
|
|
case BRW_OPCODE_DP4A:
|
2015-08-04 19:07:19 +03:00
|
|
|
|
case BRW_OPCODE_MOV:
|
|
|
|
|
|
case BRW_OPCODE_SEL:
|
|
|
|
|
|
case BRW_OPCODE_NOT:
|
|
|
|
|
|
case BRW_OPCODE_AND:
|
|
|
|
|
|
case BRW_OPCODE_OR:
|
|
|
|
|
|
case BRW_OPCODE_XOR:
|
|
|
|
|
|
case BRW_OPCODE_SHR:
|
|
|
|
|
|
case BRW_OPCODE_SHL:
|
|
|
|
|
|
case BRW_OPCODE_ASR:
|
2019-11-24 16:12:12 -08:00
|
|
|
|
case BRW_OPCODE_ROR:
|
|
|
|
|
|
case BRW_OPCODE_ROL:
|
2015-08-04 19:07:19 +03:00
|
|
|
|
case BRW_OPCODE_CMPN:
|
|
|
|
|
|
case BRW_OPCODE_CSEL:
|
|
|
|
|
|
case BRW_OPCODE_F32TO16:
|
|
|
|
|
|
case BRW_OPCODE_F16TO32:
|
|
|
|
|
|
case BRW_OPCODE_BFREV:
|
|
|
|
|
|
case BRW_OPCODE_BFE:
|
|
|
|
|
|
case BRW_OPCODE_ADD:
|
|
|
|
|
|
case BRW_OPCODE_MUL:
|
|
|
|
|
|
case BRW_OPCODE_AVG:
|
|
|
|
|
|
case BRW_OPCODE_FRC:
|
|
|
|
|
|
case BRW_OPCODE_RNDU:
|
|
|
|
|
|
case BRW_OPCODE_RNDD:
|
|
|
|
|
|
case BRW_OPCODE_RNDE:
|
|
|
|
|
|
case BRW_OPCODE_RNDZ:
|
|
|
|
|
|
case BRW_OPCODE_LZD:
|
|
|
|
|
|
case BRW_OPCODE_FBH:
|
|
|
|
|
|
case BRW_OPCODE_FBL:
|
|
|
|
|
|
case BRW_OPCODE_CBIT:
|
|
|
|
|
|
case BRW_OPCODE_SAD2:
|
|
|
|
|
|
case BRW_OPCODE_MAD:
|
|
|
|
|
|
case BRW_OPCODE_LRP:
|
2021-07-21 15:24:29 -07:00
|
|
|
|
case BRW_OPCODE_ADD3:
|
2016-05-20 13:15:49 -07:00
|
|
|
|
case FS_OPCODE_PACK:
|
2017-08-31 21:45:30 -07:00
|
|
|
|
case SHADER_OPCODE_SEL_EXEC:
|
|
|
|
|
|
case SHADER_OPCODE_CLUSTER_BROADCAST:
|
2020-08-08 13:56:16 -05:00
|
|
|
|
case SHADER_OPCODE_MOV_RELOC_IMM:
|
2022-06-29 14:13:31 -07:00
|
|
|
|
return get_fpu_lowered_simd_width(compiler, inst);
|
2016-05-18 01:26:03 -07:00
|
|
|
|
|
2016-05-17 15:58:04 -07:00
|
|
|
|
case BRW_OPCODE_CMP: {
|
|
|
|
|
|
/* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
|
|
|
|
|
|
* when the destination is a GRF the dependency-clear bit on the flag
|
|
|
|
|
|
* register is cleared early.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Suggested workarounds are to disable coissuing CMP instructions
|
|
|
|
|
|
* or to split CMP(16) instructions into two CMP(8) instructions.
|
|
|
|
|
|
*
|
|
|
|
|
|
* We choose to split into CMP(8) instructions since disabling
|
|
|
|
|
|
* coissuing would affect CMP instructions not otherwise affected by
|
|
|
|
|
|
* the errata.
|
|
|
|
|
|
*/
|
2021-05-14 18:04:46 +02:00
|
|
|
|
const unsigned max_width = (devinfo->verx10 == 70 &&
|
2016-05-17 15:58:04 -07:00
|
|
|
|
!inst->dst.is_null() ? 8 : ~0);
|
2022-06-29 14:13:31 -07:00
|
|
|
|
return MIN2(max_width, get_fpu_lowered_simd_width(compiler, inst));
|
2016-05-17 15:58:04 -07:00
|
|
|
|
}
|
2016-05-17 16:00:19 -07:00
|
|
|
|
case BRW_OPCODE_BFI1:
|
|
|
|
|
|
case BRW_OPCODE_BFI2:
|
|
|
|
|
|
/* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
|
|
|
|
|
|
* should
|
|
|
|
|
|
* "Force BFI instructions to be executed always in SIMD8."
|
|
|
|
|
|
*/
|
2021-09-22 15:06:58 +03:00
|
|
|
|
return MIN2(devinfo->platform == INTEL_PLATFORM_HSW ? 8 : ~0u,
|
2022-06-29 14:13:31 -07:00
|
|
|
|
get_fpu_lowered_simd_width(compiler, inst));
|
2016-05-17 15:58:04 -07:00
|
|
|
|
|
2016-05-17 16:01:29 -07:00
|
|
|
|
case BRW_OPCODE_IF:
|
|
|
|
|
|
assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
|
|
|
|
|
|
return inst->exec_size;
|
|
|
|
|
|
|
2016-05-20 13:14:20 -07:00
|
|
|
|
case SHADER_OPCODE_RCP:
|
|
|
|
|
|
case SHADER_OPCODE_RSQ:
|
|
|
|
|
|
case SHADER_OPCODE_SQRT:
|
|
|
|
|
|
case SHADER_OPCODE_EXP2:
|
|
|
|
|
|
case SHADER_OPCODE_LOG2:
|
|
|
|
|
|
case SHADER_OPCODE_SIN:
|
2018-04-26 10:26:22 +02:00
|
|
|
|
case SHADER_OPCODE_COS: {
|
2021-03-29 15:46:12 -07:00
|
|
|
|
/* Unary extended math instructions are limited to SIMD8 on Gfx4 and
|
|
|
|
|
|
* Gfx6. Extended Math Function is limited to SIMD8 with half-float.
|
2016-05-20 13:14:20 -07:00
|
|
|
|
*/
|
2021-09-22 15:06:58 +03:00
|
|
|
|
if (devinfo->ver == 6 || devinfo->verx10 == 40)
|
2018-04-26 10:26:22 +02:00
|
|
|
|
return MIN2(8, inst->exec_size);
|
|
|
|
|
|
if (inst->dst.type == BRW_REGISTER_TYPE_HF)
|
|
|
|
|
|
return MIN2(8, inst->exec_size);
|
|
|
|
|
|
return MIN2(16, inst->exec_size);
|
|
|
|
|
|
}
|
2016-05-20 13:14:20 -07:00
|
|
|
|
|
2018-04-26 10:26:22 +02:00
|
|
|
|
case SHADER_OPCODE_POW: {
|
2021-03-29 15:46:12 -07:00
|
|
|
|
/* SIMD16 is only allowed on Gfx7+. Extended Math Function is limited
|
2018-04-26 10:26:22 +02:00
|
|
|
|
* to SIMD8 with half-float
|
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver < 7)
|
2018-04-26 10:26:22 +02:00
|
|
|
|
return MIN2(8, inst->exec_size);
|
|
|
|
|
|
if (inst->dst.type == BRW_REGISTER_TYPE_HF)
|
|
|
|
|
|
return MIN2(8, inst->exec_size);
|
|
|
|
|
|
return MIN2(16, inst->exec_size);
|
|
|
|
|
|
}
|
2016-05-20 13:14:20 -07:00
|
|
|
|
|
2018-09-19 01:28:06 -07:00
|
|
|
|
case SHADER_OPCODE_USUB_SAT:
|
|
|
|
|
|
case SHADER_OPCODE_ISUB_SAT:
|
2022-06-29 14:13:31 -07:00
|
|
|
|
return get_fpu_lowered_simd_width(compiler, inst);
|
2018-09-19 01:28:06 -07:00
|
|
|
|
|
2016-05-20 13:14:20 -07:00
|
|
|
|
case SHADER_OPCODE_INT_QUOTIENT:
|
|
|
|
|
|
case SHADER_OPCODE_INT_REMAINDER:
|
|
|
|
|
|
/* Integer division is limited to SIMD8 on all generations. */
|
|
|
|
|
|
return MIN2(8, inst->exec_size);
|
|
|
|
|
|
|
2016-05-20 13:34:46 -07:00
|
|
|
|
case FS_OPCODE_LINTERP:
|
|
|
|
|
|
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
|
|
|
|
|
|
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
|
|
|
|
|
|
return MIN2(16, inst->exec_size);
|
|
|
|
|
|
|
2016-05-18 01:26:03 -07:00
|
|
|
|
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
|
|
|
|
|
|
/* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
|
|
|
|
|
|
* message used to implement varying pull constant loads, so expand it
|
|
|
|
|
|
* to SIMD16. An alternative with longer message payload length but
|
|
|
|
|
|
* shorter return payload would be to use the SIMD8 sampler message that
|
|
|
|
|
|
* takes (header, u, v, r) as parameters instead of (header, u).
|
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
return (devinfo->ver == 4 ? 16 : MIN2(16, inst->exec_size));
|
2016-05-18 01:26:03 -07:00
|
|
|
|
|
2019-07-25 18:28:44 -05:00
|
|
|
|
case FS_OPCODE_DDX_COARSE:
|
|
|
|
|
|
case FS_OPCODE_DDX_FINE:
|
|
|
|
|
|
case FS_OPCODE_DDY_COARSE:
|
2016-05-17 16:27:09 -07:00
|
|
|
|
case FS_OPCODE_DDY_FINE:
|
|
|
|
|
|
/* The implementation of this virtual opcode may require emitting
|
|
|
|
|
|
* compressed Align16 instructions, which are severely limited on some
|
|
|
|
|
|
* generations.
|
|
|
|
|
|
*
|
|
|
|
|
|
* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
|
|
|
|
|
|
* Region Restrictions):
|
|
|
|
|
|
*
|
|
|
|
|
|
* "In Align16 access mode, SIMD16 is not allowed for DW operations
|
|
|
|
|
|
* and SIMD8 is not allowed for DF operations."
|
|
|
|
|
|
*
|
|
|
|
|
|
* In this context, "DW operations" means "operations acting on 32-bit
|
|
|
|
|
|
* values", so it includes operations on floats.
|
|
|
|
|
|
*
|
2021-03-29 15:46:12 -07:00
|
|
|
|
* Gfx4 has a similar restriction. From the i965 PRM, section 11.5.3
|
2016-05-17 16:27:09 -07:00
|
|
|
|
* (Instruction Compression -> Rules and Restrictions):
|
|
|
|
|
|
*
|
|
|
|
|
|
* "A compressed instruction must be in Align1 access mode. Align16
|
|
|
|
|
|
* mode instructions cannot be compressed."
|
|
|
|
|
|
*
|
|
|
|
|
|
* Similar text exists in the g45 PRM.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Empirically, compressed align16 instructions using odd register
|
|
|
|
|
|
* numbers don't appear to work on Sandybridge either.
|
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
return (devinfo->ver == 4 || devinfo->ver == 6 ||
|
2021-05-14 18:04:46 +02:00
|
|
|
|
(devinfo->verx10 == 70) ?
|
2016-05-17 16:27:09 -07:00
|
|
|
|
MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
|
|
|
|
|
|
|
2015-08-06 14:04:00 +03:00
|
|
|
|
case SHADER_OPCODE_MULH:
|
|
|
|
|
|
/* MULH is lowered to the MUL/MACH sequence using the accumulator, which
|
2021-03-29 15:46:12 -07:00
|
|
|
|
* is 8-wide on Gfx7+.
|
2015-08-06 14:04:00 +03:00
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
return (devinfo->ver >= 7 ? 8 :
|
2022-06-29 14:13:31 -07:00
|
|
|
|
get_fpu_lowered_simd_width(compiler, inst));
|
2015-08-06 14:04:00 +03:00
|
|
|
|
|
2015-07-13 21:19:28 +03:00
|
|
|
|
case FS_OPCODE_FB_WRITE_LOGICAL:
|
2021-03-29 15:46:12 -07:00
|
|
|
|
/* Gfx6 doesn't support SIMD16 depth writes but we cannot handle them
|
2015-07-13 21:19:28 +03:00
|
|
|
|
* here.
|
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
assert(devinfo->ver != 6 ||
|
2015-10-20 14:29:37 -07:00
|
|
|
|
inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
|
2015-07-13 21:19:28 +03:00
|
|
|
|
inst->exec_size == 8);
|
|
|
|
|
|
/* Dual-source FB writes are unsupported in SIMD16 mode. */
|
2015-10-20 14:29:37 -07:00
|
|
|
|
return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
|
2016-05-20 13:34:46 -07:00
|
|
|
|
8 : MIN2(16, inst->exec_size));
|
2015-07-13 21:19:28 +03:00
|
|
|
|
|
2016-07-21 16:55:45 -07:00
|
|
|
|
case FS_OPCODE_FB_READ_LOGICAL:
|
|
|
|
|
|
return MIN2(16, inst->exec_size);
|
|
|
|
|
|
|
2016-05-20 13:34:46 -07:00
|
|
|
|
case SHADER_OPCODE_TEX_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_UMS_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_MCS_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_LOD_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TG4_LOGICAL:
|
2016-05-20 00:37:37 -07:00
|
|
|
|
case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
|
2016-08-12 14:05:19 -07:00
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
|
|
|
|
|
|
return get_sampler_lowered_simd_width(devinfo, inst);
|
2016-05-20 00:37:37 -07:00
|
|
|
|
|
2020-07-07 23:54:00 -07:00
|
|
|
|
/* On gfx12 parameters are fixed to 16-bit values and therefore they all
|
|
|
|
|
|
* always fit regardless of the execution size.
|
|
|
|
|
|
*/
|
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_GFX12_LOGICAL:
|
|
|
|
|
|
return MIN2(16, inst->exec_size);
|
|
|
|
|
|
|
2015-07-13 21:19:52 +03:00
|
|
|
|
case SHADER_OPCODE_TXD_LOGICAL:
|
|
|
|
|
|
/* TXD is unsupported in SIMD16 mode. */
|
|
|
|
|
|
return 8;
|
|
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_TXL_LOGICAL:
|
2016-08-12 14:05:19 -07:00
|
|
|
|
case FS_OPCODE_TXB_LOGICAL:
|
|
|
|
|
|
/* Only one execution size is representable pre-ILK depending on whether
|
|
|
|
|
|
* the shadow reference argument is present.
|
2015-07-13 21:19:52 +03:00
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver == 4)
|
2016-08-12 14:05:19 -07:00
|
|
|
|
return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
|
2015-07-13 21:19:52 +03:00
|
|
|
|
else
|
2016-08-12 14:05:19 -07:00
|
|
|
|
return get_sampler_lowered_simd_width(devinfo, inst);
|
|
|
|
|
|
|
2015-07-13 21:19:52 +03:00
|
|
|
|
case SHADER_OPCODE_TXF_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXS_LOGICAL:
|
2021-03-29 15:46:12 -07:00
|
|
|
|
/* Gfx4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
|
2015-07-13 21:19:52 +03:00
|
|
|
|
* messages. Use SIMD16 instead.
|
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver == 4)
|
2015-07-13 21:19:52 +03:00
|
|
|
|
return 16;
|
|
|
|
|
|
else
|
2016-08-12 14:05:19 -07:00
|
|
|
|
return get_sampler_lowered_simd_width(devinfo, inst);
|
2015-09-08 15:52:09 +01:00
|
|
|
|
|
2015-07-18 16:16:19 +03:00
|
|
|
|
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
|
|
|
|
|
|
return 8;
|
|
|
|
|
|
|
2016-05-20 13:34:46 -07:00
|
|
|
|
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
|
2017-07-01 08:16:01 +02:00
|
|
|
|
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
|
2017-07-01 08:19:17 +02:00
|
|
|
|
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
|
2015-04-08 02:41:33 -07:00
|
|
|
|
case SHADER_OPCODE_DWORD_SCATTERED_WRITE_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_DWORD_SCATTERED_READ_LOGICAL:
|
2016-05-20 13:34:46 -07:00
|
|
|
|
return MIN2(16, inst->exec_size);
|
|
|
|
|
|
|
2018-11-14 17:13:57 -06:00
|
|
|
|
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
|
2021-03-29 14:41:58 -07:00
|
|
|
|
return devinfo->ver <= 8 ? 8 : MIN2(16, inst->exec_size);
|
2018-11-14 17:13:57 -06:00
|
|
|
|
|
2020-10-05 14:43:41 -07:00
|
|
|
|
case SHADER_OPCODE_A64_OWORD_BLOCK_READ_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_A64_UNALIGNED_OWORD_BLOCK_READ_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_A64_OWORD_BLOCK_WRITE_LOGICAL:
|
|
|
|
|
|
assert(inst->exec_size <= 16);
|
|
|
|
|
|
return inst->exec_size;
|
|
|
|
|
|
|
2018-11-26 15:15:04 -06:00
|
|
|
|
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
|
2022-08-01 18:12:45 +03:00
|
|
|
|
return devinfo->has_lsc ? MIN2(16, inst->exec_size) : 8;
|
2018-11-26 15:15:04 -06:00
|
|
|
|
|
2022-06-27 15:22:03 -07:00
|
|
|
|
case SHADER_OPCODE_URB_READ_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_URB_WRITE_LOGICAL:
|
2022-07-20 10:21:21 -07:00
|
|
|
|
return MIN2(devinfo->ver < 20 ? 8 : 16, inst->exec_size);
|
2016-05-20 13:34:46 -07:00
|
|
|
|
|
2018-12-06 14:11:34 -08:00
|
|
|
|
case SHADER_OPCODE_QUAD_SWIZZLE: {
|
|
|
|
|
|
const unsigned swiz = inst->src[1].ud;
|
|
|
|
|
|
return (is_uniform(inst->src[0]) ?
|
2022-06-29 14:13:31 -07:00
|
|
|
|
get_fpu_lowered_simd_width(compiler, inst) :
|
2021-03-29 14:41:58 -07:00
|
|
|
|
devinfo->ver < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
|
2018-12-06 14:11:34 -08:00
|
|
|
|
swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
|
2022-06-29 14:13:31 -07:00
|
|
|
|
get_fpu_lowered_simd_width(compiler, inst));
|
2018-12-06 14:11:34 -08:00
|
|
|
|
}
|
2016-08-03 11:51:44 +00:00
|
|
|
|
case SHADER_OPCODE_MOV_INDIRECT: {
|
|
|
|
|
|
/* From IVB and HSW PRMs:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "2.When the destination requires two registers and the sources are
|
|
|
|
|
|
* indirect, the sources must use 1x1 regioning mode.
|
|
|
|
|
|
*
|
|
|
|
|
|
* In case of DF instructions in HSW/IVB, the exec_size is limited by
|
|
|
|
|
|
* the EU decompression logic not handling VxH indirect addressing
|
|
|
|
|
|
* correctly.
|
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
const unsigned max_size = (devinfo->ver >= 8 ? 2 : 1) * REG_SIZE;
|
2016-08-03 11:51:44 +00:00
|
|
|
|
/* Prior to Broadwell, we only have 8 address subregisters. */
|
2021-03-29 14:41:58 -07:00
|
|
|
|
return MIN3(devinfo->ver >= 8 ? 16 : 8,
|
2016-08-03 11:51:44 +00:00
|
|
|
|
max_size / (inst->dst.stride * type_sz(inst->dst.type)),
|
2016-05-17 16:10:38 -07:00
|
|
|
|
inst->exec_size);
|
2016-08-03 11:51:44 +00:00
|
|
|
|
}
|
2015-11-24 09:01:11 -08:00
|
|
|
|
|
2016-05-19 23:44:23 -07:00
|
|
|
|
case SHADER_OPCODE_LOAD_PAYLOAD: {
|
|
|
|
|
|
const unsigned reg_count =
|
|
|
|
|
|
DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
|
|
|
|
|
|
|
|
|
|
|
|
if (reg_count > 2) {
|
|
|
|
|
|
/* Only LOAD_PAYLOAD instructions with per-channel destination region
|
|
|
|
|
|
* can be easily lowered (which excludes headers and heterogeneous
|
|
|
|
|
|
* types).
|
|
|
|
|
|
*/
|
|
|
|
|
|
assert(!inst->header_size);
|
|
|
|
|
|
for (unsigned i = 0; i < inst->sources; i++)
|
|
|
|
|
|
assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
|
|
|
|
|
|
inst->src[i].file == BAD_FILE);
|
|
|
|
|
|
|
|
|
|
|
|
return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
return inst->exec_size;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2015-07-13 21:15:31 +03:00
|
|
|
|
default:
|
|
|
|
|
|
return inst->exec_size;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-28 22:44:13 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Return true if splitting out the group of channels of instruction \p inst
|
|
|
|
|
|
* given by lbld.group() requires allocating a temporary for the i-th source
|
|
|
|
|
|
* of the lowered instruction.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static inline bool
|
|
|
|
|
|
needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
|
|
|
|
|
|
{
|
|
|
|
|
|
return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
|
|
|
|
|
|
(inst->components_read(i) == 1 &&
|
2017-09-06 18:33:38 -07:00
|
|
|
|
lbld.dispatch_width() <= inst->exec_size)) ||
|
intel/fs: sel.cond writes the flags on Gfx4 and Gfx5
On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented
using a separte cmpn and sel instruction. This lowering occurs in
fs_vistor::lower_minmax which is called very, very late... a long, long
time after the first calls to opt_cmod_propagation. As a result,
conditional modifiers can be incorrectly propagated across sel.cond on
those platforms.
No tests were affected by this change, and I find that quite shocking.
After just changing flags_written(), all of the atan tests started
failing on ILK. That required the change in cmod_propagatin (and the
addition of the prop_across_into_sel_gfx5 unit test).
Shader-db results for ILK and GM45 are below. I looked at a couple
before and after shaders... and every case that I looked at had
experienced incorrect cmod propagation. This affected a LOT of apps!
Euro Truck Simulator 2, The Talos Principle, Serious Sam 3, Sanctum 2,
Gang Beasts, and on and on... :(
I discovered this bug while working on a couple new optimization
passes. One of the passes attempts to remove condition modifiers that
are never used. The pass made no progress except on ILK and GM45.
After investigating a couple of the affected shaders, I noticed that
the code in those shaders looked wrong... investigation led to this
cause.
v2: Trivial changes in the unit tests.
v3: Fix type in comment in unit tests. Noticed by Jason and Priit.
v4: Tweak handling of BRW_OPCODE_SEL special case. Suggested by Jason.
Fixes: df1aec763eb ("i965/fs: Define methods to calculate the flag subset read or written by an fs_inst.")
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Tested-by: Dave Airlie <airlied@redhat.com>
Iron Lake
total instructions in shared programs: 8180493 -> 8181781 (0.02%)
instructions in affected programs: 541796 -> 543084 (0.24%)
helped: 28
HURT: 1158
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.35% max: 0.86% x̄: 0.53% x̃: 0.50%
HURT stats (abs) min: 1 max: 3 x̄: 1.14 x̃: 1
HURT stats (rel) min: 0.12% max: 4.00% x̄: 0.37% x̃: 0.23%
95% mean confidence interval for instructions value: 1.06 1.11
95% mean confidence interval for instructions %-change: 0.31% 0.38%
Instructions are HURT.
total cycles in shared programs: 239420470 -> 239421690 (<.01%)
cycles in affected programs: 2925992 -> 2927212 (0.04%)
helped: 49
HURT: 157
helped stats (abs) min: 2 max: 284 x̄: 62.69 x̃: 70
helped stats (rel) min: 0.04% max: 6.20% x̄: 1.68% x̃: 1.96%
HURT stats (abs) min: 2 max: 48 x̄: 27.34 x̃: 24
HURT stats (rel) min: 0.02% max: 2.91% x̄: 0.31% x̃: 0.20%
95% mean confidence interval for cycles value: -0.80 12.64
95% mean confidence interval for cycles %-change: -0.31% <.01%
Inconclusive result (value mean confidence interval includes 0).
GM45
total instructions in shared programs: 4985517 -> 4986207 (0.01%)
instructions in affected programs: 306935 -> 307625 (0.22%)
helped: 14
HURT: 625
helped stats (abs) min: 1 max: 1 x̄: 1.00 x̃: 1
helped stats (rel) min: 0.35% max: 0.82% x̄: 0.52% x̃: 0.49%
HURT stats (abs) min: 1 max: 3 x̄: 1.13 x̃: 1
HURT stats (rel) min: 0.12% max: 3.90% x̄: 0.34% x̃: 0.22%
95% mean confidence interval for instructions value: 1.04 1.12
95% mean confidence interval for instructions %-change: 0.29% 0.36%
Instructions are HURT.
total cycles in shared programs: 153827268 -> 153828052 (<.01%)
cycles in affected programs: 1669290 -> 1670074 (0.05%)
helped: 24
HURT: 84
helped stats (abs) min: 2 max: 232 x̄: 64.33 x̃: 67
helped stats (rel) min: 0.04% max: 4.62% x̄: 1.60% x̃: 1.94%
HURT stats (abs) min: 2 max: 48 x̄: 27.71 x̃: 24
HURT stats (rel) min: 0.02% max: 2.66% x̄: 0.34% x̃: 0.14%
95% mean confidence interval for cycles value: -1.94 16.46
95% mean confidence interval for cycles %-change: -0.29% 0.11%
Inconclusive result (value mean confidence interval includes 0).
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/12191>
2021-08-02 21:33:17 -07:00
|
|
|
|
(inst->flags_written(lbld.shader->devinfo) &
|
2017-09-06 18:33:38 -07:00
|
|
|
|
flag_mask(inst->src[i], type_sz(inst->src[i].type)));
|
2016-05-28 22:44:13 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-26 23:07:58 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Extract the data that would be consumed by the channel group given by
|
|
|
|
|
|
* lbld.group() from the i-th source region of instruction \p inst and return
|
2017-09-06 18:24:17 -07:00
|
|
|
|
* it as result in packed form.
|
2016-05-26 23:07:58 -07:00
|
|
|
|
*/
|
|
|
|
|
|
static fs_reg
|
2017-09-06 18:24:17 -07:00
|
|
|
|
emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
|
2016-05-26 23:07:58 -07:00
|
|
|
|
{
|
2018-12-07 14:15:50 -08:00
|
|
|
|
assert(lbld.group() >= inst->group);
|
|
|
|
|
|
|
2016-05-26 23:07:58 -07:00
|
|
|
|
/* Specified channel group from the source region. */
|
2018-12-07 14:15:50 -08:00
|
|
|
|
const fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
|
2016-05-26 23:07:58 -07:00
|
|
|
|
|
2016-05-28 22:44:13 -07:00
|
|
|
|
if (needs_src_copy(lbld, inst, i)) {
|
2016-05-26 23:07:58 -07:00
|
|
|
|
/* Builder of the right width to perform the copy avoiding uninitialized
|
|
|
|
|
|
* data if the lowered execution size is greater than the original
|
|
|
|
|
|
* execution size of the instruction.
|
|
|
|
|
|
*/
|
|
|
|
|
|
const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
|
|
|
|
|
|
inst->exec_size), 0);
|
|
|
|
|
|
const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
|
|
|
|
|
|
|
|
|
|
|
|
for (unsigned k = 0; k < inst->components_read(i); ++k)
|
2017-09-06 18:24:17 -07:00
|
|
|
|
cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
|
2016-05-26 23:07:58 -07:00
|
|
|
|
|
|
|
|
|
|
return tmp;
|
|
|
|
|
|
|
2016-05-28 22:44:13 -07:00
|
|
|
|
} else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
|
2016-05-26 23:07:58 -07:00
|
|
|
|
/* The source is invariant for all dispatch_width-wide groups of the
|
|
|
|
|
|
* original region.
|
|
|
|
|
|
*/
|
|
|
|
|
|
return inst->src[i];
|
2016-05-28 22:44:13 -07:00
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* We can just point the lowered instruction at the right channel group
|
|
|
|
|
|
* from the original region.
|
|
|
|
|
|
*/
|
|
|
|
|
|
return src;
|
2016-05-26 23:07:58 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-27 00:45:04 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Return true if splitting out the group of channels of instruction \p inst
|
|
|
|
|
|
* given by lbld.group() requires allocating a temporary for the destination
|
|
|
|
|
|
* of the lowered instruction and copying the data back to the original
|
|
|
|
|
|
* destination region.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static inline bool
|
|
|
|
|
|
needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
|
|
|
|
|
|
{
|
|
|
|
|
|
/* If the instruction writes more than one component we'll have to shuffle
|
|
|
|
|
|
* the results of multiple lowered instructions in order to make sure that
|
|
|
|
|
|
* they end up arranged correctly in the original destination region.
|
|
|
|
|
|
*/
|
2016-09-07 13:38:20 -07:00
|
|
|
|
if (inst->size_written > inst->dst.component_size(inst->exec_size))
|
2016-05-27 00:45:04 -07:00
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
/* If the lowered execution size is larger than the original the result of
|
|
|
|
|
|
* the instruction won't fit in the original destination, so we'll have to
|
|
|
|
|
|
* allocate a temporary in any case.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (lbld.dispatch_width() > inst->exec_size)
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < inst->sources; i++) {
|
|
|
|
|
|
/* If we already made a copy of the source for other reasons there won't
|
|
|
|
|
|
* be any overlap with the destination.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (needs_src_copy(lbld, inst, i))
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
/* In order to keep the logic simple we emit a copy whenever the
|
|
|
|
|
|
* destination region doesn't exactly match an overlapping source, which
|
|
|
|
|
|
* may point at the source and destination not being aligned group by
|
|
|
|
|
|
* group which could cause one of the lowered instructions to overwrite
|
|
|
|
|
|
* the data read from the same source by other lowered instructions.
|
|
|
|
|
|
*/
|
2016-09-07 13:38:20 -07:00
|
|
|
|
if (regions_overlap(inst->dst, inst->size_written,
|
2016-09-07 17:00:07 -07:00
|
|
|
|
inst->src[i], inst->size_read(i)) &&
|
2016-05-27 00:45:04 -07:00
|
|
|
|
!inst->dst.equals(inst->src[i]))
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-26 23:07:58 -07:00
|
|
|
|
/**
 * Insert data from a packed temporary into the channel group given by
 * lbld.group() of the destination region of instruction \p inst and return
 * the temporary as result.  Any copy instructions that are required for
 * unzipping the previous value (in the case of partial writes) will be
 * inserted using \p lbld_before and any copy instructions required for
 * zipping up the destination of \p inst will be inserted using \p lbld_after.
 */
static fs_reg
emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
         fs_inst *inst)
{
   /* Both builders must describe the same channel group; only their
    * insertion points differ.
    */
   assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
   assert(lbld_before.group() == lbld_after.group());
   assert(lbld_after.group() >= inst->group);

   const struct intel_device_info *devinfo = lbld_before.shader->devinfo;

   /* Specified channel group from the destination region. */
   const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);

   if (!needs_dst_copy(lbld_after, inst)) {
      /* No need to allocate a temporary for the lowered instruction, just
       * take the right group of channels from the original region.
       */
      return dst;
   }

   /* Deal with the residency data part later */
   const unsigned residency_size = inst->has_sampler_residency() ?
      (reg_unit(devinfo) * REG_SIZE) : 0;
   /* Number of per-channel components written, excluding any trailing
    * residency payload.
    */
   const unsigned dst_size = (inst->size_written - residency_size) /
      inst->dst.component_size(inst->exec_size);

   /* Packed temporary the lowered instruction will write; one extra
    * component is reserved for residency data when present.
    */
   const fs_reg tmp = lbld_after.vgrf(inst->dst.type,
                                      dst_size + inst->has_sampler_residency());

   if (inst->predicate) {
      /* Handle predication by copying the original contents of the
       * destination into the temporary before emitting the lowered
       * instruction.
       */
      const fs_builder gbld_before =
         lbld_before.group(MIN2(lbld_before.dispatch_width(),
                                inst->exec_size), 0);
      for (unsigned k = 0; k < dst_size; ++k) {
         gbld_before.MOV(offset(tmp, lbld_before, k),
                         offset(dst, inst->exec_size, k));
      }
   }

   const fs_builder gbld_after =
      lbld_after.group(MIN2(lbld_after.dispatch_width(),
                            inst->exec_size), 0);
   for (unsigned k = 0; k < dst_size; ++k) {
      /* Use a builder of the right width to perform the copy avoiding
       * uninitialized data if the lowered execution size is greater than the
       * original execution size of the instruction.
       */
      gbld_after.MOV(offset(dst, inst->exec_size, k),
                     offset(tmp, lbld_after, k));
   }

   if (inst->has_sampler_residency()) {
      /* Sampler messages with residency need a special attention. In the
       * first lane of the last component are located the Pixel Null Mask
       * (bits 0:15) & some upper bits we need to discard (bits 16:31). We
       * have to build a single 32bit value for the SIMD32 message out of 2
       * SIMD16 16 bit values.
       */
      const fs_builder rbld = gbld_after.exec_all().group(1, 0);
      fs_reg local_res_reg = component(
         retype(offset(tmp, lbld_before, dst_size),
                BRW_REGISTER_TYPE_UW), 0);
      fs_reg final_res_reg =
         retype(byte_offset(inst->dst,
                            inst->size_written - residency_size +
                            gbld_after.group() / 8),
                BRW_REGISTER_TYPE_UW);
      rbld.MOV(final_res_reg, local_res_reg);
   }

   return tmp;
}
|
|
|
|
|
|
|
2015-07-13 21:15:31 +03:00
|
|
|
|
/**
 * Split any instruction whose allowed SIMD width (as reported by
 * get_lowered_simd_width()) differs from its current execution size into
 * several narrower copies of itself, one per channel group.  Returns true
 * if any instruction was lowered.
 */
bool
fs_visitor::lower_simd_width()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const unsigned lower_width = get_lowered_simd_width(compiler, inst);

      if (lower_width != inst->exec_size) {
         /* Builder matching the original instruction.  We may also need to
          * emit an instruction of width larger than the original, set the
          * execution size of the builder to the highest of both for now so
          * we're sure that both cases can be handled.
          */
         const unsigned max_width = MAX2(inst->exec_size, lower_width);

         const fs_builder bld = fs_builder(this, dispatch_width).at_end();
         const fs_builder ibld = bld.at(block, inst)
                                    .exec_all(inst->force_writemask_all)
                                    .group(max_width, inst->group / max_width);

         /* Split the copies in chunks of the execution width of either the
          * original or the lowered instruction, whichever is lower.
          */
         const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
         /* Sampler messages with residency carry a trailing payload that is
          * not split per channel group, so exclude it from dst_size.
          */
         const unsigned residency_size = inst->has_sampler_residency() ?
            (reg_unit(devinfo) * REG_SIZE) : 0;
         const unsigned dst_size =
            (inst->size_written - residency_size) /
            inst->dst.component_size(inst->exec_size);

         /* Accumulator writes and message-length payloads can't be split
          * mechanically here; get_lowered_simd_width() must never request
          * lowering for such instructions.
          */
         assert(!inst->writes_accumulator && !inst->mlen);

         /* Inserting the zip, unzip, and duplicated instructions in all of
          * the right spots is somewhat tricky.  All of the unzip and any
          * instructions from the zip which unzip the destination prior to
          * writing need to happen before all of the per-group instructions
          * and the zip instructions need to happen after.  In order to sort
          * this all out, we insert the unzip instructions before \p inst,
          * insert the per-group instructions after \p inst (i.e. before
          * inst->next), and insert the zip instructions before the
          * instruction after \p inst.  Since we are inserting instructions
          * after \p inst, inst->next is a moving target and we need to save
          * it off here so that we insert the zip instructions in the right
          * place.
          *
          * Since we're inserting split instructions after after_inst, the
          * instructions will end up in the reverse order that we insert them.
          * However, certain render target writes require that the low group
          * instructions come before the high group.  From the Ivy Bridge PRM
          * Vol. 4, Pt. 1, Section 3.9.11:
          *
          *    "If multiple SIMD8 Dual Source messages are delivered by the
          *    pixel shader thread, each SIMD8_DUALSRC_LO message must be
          *    issued before the SIMD8_DUALSRC_HI message with the same Slot
          *    Group Select setting."
          *
          * And, from Section 3.9.11.1 of the same PRM:
          *
          *    "When SIMD32 or SIMD16 PS threads send render target writes
          *    with multiple SIMD8 and SIMD16 messages, the following must
          *    hold:
          *
          *    All the slots (as described above) must have a corresponding
          *    render target write irrespective of the slot's validity.  A slot
          *    is considered valid when at least one sample is enabled.  For
          *    example, a SIMD16 PS thread must send two SIMD8 render target
          *    writes to cover all the slots.
          *
          *    PS thread must send SIMD render target write messages with
          *    increasing slot numbers.  For example, SIMD16 thread has
          *    Slot[15:0] and if two SIMD8 render target writes are used, the
          *    first SIMD8 render target write must send Slot[7:0] and the
          *    next one must send Slot[15:8]."
          *
          * In order to make low group instructions come before high group
          * instructions (this is required for some render target writes), we
          * split from the highest group to lowest.
          */
         exec_node *const after_inst = inst->next;
         for (int i = n - 1; i >= 0; i--) {
            /* Emit a copy of the original instruction with the lowered width.
             * If the EOT flag was set throw it away except for the last
             * instruction to avoid killing the thread prematurely.
             */
            fs_inst split_inst = *inst;
            split_inst.exec_size = lower_width;
            split_inst.eot = inst->eot && i == int(n - 1);

            /* Select the correct channel enables for the i-th group, then
             * transform the sources and destination and emit the lowered
             * instruction.
             */
            const fs_builder lbld = ibld.group(lower_width, i);

            for (unsigned j = 0; j < inst->sources; j++)
               split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);

            split_inst.dst = emit_zip(lbld.at(block, inst),
                                      lbld.at(block, after_inst), inst);
            /* emit_zip() may have redirected the destination to a packed
             * temporary, so recompute the bytes written accordingly.
             */
            split_inst.size_written =
               split_inst.dst.component_size(lower_width) * dst_size +
               residency_size;

            lbld.at(block, inst->next).emit(split_inst);
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
|
|
|
|
|
|
2020-01-03 16:12:23 -08:00
|
|
|
|
/**
 * Transform barycentric vectors into the interleaved form expected by the PLN
 * instruction and returned by the Gfx7+ PI shared function.
 *
 * For channels 0-15 in SIMD16 mode they are expected to be laid out as
 * follows in the register file:
 *
 *  rN+0: X[0-7]
 *  rN+1: Y[0-7]
 *  rN+2: X[8-15]
 *  rN+3: Y[8-15]
 *
 * There is no need to handle SIMD32 here -- This is expected to be run after
 * SIMD lowering, since SIMD lowering relies on vectors having the standard
 * component layout.
 */
bool
fs_visitor::lower_barycentrics()
{
   const bool has_interleaved_layout = devinfo->has_pln || devinfo->ver >= 7;
   bool progress = false;

   /* Only fragment shaders targeting hardware with the interleaved layout
    * need this transformation.
    */
   if (stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout)
      return false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      /* Narrower instructions don't need any shuffling -- only the SIMD16
       * case interleaves two 8-channel groups.
       */
      if (inst->exec_size < 16)
         continue;

      const fs_builder ibld(this, block, inst);
      /* SIMD8 NoMask builder used for the raw register copies below. */
      const fs_builder ubld = ibld.exec_all().group(8, 0);

      switch (inst->opcode) {
      case FS_OPCODE_LINTERP : {
         assert(inst->exec_size == 16);
         /* Temporary holding the interleaved copy of the barycentric
          * vector consumed as source 0.
          */
         const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2);
         fs_reg srcs[4];

         /* Gather X[0-7], Y[0-7], X[8-15], Y[8-15] from the standard
          * layout: component i % 2 selects X vs. Y, 8 * (i / 2) selects
          * the low or high channel group.
          */
         for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++)
            srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2),
                                   8 * (i / 2));

         ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs));

         /* Feed the interleaved copy to the instruction instead of the
          * original standard-layout vector.
          */
         inst->src[0] = tmp;
         progress = true;
         break;
      }
      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: {
         assert(inst->exec_size == 16);
         /* The PI shared function returns the interleaved layout (see the
          * function comment), so redirect the result into a temporary and
          * de-interleave it back into the destination afterwards.
          */
         const fs_reg tmp = ibld.vgrf(inst->dst.type, 2);

         for (unsigned i = 0; i < 2; i++) {
            for (unsigned g = 0; g < inst->exec_size / 8; g++) {
               fs_inst *mov = ibld.at(block, inst->next).group(8, g)
                                  .MOV(horiz_offset(offset(inst->dst, ibld, i),
                                                    8 * g),
                                       offset(tmp, ubld, 2 * g + i));
               /* Carry over the predication of the original instruction so
                * the copies write exactly the same channels it would have.
                */
               mov->predicate = inst->predicate;
               mov->predicate_inverse = inst->predicate_inverse;
               mov->flag_subreg = inst->flag_subreg;
            }
         }

         inst->dst = tmp;
         progress = true;
         break;
      }
      default:
         break;
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
|
|
|
|
|
|
2019-01-10 20:23:53 -08:00
|
|
|
|
/**
|
|
|
|
|
|
* Lower a derivative instruction as the floating-point difference of two
|
|
|
|
|
|
* swizzles of the source, specified as \p swz0 and \p swz1.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static bool
|
|
|
|
|
|
lower_derivative(fs_visitor *v, bblock_t *block, fs_inst *inst,
|
|
|
|
|
|
unsigned swz0, unsigned swz1)
|
|
|
|
|
|
{
|
|
|
|
|
|
const fs_builder ibld(v, block, inst);
|
|
|
|
|
|
const fs_reg tmp0 = ibld.vgrf(inst->src[0].type);
|
|
|
|
|
|
const fs_reg tmp1 = ibld.vgrf(inst->src[0].type);
|
|
|
|
|
|
|
|
|
|
|
|
ibld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0));
|
|
|
|
|
|
ibld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1));
|
|
|
|
|
|
|
|
|
|
|
|
inst->resize_sources(2);
|
|
|
|
|
|
inst->src[0] = negate(tmp0);
|
|
|
|
|
|
inst->src[1] = tmp1;
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_ADD;
|
|
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Lower derivative instructions on platforms where codegen cannot implement
|
|
|
|
|
|
* them efficiently (i.e. XeHP).
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::lower_derivatives()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
|
|
if (devinfo->verx10 < 125)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
|
|
|
|
|
if (inst->opcode == FS_OPCODE_DDX_COARSE)
|
|
|
|
|
|
progress |= lower_derivative(this, block, inst,
|
|
|
|
|
|
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY);
|
|
|
|
|
|
|
|
|
|
|
|
else if (inst->opcode == FS_OPCODE_DDX_FINE)
|
|
|
|
|
|
progress |= lower_derivative(this, block, inst,
|
|
|
|
|
|
BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW);
|
|
|
|
|
|
|
|
|
|
|
|
else if (inst->opcode == FS_OPCODE_DDY_COARSE)
|
|
|
|
|
|
progress |= lower_derivative(this, block, inst,
|
|
|
|
|
|
BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ);
|
|
|
|
|
|
|
|
|
|
|
|
else if (inst->opcode == FS_OPCODE_DDY_FINE)
|
|
|
|
|
|
progress |= lower_derivative(this, block, inst,
|
|
|
|
|
|
BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2022-06-06 02:35:09 -07:00
|
|
|
|
/**
 * Lower SHADER_OPCODE_FIND_LIVE_CHANNEL and
 * SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL pseudo-instructions into plain bit
 * arithmetic on the combined execution/dispatch mask.
 */
bool
fs_visitor::lower_find_live_channel()
{
   bool progress = false;

   if (devinfo->ver < 8)
      return false;

   /* True when all dispatched channels are packed at the front of the
    * dispatch mask for this stage.
    */
   bool packed_dispatch =
      brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data);
   /* Fragment shaders may need VMask rather than DMask -- see the sr0 read
    * below.
    */
   bool vmask =
      stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(stage_prog_data)->uses_vmask;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL &&
          inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL)
         continue;

      bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL;

      /* Getting the first active channel index is easy on Gfx8: Just find
       * the first bit set in the execution mask.  The register exists on
       * HSW already but it reads back as all ones when the current
       * instruction has execution masking disabled, so it's kind of
       * useless there.
       */
      fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD));

      const fs_builder ibld(this, block, inst);
      if (!inst->is_partial_write())
         ibld.emit_undef_for_dst(inst);

      /* Scalar (1-wide, NoMask) builder for all the mask arithmetic. */
      const fs_builder ubld = fs_builder(this, block, inst).exec_all().group(1, 0);

      /* ce0 doesn't consider the thread dispatch mask (DMask or VMask),
       * so combine the execution and dispatch masks to obtain the true mask.
       *
       * If we're looking for the first live channel, and we have packed
       * dispatch, we can skip this step, as we know all dispatched channels
       * will appear at the front of the mask.
       */
      if (!(first && packed_dispatch)) {
         fs_reg mask = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.UNDEF(mask);
         /* Read the dispatch mask from sr0: subregister 3 (VMask) or 2
          * (DMask).
          */
         ubld.emit(SHADER_OPCODE_READ_SR_REG, mask, brw_imm_ud(vmask ? 3 : 2));

         /* Quarter control has the effect of magically shifting the value of
          * ce0 so you'll get the first/last active channel relative to the
          * specified quarter control as result.
          */
         if (inst->group > 0)
            ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8)));

         ubld.AND(mask, exec_mask, mask);
         exec_mask = mask;
      }

      if (first) {
         /* FBL returns the index of the least significant set bit, i.e. the
          * first live channel.
          */
         ubld.FBL(inst->dst, exec_mask);
      } else {
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 1);
         ubld.UNDEF(tmp);
         /* LZD counts leading zeros, so 31 - LZD(mask) is the index of the
          * most significant set bit, i.e. the last live channel.
          */
         ubld.LZD(tmp, exec_mask);
         ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31));
      }

      /* The pseudo-instruction has been fully replaced by the sequence
       * above.
       */
      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
|
|
|
|
|
|
2013-08-04 23:34:01 -07:00
|
|
|
|
/**
 * Print the whole instruction stream to \p file, one instruction per line.
 *
 * When a CFG is available each line is prefixed with the register pressure
 * at that IP and indented by the control flow nesting depth; otherwise the
 * instructions are simply numbered.
 */
void
fs_visitor::dump_instructions_to_file(FILE *file) const
{
   if (cfg) {
      const register_pressure &rp = regpressure_analysis.require();
      unsigned ip = 0, max_pressure = 0;
      /* Current control flow nesting depth, used for indentation. */
      unsigned cf_count = 0;
      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
         /* The instruction closing a control flow block is printed one
          * level shallower than its body.
          */
         if (inst->is_control_flow_end())
            cf_count -= 1;

         max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
         fprintf(file, "{%3d} %4d: ", rp.regs_live_at_ip[ip], ip);
         for (unsigned i = 0; i < cf_count; i++)
            fprintf(file, " ");
         dump_instruction(inst, file);
         ip++;

         /* Everything after a control flow opener is indented one level
          * deeper.
          */
         if (inst->is_control_flow_begin())
            cf_count += 1;
      }
      fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
   } else {
      /* No CFG available: just number the instructions sequentially. */
      int ip = 0;
      foreach_in_list(backend_instruction, inst, &instructions) {
         fprintf(file, "%4d: ", ip++);
         dump_instruction(inst, file);
      }
   }
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Print a single instruction to \p file in the textual IR form used by the
 * various INTEL_DEBUG dumps: predicate, opcode and modifiers, destination,
 * sources, and trailing execution flags.
 */
void
fs_visitor::dump_instruction_to_file(const backend_instruction *be_inst, FILE *file) const
{
   const fs_inst *inst = (const fs_inst *)be_inst;

   /* Predication prefix, e.g. "(+f0.0) " or "(-f1.1) ". */
   if (inst->predicate) {
      fprintf(file, "(%cf%d.%d) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg / 2,
              inst->flag_subreg % 2);
   }

   fprintf(file, "%s", brw_instruction_name(&compiler->isa, inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      /* Print the flag register written by the conditional mod, except for
       * the opcodes below where the flag operand is implied.
       */
      if (!inst->predicate &&
          (devinfo->ver < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_CSEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
                 inst->flag_subreg % 2);
      }
   }
   fprintf(file, "(%d) ", inst->exec_size);

   /* Message lengths and EOT flag for send-like instructions. */
   if (inst->mlen) {
      fprintf(file, "(mlen: %d) ", inst->mlen);
   }

   if (inst->ex_mlen) {
      fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
   }

   if (inst->eot) {
      fprintf(file, "(EOT) ");
   }

   /* Destination register.  Files that are unexpected in a destination are
    * marked with "***".
    */
   switch (inst->dst.file) {
   case VGRF:
      fprintf(file, "vgrf%d", inst->dst.nr);
      break;
   case FIXED_GRF:
      fprintf(file, "g%d", inst->dst.nr);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.nr);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case UNIFORM:
      fprintf(file, "***u%d***", inst->dst.nr);
      break;
   case ATTR:
      fprintf(file, "***attr%d***", inst->dst.nr);
      break;
   case ARF:
      /* Architecture registers get their conventional names. */
      switch (inst->dst.nr) {
      case BRW_ARF_NULL:
         fprintf(file, "null");
         break;
      case BRW_ARF_ADDRESS:
         fprintf(file, "a0.%d", inst->dst.subnr);
         break;
      case BRW_ARF_ACCUMULATOR:
         fprintf(file, "acc%d", inst->dst.subnr);
         break;
      case BRW_ARF_FLAG:
         fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
         break;
      default:
         fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
         break;
      }
      break;
   case IMM:
      unreachable("not reached");
   }

   /* Sub-register offset, printed when non-zero or when the instruction
    * doesn't write the full VGRF allocation.
    */
   if (inst->dst.offset ||
       (inst->dst.file == VGRF &&
        alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
      const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
      fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
              inst->dst.offset % reg_size);
   }

   if (inst->dst.stride != 1)
      fprintf(file, "<%u>", inst->dst.stride);
   fprintf(file, ":%s, ", brw_reg_type_to_letters(inst->dst.type));

   /* Source operands. */
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case VGRF:
         fprintf(file, "vgrf%d", inst->src[i].nr);
         break;
      case FIXED_GRF:
         fprintf(file, "g%d", inst->src[i].nr);
         break;
      case MRF:
         fprintf(file, "***m%d***", inst->src[i].nr);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].nr);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].nr);
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case IMM:
         /* Immediates are printed as a value plus a type suffix. */
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_HF:
            fprintf(file, "%-ghf", _mesa_half_to_float(inst->src[i].ud & 0xffff));
            break;
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%-gf", inst->src[i].f);
            break;
         case BRW_REGISTER_TYPE_DF:
            fprintf(file, "%fdf", inst->src[i].df);
            break;
         case BRW_REGISTER_TYPE_W:
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dd", inst->src[i].d);
            break;
         case BRW_REGISTER_TYPE_UW:
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uu", inst->src[i].ud);
            break;
         case BRW_REGISTER_TYPE_Q:
            fprintf(file, "%" PRId64 "q", inst->src[i].d64);
            break;
         case BRW_REGISTER_TYPE_UQ:
            fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
            break;
         case BRW_REGISTER_TYPE_VF:
            /* A VF immediate packs four 8-bit restricted floats. */
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
            break;
         case BRW_REGISTER_TYPE_V:
         case BRW_REGISTER_TYPE_UV:
            fprintf(file, "%08x%s", inst->src[i].ud,
                    inst->src[i].type == BRW_REGISTER_TYPE_V ? "V" : "UV");
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case ARF:
         switch (inst->src[i].nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->src[i].subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->src[i].subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
            break;
         }
         break;
      }

      /* Sub-register offset, as for the destination above. */
      if (inst->src[i].offset ||
          (inst->src[i].file == VGRF &&
           alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
         const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
         fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
                 inst->src[i].offset % reg_size);
      }

      /* Closing bar of the absolute-value decoration opened above. */
      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         unsigned stride;
         if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
            /* Hardware registers encode the stride as a log2 hstride field;
             * decode it to the element stride (hstride 0 means stride 0).
             */
            unsigned hstride = inst->src[i].hstride;
            stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
         } else {
            stride = inst->src[i].stride;
         }
         if (stride != 1)
            fprintf(file, "<%u>", stride);

         fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
      }

      /* Separator, omitted before trailing unused (BAD_FILE) sources. */
      if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, " ");

   /* Trailing execution flags. */
   if (inst->force_writemask_all)
      fprintf(file, "NoMask ");

   if (inst->exec_size != dispatch_width)
      fprintf(file, "group%d ", inst->group);

   fprintf(file, "\n");
}
|
|
|
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
|
/**
 * Compute the number of GRF registers live at every instruction IP of the
 * given shader, into the regs_live_at_ip array owned by this object.
 */
brw::register_pressure::register_pressure(const fs_visitor *v)
{
   const fs_live_variables &live = v->live_analysis.require();
   /* IPs are numbered consecutively across blocks, so the end IP of the
    * last block gives the total instruction count.
    */
   const unsigned num_instructions = v->cfg->num_blocks ?
      v->cfg->blocks[v->cfg->num_blocks - 1]->end_ip + 1 : 0;

   /* The trailing () value-initializes the array to zero. */
   regs_live_at_ip = new unsigned[num_instructions]();

   /* Accumulate the allocation size of every VGRF over its live range. */
   for (unsigned reg = 0; reg < v->alloc.count; reg++) {
      for (int ip = live.vgrf_start[reg]; ip <= live.vgrf_end[reg]; ip++)
         regs_live_at_ip[ip] += v->alloc.sizes[reg];
   }

   /* Payload registers are counted as live from the start of the program
    * up to their last use, one register each.
    */
   const unsigned payload_count = v->first_non_payload_grf;

   int *payload_last_use_ip = new int[payload_count];
   v->calculate_payload_ranges(payload_count, payload_last_use_ip);

   for (unsigned reg = 0; reg < payload_count; reg++) {
      for (int ip = 0; ip < payload_last_use_ip[reg]; ip++)
         ++regs_live_at_ip[ip];
   }

   delete[] payload_last_use_ip;
}
|
|
|
|
|
|
|
2016-03-13 16:35:49 -07:00
|
|
|
|
brw::register_pressure::~register_pressure()
{
   /* Matches the new[] allocation in the constructor. */
   delete[] regs_live_at_ip;
}
|
|
|
|
|
|
|
2016-03-12 18:50:24 -08:00
|
|
|
|
/**
 * Invalidate cached analysis results affected by the dependency classes in
 * \p c, both the base class' analyses and the ones owned by fs_visitor.
 */
void
fs_visitor::invalidate_analysis(brw::analysis_dependency_class c)
{
   backend_shader::invalidate_analysis(c);
   live_analysis.invalidate(c);
   regpressure_analysis.invalidate(c);
}
|
|
|
|
|
|
|
2023-08-06 15:46:12 +03:00
|
|
|
|
/**
 * Dump the current instruction stream to a file when optimizer debugging
 * (DEBUG_OPTIMIZER) is enabled for this shader, so the effect of each
 * optimization pass can be inspected and diffed.
 */
void
fs_visitor::debug_optimizer(const nir_shader *nir,
                            const char *pass_name,
                            int iteration, int pass_num) const
{
   if (!brw_should_print_shader(nir, DEBUG_OPTIMIZER))
      return;

   char *filename;
   /* The file name encodes stage, dispatch width, shader name, iteration
    * and pass number so successive dumps sort in execution order.  Output
    * goes to INTEL_SHADER_OPTIMIZER_PATH, defaulting to the current
    * directory.
    */
   int ret = asprintf(&filename, "%s/%s%d-%s-%02d-%02d-%s",
                      debug_get_option("INTEL_SHADER_OPTIMIZER_PATH", "./"),
                      _mesa_shader_stage_to_abbrev(stage), dispatch_width, nir->info.name,
                      iteration, pass_num, pass_name);
   /* On asprintf failure the buffer contents are undefined -- nothing to
    * free, just skip the dump.
    */
   if (ret == -1)
      return;
   dump_instructions(filename);
   free(filename);
}
|
|
|
|
|
|
|
2014-11-13 16:28:18 -08:00
|
|
|
|
/**
 * Run the LIR optimization pipeline: iterate the cheap cleanup passes to a
 * fixed point, then perform the lowering passes (pack operations, SIMD
 * width, logical sends, payloads, regioning, ...) with targeted cleanup
 * after each lowering that made progress.  Every pass runs through the
 * OPT() wrapper, which validates the IR after the pass and optionally dumps
 * it for debugging.
 */
void
fs_visitor::optimize()
{
   debug_optimizer(nir, "start", 0, 0);

   /* Start by validating the shader we currently have. */
   validate();

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

   /* Pass wrapper: bumps the pass counter, runs the pass, dumps the IR when
    * the pass made progress, re-validates, accumulates the overall progress
    * flag, and evaluates to the pass' own progress result so it can be used
    * in conditions like `if (OPT(...))`.
    */
#define OPT(pass, args...) ({                                           \
      pass_num++;                                                       \
      bool this_progress = pass(args);                                  \
                                                                        \
      if (this_progress)                                                \
         debug_optimizer(nir, #pass, iteration, pass_num);              \
                                                                        \
      validate();                                                       \
                                                                        \
      progress = progress || this_progress;                             \
      this_progress;                                                    \
   })

   assign_constant_locations();
   OPT(lower_constant_loads);

   validate();

   OPT(split_virtual_grfs);

   /* Before anything else, eliminate dead code.  The results of some NIR
    * instructions may effectively be calculated twice.  Once when the
    * instruction is encountered, and again when the user of that result is
    * encountered.  Wipe those away before algebraic optimizations and
    * especially copy propagation can mix things up.
    */
   OPT(dead_code_eliminate);

   OPT(remove_extra_rounding_modes);

   /* Main fixed-point loop: repeat the cheap peephole/dataflow passes until
    * none of them makes further progress.
    */
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(remove_duplicate_mrf_writes);

      OPT(opt_algebraic);
      OPT(opt_cse);
      OPT(opt_copy_propagation);
      OPT(opt_predicated_break, this);
      OPT(opt_cmod_propagation);
      OPT(dead_code_eliminate);
      OPT(opt_peephole_sel);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_register_renaming);
      OPT(opt_saturate_propagation);
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(eliminate_find_live_channel);

      OPT(compact_virtual_grfs);
   } while (progress);

   /* Lowering phase: from here on `progress` tracks only the lowerings so
    * that the conditional cleanup blocks below run just when needed.
    */
   progress = false;
   pass_num = 0;

   if (OPT(lower_pack)) {
      OPT(register_coalesce);
      OPT(dead_code_eliminate);
   }

   OPT(lower_simd_width);
   OPT(lower_barycentrics);
   OPT(lower_logical_sends);

   /* After logical SEND lowering. */

   if (OPT(opt_copy_propagation))
      OPT(opt_algebraic);

   /* Identify trailing zeros LOAD_PAYLOAD of sampler messages.
    * Do this before splitting SENDs.
    */
   if (devinfo->ver >= 7) {
      if (OPT(opt_zero_samples) && OPT(opt_copy_propagation))
         OPT(opt_algebraic);
   }

   OPT(opt_split_sends);
   OPT(fixup_nomask_control_flow);

   if (progress) {
      if (OPT(opt_copy_propagation))
         OPT(opt_algebraic);

      /* Run after logical send lowering to give it a chance to CSE the
       * LOAD_PAYLOAD instructions created to construct the payloads of
       * e.g. texturing messages in cases where it wasn't possible to CSE the
       * whole logical instruction.
       */
      OPT(opt_cse);
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
      OPT(remove_duplicate_mrf_writes);
      OPT(opt_peephole_sel);
   }

   OPT(opt_redundant_halt);

   if (OPT(lower_load_payload)) {
      OPT(split_virtual_grfs);

      /* Lower 64 bit MOVs generated by payload lowering. */
      if (!devinfo->has_64bit_float || !devinfo->has_64bit_int)
         OPT(opt_algebraic);

      OPT(register_coalesce);
      OPT(lower_simd_width);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
   }

   OPT(opt_combine_constants);
   if (OPT(lower_integer_multiplication)) {
      /* If lower_integer_multiplication made progress, it may have produced
       * some 32x32-bit MULs in the process of lowering 64-bit MULs.  Run it
       * one more time to clean those up if they exist.
       */
      OPT(lower_integer_multiplication);
   }
   OPT(lower_sub_sat);

   /* Gen4/5 have no native SEL-based min/max; lowering it late means cmod
    * propagation must be rerun on the expanded sequence.
    */
   if (devinfo->ver <= 5 && OPT(lower_minmax)) {
      OPT(opt_cmod_propagation);
      OPT(opt_cse);
      if (OPT(opt_copy_propagation))
         OPT(opt_algebraic);
      OPT(dead_code_eliminate);
   }

   progress = false;
   OPT(lower_derivatives);
   OPT(lower_regioning);
   if (progress) {
      if (OPT(opt_copy_propagation))
         OPT(opt_algebraic);
      OPT(dead_code_eliminate);
      OPT(lower_simd_width);
   }

   OPT(fixup_sends_duplicate_payload);

   OPT(lower_uniform_pull_constant_loads);

   OPT(lower_find_live_channel);

   validate();
}
|
|
|
|
|
|
|
2018-11-15 21:05:08 -06:00
|
|
|
|
/**
 * From the Skylake PRM Vol. 2a docs for sends:
 *
 *    "It is required that the second block of GRFs does not overlap with the
 *    first block."
 *
 * There are plenty of cases where we may accidentally violate this due to
 * having, for instance, both sources be the constant 0.  This little pass
 * just adds a new vgrf for the second payload and copies it over.
 *
 * Returns true if any SEND was patched (analyses are invalidated in that
 * case).
 */
bool
fs_visitor::fixup_sends_duplicate_payload()
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      /* Only split SENDs (ex_mlen > 0) whose two payload blocks
       * (src[2]/src[3]) overlap need the fixup.
       */
      if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
          regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
                          inst->src[3], inst->ex_mlen * REG_SIZE)) {
         /* Fresh VGRF to hold a non-overlapping copy of the second
          * payload block.
          */
         fs_reg tmp = fs_reg(VGRF, alloc.allocate(inst->ex_mlen),
                             BRW_REGISTER_TYPE_UD);
         /* Sadly, we've lost all notion of channels and bit sizes at this
          * point.  Just WE_all it.
          */
         const fs_builder ibld = fs_builder(this, block, inst).exec_all().group(16, 0);
         fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);
         fs_reg copy_dst = tmp;
         /* Each SIMD16 UD MOV copies two GRFs, hence the stride of 2. */
         for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
            if (inst->ex_mlen == i + 1) {
               /* Only one register left; do SIMD8 */
               ibld.group(8, 0).MOV(copy_dst, copy_src);
            } else {
               ibld.MOV(copy_dst, copy_src);
            }
            copy_src = offset(copy_src, ibld, 1);
            copy_dst = offset(copy_dst, ibld, 1);
         }
         /* Point the second payload at the copy, removing the overlap. */
         inst->src[3] = tmp;
         progress = true;
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
|
|
|
|
|
|
2014-12-29 20:33:12 -08:00
|
|
|
|
/**
|
|
|
|
|
|
* Three source instruction must have a GRF/MRF destination register.
|
|
|
|
|
|
* ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::fixup_3src_null_dest()
|
|
|
|
|
|
{
|
2016-03-11 15:27:22 -08:00
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
2014-12-29 20:33:12 -08:00
|
|
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
|
2022-06-29 14:13:31 -07:00
|
|
|
|
if (inst->is_3src(compiler) && inst->dst.is_null()) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
|
2014-12-29 20:33:12 -08:00
|
|
|
|
inst->dst.type);
|
2016-03-11 15:27:22 -08:00
|
|
|
|
progress = true;
|
2014-12-29 20:33:12 -08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2016-03-11 15:27:22 -08:00
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2016-03-13 19:26:37 -07:00
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL |
|
|
|
|
|
|
DEPENDENCY_VARIABLES);
|
2014-12-29 20:33:12 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2021-10-27 14:11:27 -07:00
|
|
|
|
static bool
|
|
|
|
|
|
needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst)
|
|
|
|
|
|
{
|
|
|
|
|
|
/* This workaround is about making sure that any instruction writing
|
|
|
|
|
|
* through UGM has completed before we hit EOT.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (inst->sfid != GFX12_SFID_UGM)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
2022-08-16 08:08:43 +00:00
|
|
|
|
/* Any UGM, non-Scratch-surface Stores (not including Atomic) messages,
|
|
|
|
|
|
* where the L1-cache override is NOT among {WB, WS, WT}
|
|
|
|
|
|
*/
|
|
|
|
|
|
enum lsc_opcode opcode = lsc_msg_desc_opcode(devinfo, inst->desc);
|
|
|
|
|
|
if (lsc_opcode_is_store(opcode)) {
|
|
|
|
|
|
switch (lsc_msg_desc_cache_ctrl(devinfo, inst->desc)) {
|
|
|
|
|
|
case LSC_CACHE_STORE_L1STATE_L3MOCS:
|
|
|
|
|
|
case LSC_CACHE_STORE_L1WB_L3WB:
|
|
|
|
|
|
case LSC_CACHE_STORE_L1S_L3UC:
|
|
|
|
|
|
case LSC_CACHE_STORE_L1S_L3WB:
|
|
|
|
|
|
case LSC_CACHE_STORE_L1WT_L3UC:
|
|
|
|
|
|
case LSC_CACHE_STORE_L1WT_L3WB:
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Any UGM Atomic message WITHOUT return value */
|
|
|
|
|
|
if (lsc_opcode_is_atomic(opcode) && inst->dst.file == BAD_FILE)
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
return false;
|
2021-10-27 14:11:27 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2023-01-20 23:19:34 -08:00
|
|
|
|
/* Wa_14015360517
|
2022-12-06 18:11:10 +02:00
|
|
|
|
*
|
|
|
|
|
|
* The first instruction of any kernel should have non-zero emask.
|
|
|
|
|
|
* Make sure this happens by introducing a dummy mov instruction.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::emit_dummy_mov_instruction()
|
|
|
|
|
|
{
|
2023-01-20 23:19:34 -08:00
|
|
|
|
if (!intel_needs_workaround(devinfo, 14015360517))
|
2022-12-06 18:11:10 +02:00
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
|
|
struct backend_instruction *first_inst =
|
|
|
|
|
|
cfg->first_block()->start();
|
|
|
|
|
|
|
|
|
|
|
|
/* We can skip the WA if first instruction is marked with
|
|
|
|
|
|
* force_writemask_all or exec_size equals dispatch_width.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (first_inst->force_writemask_all ||
|
|
|
|
|
|
first_inst->exec_size == dispatch_width)
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
|
|
/* Insert dummy mov as first instruction. */
|
|
|
|
|
|
const fs_builder ubld =
|
2023-11-21 09:47:18 -08:00
|
|
|
|
fs_builder(this, cfg->first_block(), (fs_inst *)first_inst).exec_all().group(8, 0);
|
|
|
|
|
|
ubld.MOV(ubld.null_reg_ud(), brw_imm_ud(0u));
|
2022-12-06 18:11:10 +02:00
|
|
|
|
|
|
|
|
|
|
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2021-10-27 14:11:27 -07:00
|
|
|
|
/* Wa_22013689345
 *
 * We need to emit UGM fence message before EOT, if shader has any UGM write
 * or atomic message.
 *
 * TODO/FINISHME: According to Curro we could avoid the fence in some cases.
 *                We probably need a better criteria in needs_dummy_fence().
 */
void
fs_visitor::emit_dummy_memory_fence_before_eot()
{
   bool progress = false;
   bool has_ugm_write_or_atomic = false;

   if (!intel_needs_workaround(devinfo, 22013689345))
      return;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      /* Remember whether any instruction before the EOT needs the fence;
       * only act once the EOT instruction itself is reached.
       */
      if (!inst->eot) {
         if (needs_dummy_fence(devinfo, inst))
            has_ugm_write_or_atomic = true;
         continue;
      }

      if (!has_ugm_write_or_atomic)
         break;

      /* Emit the fence as a single-channel NoMask message right before
       * the EOT instruction.
       */
      const fs_builder ibld(this, block, inst);
      const fs_builder ubld = ibld.exec_all().group(1, 0);

      fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      fs_inst *dummy_fence = ubld.emit(SHADER_OPCODE_MEMORY_FENCE,
                                       dst, brw_vec8_grf(0, 0),
                                       /* commit enable */ brw_imm_ud(1),
                                       /* bti */ brw_imm_ud(0));
      dummy_fence->sfid = GFX12_SFID_UGM;
      dummy_fence->desc = lsc_fence_msg_desc(devinfo, LSC_FENCE_TILE,
                                             LSC_FLUSH_TYPE_NONE_6, false);
      /* The scheduling fence consumes the fence's destination so the fence
       * cannot be reordered away from the EOT.
       */
      ubld.emit(FS_OPCODE_SCHEDULING_FENCE, ubld.null_reg_ud(), dst);
      progress = true;
      /* TODO: remove this break if we ever have shader with multiple EOT. */
      break;
   }

   if (progress) {
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS |
                          DEPENDENCY_VARIABLES);
   }
}
|
|
|
|
|
|
|
2020-01-23 22:55:33 -08:00
|
|
|
|
/**
|
|
|
|
|
|
* Find the first instruction in the program that might start a region of
|
|
|
|
|
|
* divergent control flow due to a HALT jump. There is no
|
|
|
|
|
|
* find_halt_control_flow_region_end(), the region of divergence extends until
|
2020-11-19 09:32:27 -06:00
|
|
|
|
* the only SHADER_OPCODE_HALT_TARGET in the program.
|
2020-01-23 22:55:33 -08:00
|
|
|
|
*/
|
|
|
|
|
|
static const fs_inst *
|
|
|
|
|
|
find_halt_control_flow_region_start(const fs_visitor *v)
|
|
|
|
|
|
{
|
2020-11-19 10:26:44 -06:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
|
|
|
|
|
|
if (inst->opcode == BRW_OPCODE_HALT ||
|
|
|
|
|
|
inst->opcode == SHADER_OPCODE_HALT_TARGET)
|
|
|
|
|
|
return inst;
|
2020-01-23 22:55:33 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Work around the Gfx12 hardware bug filed as Wa_1407528679.  EU fusion
 * can cause a BB to be executed with all channels disabled, which will lead
 * to the execution of any NoMask instructions in it, even though any
 * execution-masked instructions will be correctly shot down.  This may break
 * assumptions of some NoMask SEND messages whose descriptor depends on data
 * generated by live invocations of the shader.
 *
 * This avoids the problem by predicating certain instructions on an ANY
 * horizontal predicate that makes sure that their execution is omitted when
 * all channels of the program are disabled.
 *
 * Returns true if any instruction was predicated (analyses are invalidated
 * in that case).
 */
bool
fs_visitor::fixup_nomask_control_flow()
{
   /* The workaround only applies to Gfx12. */
   if (devinfo->ver != 12)
      return false;

   /* Pick the ANY predicate matching the shader's dispatch width. */
   const brw_predicate pred = dispatch_width > 16 ? BRW_PREDICATE_ALIGN1_ANY32H :
                              dispatch_width > 8 ? BRW_PREDICATE_ALIGN1_ANY16H :
                              BRW_PREDICATE_ALIGN1_ANY8H;
   const fs_inst *halt_start = find_halt_control_flow_region_start(this);
   /* depth > 0 means we're inside divergent control flow (the backward walk
    * makes ENDIF/WHILE/HALT_TARGET open a region and IF/DO close it).
    */
   unsigned depth = 0;
   bool progress = false;

   const fs_live_variables &live_vars = live_analysis.require();

   /* Scan the program backwards in order to be able to easily determine
    * whether the flag register is live at any point.
    */
   foreach_block_reverse_safe(block, cfg) {
      BITSET_WORD flag_liveout = live_vars.block_data[block->num]
                                          .flag_liveout[0];
      STATIC_ASSERT(ARRAY_SIZE(live_vars.block_data[0].flag_liveout) == 1);

      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
         /* Unpredicated full-width writes kill the flag bits they write,
          * shrinking the live set as we walk backwards.
          */
         if (!inst->predicate && inst->exec_size >= 8)
            flag_liveout &= ~inst->flags_written(devinfo);

         switch (inst->opcode) {
         case BRW_OPCODE_DO:
         case BRW_OPCODE_IF:
            /* Note that this doesn't handle BRW_OPCODE_HALT since only
             * the first one in the program closes the region of divergent
             * control flow due to any HALT instructions -- Instead this is
             * handled with the halt_start check below.
             */
            depth--;
            break;

         case BRW_OPCODE_WHILE:
         case BRW_OPCODE_ENDIF:
         case SHADER_OPCODE_HALT_TARGET:
            depth++;
            break;

         default:
            /* Note that the vast majority of NoMask SEND instructions in the
             * program are harmless while executed in a block with all
             * channels disabled, since any instructions with side effects we
             * could hit here should be execution-masked.
             *
             * The main concern is NoMask SEND instructions where the message
             * descriptor or header depends on data generated by live
             * invocations of the shader (RESINFO and
             * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD with a dynamically
             * computed surface index seem to be the only examples right now
             * where this could easily lead to GPU hangs).  Unfortunately we
             * have no straightforward way to detect that currently, so just
             * predicate any NoMask SEND instructions we find under control
             * flow.
             *
             * If this proves to have a measurable performance impact it can
             * be easily extended with a whitelist of messages we know we can
             * safely omit the predication for.
             */
            if (depth && inst->force_writemask_all &&
                is_send(inst) && !inst->predicate) {
               /* We need to load the execution mask into the flag register by
                * using a builder with channel group matching the whole shader
                * (rather than the default which is derived from the original
                * instruction), in order to avoid getting a right-shifted
                * value.
                */
               const fs_builder ubld = fs_builder(this, block, inst)
                                       .exec_all().group(dispatch_width, 0);
               const fs_reg flag = retype(brw_flag_reg(0, 0),
                                          BRW_REGISTER_TYPE_UD);

               /* Due to the lack of flag register allocation we need to save
                * and restore the flag register if it's live.
                */
               const bool save_flag = flag_liveout &
                                      flag_mask(flag, dispatch_width / 8);
               const fs_reg tmp = ubld.group(8, 0).vgrf(flag.type);

               if (save_flag) {
                  ubld.group(8, 0).UNDEF(tmp);
                  ubld.group(1, 0).MOV(tmp, flag);
               }

               ubld.emit(FS_OPCODE_LOAD_LIVE_CHANNELS);

               set_predicate(pred, inst);
               inst->flag_subreg = 0;
               inst->predicate_trivial = true;

               if (save_flag)
                  ubld.group(1, 0).at(block, inst->next).MOV(flag, tmp);

               progress = true;
            }
            break;
         }

         /* The first HALT-related instruction in the program closes the
          * HALT divergence region during the backward walk.
          */
         if (inst == halt_start)
            depth--;

         /* Flags read by this instruction are live above it. */
         flag_liveout |= inst->flags_read(devinfo);
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);

   return progress;
}
|
|
|
|
|
|
|
2023-02-03 17:02:28 +01:00
|
|
|
|
uint32_t
|
|
|
|
|
|
fs_visitor::compute_max_register_pressure()
|
|
|
|
|
|
{
|
|
|
|
|
|
const register_pressure &rp = regpressure_analysis.require();
|
|
|
|
|
|
uint32_t ip = 0, max_pressure = 0;
|
|
|
|
|
|
foreach_block_and_inst(block, backend_instruction, inst, cfg) {
|
|
|
|
|
|
max_pressure = MAX2(max_pressure, rp.regs_live_at_ip[ip]);
|
|
|
|
|
|
ip++;
|
|
|
|
|
|
}
|
|
|
|
|
|
return max_pressure;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-23 02:19:06 -07:00
|
|
|
|
/**
 * Snapshot the program's instruction order as a flat array indexed by IP.
 *
 * Before we schedule anything, stash off the instruction order as an array
 * of fs_inst *.  This way, we can reset it between scheduling passes to
 * prevent dependencies between the different scheduling modes.  The caller
 * owns the returned array (delete[]).
 */
static fs_inst **
save_instruction_order(const struct cfg_t *cfg)
{
   const int num_insts = cfg->last_block()->end_ip + 1;
   fs_inst **order = new fs_inst * [num_insts];

   int ip = 0;
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      assert(ip >= block->start_ip && ip <= block->end_ip);
      order[ip++] = inst;
   }
   assert(ip == num_insts);

   return order;
}
|
|
|
|
|
|
|
|
|
|
|
|
static void
|
|
|
|
|
|
restore_instruction_order(struct cfg_t *cfg, fs_inst **inst_arr)
|
|
|
|
|
|
{
|
|
|
|
|
|
int num_insts = cfg->last_block()->end_ip + 1;
|
|
|
|
|
|
|
|
|
|
|
|
int ip = 0;
|
|
|
|
|
|
foreach_block (block, cfg) {
|
|
|
|
|
|
block->instructions.make_empty();
|
|
|
|
|
|
|
|
|
|
|
|
assert(ip == block->start_ip);
|
|
|
|
|
|
for (; ip <= block->end_ip; ip++)
|
|
|
|
|
|
block->instructions.push_tail(inst_arr[ip]);
|
|
|
|
|
|
}
|
|
|
|
|
|
assert(ip == num_insts);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
|
/**
 * Run pre-RA scheduling and register allocation, retrying with
 * progressively more allocation-friendly scheduler modes.
 *
 * If no mode allocates without spilling, the mode with the lowest
 * estimated register pressure is restored and allocation is retried
 * with spilling enabled (when \p allow_spilling is set).  On success
 * this also runs the post-RA passes (bank conflict mitigation, post-RA
 * scheduling, scratch bookkeeping, SWSB lowering).
 */
void
fs_visitor::allocate_registers(bool allow_spilling)
{
   bool allocated;

   /* Scheduling heuristics to try, ordered by decreasing expected
    * performance but increasing likelihood of register allocating
    * without spilling.
    */
   static const enum instruction_scheduler_mode pre_modes[] = {
      SCHEDULE_PRE,
      SCHEDULE_PRE_NON_LIFO,
      SCHEDULE_NONE,
      SCHEDULE_PRE_LIFO,
   };
   /* Human-readable mode names for shader_stats and debug output. */
   static const char *scheduler_mode_name[] = {
      [SCHEDULE_PRE] = "top-down",
      [SCHEDULE_PRE_NON_LIFO] = "non-lifo",
      [SCHEDULE_PRE_LIFO] = "lifo",
      [SCHEDULE_POST] = "post",
      [SCHEDULE_NONE] = "none",
   };

   /* Track the lowest-pressure schedule seen so far, in case every mode
    * spills and we have to pick the least-bad one.
    */
   uint32_t best_register_pressure = UINT32_MAX;
   enum instruction_scheduler_mode best_sched = SCHEDULE_NONE;

   compact_virtual_grfs();

   if (needs_register_pressure)
      shader_stats.max_register_pressure = compute_max_register_pressure();

   debug_optimizer(nir, "pre_register_allocate", 90, 90);

   /* INTEL_DEBUG=spill_fs forces spilling even when allocation succeeds. */
   bool spill_all = allow_spilling && INTEL_DEBUG(DEBUG_SPILL_FS);

   /* Before we schedule anything, stash off the instruction order as an array
    * of fs_inst *.  This way, we can reset it between scheduling passes to
    * prevent dependencies between the different scheduling modes.
    */
   fs_inst **orig_order = save_instruction_order(cfg);
   fs_inst **best_pressure_order = NULL;

   void *scheduler_ctx = ralloc_context(NULL);
   fs_instruction_scheduler *sched = prepare_scheduler(scheduler_ctx);

   /* Try each scheduling heuristic to see if it can successfully register
    * allocate without spilling.  They should be ordered by decreasing
    * performance but increasing likelihood of allocating.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
      enum instruction_scheduler_mode sched_mode = pre_modes[i];

      schedule_instructions_pre_ra(sched, sched_mode);
      this->shader_stats.scheduler_mode = scheduler_mode_name[sched_mode];

      debug_optimizer(nir, shader_stats.scheduler_mode, 95, i);

      /* Debug escape hatch: flip to 1 to bypass real RA entirely. */
      if (0) {
         assign_regs_trivial();
         allocated = true;
         break;
      }

      /* We should only spill registers on the last scheduling. */
      assert(!spilled_any_registers);

      /* First pass: no spilling allowed — see if this schedule fits. */
      allocated = assign_regs(false, spill_all);
      if (allocated)
         break;

      /* Save the maximum register pressure */
      uint32_t this_pressure = compute_max_register_pressure();

      /* Debug output for scheduler-mode comparisons; disabled by default. */
      if (0) {
         fprintf(stderr, "Scheduler mode \"%s\" spilled, max pressure = %u\n",
                 scheduler_mode_name[sched_mode], this_pressure);
      }

      /* Remember the schedule with the lowest pressure estimate so we can
       * fall back to it if every mode ends up spilling.
       */
      if (this_pressure < best_register_pressure) {
         best_register_pressure = this_pressure;
         best_sched = sched_mode;
         delete[] best_pressure_order;
         best_pressure_order = save_instruction_order(cfg);
      }

      /* Reset back to the original order before trying the next mode */
      restore_instruction_order(cfg, orig_order);
      invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
   }

   ralloc_free(scheduler_ctx);

   if (!allocated) {
      if (0) {
         fprintf(stderr, "Spilling - using lowest-pressure mode \"%s\"\n",
                 scheduler_mode_name[best_sched]);
      }
      /* All modes spilled: restore the lowest-pressure schedule and allow
       * spilling this time (if the caller permits it).
       */
      restore_instruction_order(cfg, best_pressure_order);
      shader_stats.scheduler_mode = scheduler_mode_name[best_sched];

      allocated = assign_regs(allow_spilling, spill_all);
   }

   delete[] orig_order;
   delete[] best_pressure_order;

   if (!allocated) {
      fail("Failure to register allocate. Reduce number of "
           "live scalar values to avoid this.");
   } else if (spilled_any_registers) {
      brw_shader_perf_log(compiler, log_data,
                          "%s shader triggered register spilling. "
                          "Try reducing the number of live scalar "
                          "values to improve performance.\n",
                          _mesa_shader_stage_to_string(stage));
   }

   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
    * so based on the actual physical registers in use.
    */
   insert_gfx4_send_dependency_workarounds();

   if (failed)
      return;

   opt_bank_conflicts();

   schedule_instructions_post_ra();

   if (last_scratch > 0) {
      /* Only referenced by the assert below in release builds. */
      ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;

      /* Take the max of any previously compiled variant of the shader. In the
       * case of bindless shaders with return parts, this will also take the
       * max of all parts.
       */
      prog_data->total_scratch = MAX2(brw_get_scratch_size(last_scratch),
                                      prog_data->total_scratch);

      if (gl_shader_stage_is_compute(stage)) {
         if (devinfo->platform == INTEL_PLATFORM_HSW) {
            /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
             * field documentation, Haswell supports a minimum of 2kB of
             * scratch space for compute shaders, unlike every other stage
             * and platform.
             */
            prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
         } else if (devinfo->ver <= 7) {
            /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
             * field documentation, platforms prior to Haswell measure scratch
             * size linearly with a range of [1kB, 12kB] and 1kB granularity.
             */
            prog_data->total_scratch = ALIGN(last_scratch, 1024);
            max_scratch_size = 12 * 1024;
         }
      }

      /* We currently only support up to 2MB of scratch space. If we
       * need to support more eventually, the documentation suggests
       * that we could allocate a larger buffer, and partition it out
       * ourselves. We'd just have to undo the hardware's address
       * calculation by subtracting (FFTID * Per Thread Scratch Space)
       * and then add FFTID * (Larger Per Thread Scratch Space).
       *
       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
       * Thread Group Tracking > Local Memory/Scratch Space.
       */
      assert(prog_data->total_scratch < max_scratch_size);
   }

   lower_scoreboard();
}
|
|
|
|
|
|
|
2014-10-27 22:42:50 -07:00
|
|
|
|
/**
 * Compile a vertex shader: translate NIR, optimize, and register
 * allocate.  Returns false if compilation failed at any point.
 *
 * NOTE(review): the pass ordering below is significant — each pass
 * depends on the state left by the previous one.
 */
bool
fs_visitor::run_vs()
{
   assert(stage == MESA_SHADER_VERTEX);

   /* Build the VS-specific thread payload layout before code generation. */
   payload_ = new vs_thread_payload(*this);

   emit_nir_code();

   if (failed)
      return false;

   /* VS threads end by writing their outputs to the URB. */
   emit_urb_writes();

   calculate_cfg();

   optimize();

   /* Map push constants and URB inputs to physical registers. */
   assign_curb_setup();
   assign_vs_urb_setup();

   fixup_3src_null_dest();
   emit_dummy_memory_fence_before_eot();

   /* Wa_14015360517 */
   emit_dummy_mov_instruction();

   allocate_registers(true /* allow_spilling */);

   /* allocate_registers() may have set `failed` (e.g. RA failure). */
   return !failed;
}
|
|
|
|
|
|
|
2019-05-03 14:20:00 -07:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::set_tcs_invocation_id()
|
2015-11-14 17:40:43 -08:00
|
|
|
|
{
|
2016-09-08 23:48:51 -07:00
|
|
|
|
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
|
struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
|
2023-11-21 09:47:18 -08:00
|
|
|
|
const fs_builder bld = fs_builder(this, dispatch_width).at_end();
|
2015-11-14 17:40:43 -08:00
|
|
|
|
|
2019-05-03 14:24:49 -07:00
|
|
|
|
const unsigned instance_id_mask =
|
2022-08-09 14:02:16 -07:00
|
|
|
|
(devinfo->verx10 >= 125) ? INTEL_MASK(7, 0) :
|
|
|
|
|
|
(devinfo->ver >= 11) ? INTEL_MASK(22, 16) :
|
|
|
|
|
|
INTEL_MASK(23, 17);
|
2019-05-03 14:24:49 -07:00
|
|
|
|
const unsigned instance_id_shift =
|
2022-08-09 14:02:16 -07:00
|
|
|
|
(devinfo->verx10 >= 125) ? 0 : (devinfo->ver >= 11) ? 16 : 17;
|
2015-11-14 17:40:43 -08:00
|
|
|
|
|
2020-07-13 22:24:19 -07:00
|
|
|
|
/* Get instance number from g0.2 bits:
|
|
|
|
|
|
* * 7:0 on DG2+
|
|
|
|
|
|
* * 22:16 on gfx11+
|
|
|
|
|
|
* * 23:17 otherwise
|
|
|
|
|
|
*/
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
|
fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
|
|
|
|
|
|
brw_imm_ud(instance_id_mask));
|
|
|
|
|
|
|
|
|
|
|
|
invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
|
2022-08-16 11:02:20 -07:00
|
|
|
|
if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_MULTI_PATCH) {
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
|
/* gl_InvocationID is just the thread number */
|
|
|
|
|
|
bld.SHR(invocation_id, t, brw_imm_ud(instance_id_shift));
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH);
|
|
|
|
|
|
|
2015-11-14 17:40:43 -08:00
|
|
|
|
fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
|
|
|
|
|
|
fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
|
|
|
|
|
|
bld.MOV(channels_ud, channels_uw);
|
|
|
|
|
|
|
|
|
|
|
|
if (tcs_prog_data->instances == 1) {
|
|
|
|
|
|
invocation_id = channels_ud;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
2019-05-03 14:24:49 -07:00
|
|
|
|
bld.SHR(instance_times_8, t, brw_imm_ud(instance_id_shift - 3));
|
2015-11-14 17:40:43 -08:00
|
|
|
|
bld.ADD(invocation_id, instance_times_8, channels_ud);
|
|
|
|
|
|
}
|
2019-05-03 14:20:00 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
intel/compiler: Use an existing URB write to end TCS threads when viable
VS, TCS, TES, and GS threads must end with a URB write message with the
EOT (end of thread) bit set. For VS and TES, we shadow output variables
with temporaries and perform all stores at the end of the shader, giving
us an existing message to do the EOT.
In tessellation control shaders, we don't defer output stores until the
end of the thread like we do for vertex or evaluation shaders. We just
process store_output and store_per_vertex_output intrinsics where they
occur, which may be in control flow. So we can't guarantee that there's
a URB write being at the end of the shader.
Traditionally, we've just emitted a separate URB write to finish TCS
threads, doing a writemasked write to an single patch header DWord.
On Broadwell, we need to set a "TR DS Cache Disable" bit, so this is
a convenient spot to do so. But on other platforms, there's no such
field, and this write is purely wasteful.
Insetad of emitting a separate write, we can just look for an existing
URB write at the end of the program and tag that with EOT, if possible.
We already had code to do this for geometry shaders, so just lift it
into a helper function and reuse it.
No changes in shader-db.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17944>
2022-08-03 20:54:52 -07:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::emit_tcs_thread_end()
|
|
|
|
|
|
{
|
|
|
|
|
|
/* Try and tag the last URB write with EOT instead of emitting a whole
|
|
|
|
|
|
* separate write just to finish the thread. There isn't guaranteed to
|
|
|
|
|
|
* be one, so this may not succeed.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (devinfo->ver != 8 && mark_last_urb_write_with_eot())
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
2023-11-21 09:47:18 -08:00
|
|
|
|
const fs_builder bld = fs_builder(this, dispatch_width).at_end();
|
|
|
|
|
|
|
intel/compiler: Use an existing URB write to end TCS threads when viable
VS, TCS, TES, and GS threads must end with a URB write message with the
EOT (end of thread) bit set. For VS and TES, we shadow output variables
with temporaries and perform all stores at the end of the shader, giving
us an existing message to do the EOT.
In tessellation control shaders, we don't defer output stores until the
end of the thread like we do for vertex or evaluation shaders. We just
process store_output and store_per_vertex_output intrinsics where they
occur, which may be in control flow. So we can't guarantee that there's
a URB write being at the end of the shader.
Traditionally, we've just emitted a separate URB write to finish TCS
threads, doing a writemasked write to an single patch header DWord.
On Broadwell, we need to set a "TR DS Cache Disable" bit, so this is
a convenient spot to do so. But on other platforms, there's no such
field, and this write is purely wasteful.
Insetad of emitting a separate write, we can just look for an existing
URB write at the end of the program and tag that with EOT, if possible.
We already had code to do this for geometry shaders, so just lift it
into a helper function and reuse it.
No changes in shader-db.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17944>
2022-08-03 20:54:52 -07:00
|
|
|
|
/* Emit a URB write to end the thread. On Broadwell, we use this to write
|
|
|
|
|
|
* zero to the "TR DS Cache Disable" bit (we haven't implemented a fancy
|
|
|
|
|
|
* algorithm to set it optimally). On other platforms, we simply write
|
|
|
|
|
|
* zero to a reserved/MBZ patch header DWord which has no consequence.
|
|
|
|
|
|
*/
|
|
|
|
|
|
fs_reg srcs[URB_LOGICAL_NUM_SRCS];
|
|
|
|
|
|
srcs[URB_LOGICAL_SRC_HANDLE] = tcs_payload().patch_urb_output;
|
|
|
|
|
|
srcs[URB_LOGICAL_SRC_CHANNEL_MASK] = brw_imm_ud(WRITEMASK_X << 16);
|
|
|
|
|
|
srcs[URB_LOGICAL_SRC_DATA] = brw_imm_ud(0);
|
2022-09-28 16:38:35 -07:00
|
|
|
|
srcs[URB_LOGICAL_SRC_COMPONENTS] = brw_imm_ud(1);
|
intel/compiler: Use an existing URB write to end TCS threads when viable
VS, TCS, TES, and GS threads must end with a URB write message with the
EOT (end of thread) bit set. For VS and TES, we shadow output variables
with temporaries and perform all stores at the end of the shader, giving
us an existing message to do the EOT.
In tessellation control shaders, we don't defer output stores until the
end of the thread like we do for vertex or evaluation shaders. We just
process store_output and store_per_vertex_output intrinsics where they
occur, which may be in control flow. So we can't guarantee that there's
a URB write being at the end of the shader.
Traditionally, we've just emitted a separate URB write to finish TCS
threads, doing a writemasked write to an single patch header DWord.
On Broadwell, we need to set a "TR DS Cache Disable" bit, so this is
a convenient spot to do so. But on other platforms, there's no such
field, and this write is purely wasteful.
Insetad of emitting a separate write, we can just look for an existing
URB write at the end of the program and tag that with EOT, if possible.
We already had code to do this for geometry shaders, so just lift it
into a helper function and reuse it.
No changes in shader-db.
Reviewed-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com>
Reviewed-by: Caio Oliveira <caio.oliveira@intel.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17944>
2022-08-03 20:54:52 -07:00
|
|
|
|
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_LOGICAL,
|
|
|
|
|
|
reg_undef, srcs, ARRAY_SIZE(srcs));
|
|
|
|
|
|
inst->eot = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-05-03 14:20:00 -07:00
|
|
|
|
/**
 * Compile a tessellation control shader from NIR to native code.
 *
 * TCS output stores are emitted where they occur (possibly inside control
 * flow), so the shader body may be wrapped in an IF/ENDIF predicating off
 * channels that don't correspond to a real invocation, and the thread is
 * ended explicitly via emit_tcs_thread_end().
 *
 * Returns false if compilation failed at any point.
 */
bool
fs_visitor::run_tcs()
{
   assert(stage == MESA_SHADER_TESS_CTRL);

   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
   const fs_builder bld = fs_builder(this, dispatch_width).at_end();

   /* Only SINGLE_PATCH and MULTI_PATCH dispatch modes are handled here. */
   assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH ||
          vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_MULTI_PATCH);

   payload_ = new tcs_thread_payload(*this);

   /* Initialize gl_InvocationID */
   set_tcs_invocation_id();

   /* In SINGLE_PATCH mode each channel corresponds to one patch vertex, so
    * if the output vertex count isn't a multiple of 8, some channels don't
    * map to real invocations and must be predicated off.
    */
   const bool fix_dispatch_mask =
      vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH &&
      (nir->info.tess.tcs_vertices_out % 8) != 0;

   /* Fix the dispatch mask */
   if (fix_dispatch_mask) {
      bld.CMP(bld.null_reg_ud(), invocation_id,
              brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
      bld.IF(BRW_PREDICATE_NORMAL);
   }

   /* Translate the NIR shader into our IR (inside the IF, if any). */
   emit_nir_code();

   if (fix_dispatch_mask) {
      bld.emit(BRW_OPCODE_ENDIF);
   }

   /* End the thread with a URB write carrying EOT. */
   emit_tcs_thread_end();

   if (failed)
      return false;

   calculate_cfg();

   optimize();

   /* Assign locations for push constants and URB inputs. */
   assign_curb_setup();
   assign_tcs_urb_setup();

   fixup_3src_null_dest();
   emit_dummy_memory_fence_before_eot();

   /* Wa_14015360517 */
   emit_dummy_mov_instruction();

   allocate_registers(true /* allow_spilling */);

   return !failed;
}
|
|
|
|
|
|
|
2015-11-10 14:35:27 -08:00
|
|
|
|
/**
 * Compile a tessellation evaluation shader from NIR to native code.
 *
 * TES outputs are shadowed in temporaries and flushed by a single batch of
 * URB writes at the end of the shader (emit_urb_writes()), which also
 * supplies the EOT message that terminates the thread.
 *
 * Returns false if compilation failed at any point.
 */
bool
fs_visitor::run_tes()
{
   assert(stage == MESA_SHADER_TESS_EVAL);

   /* Set up the TES-specific thread payload layout. */
   payload_ = new tes_thread_payload(*this);

   /* Translate the NIR shader into our IR. */
   emit_nir_code();

   if (failed)
      return false;

   /* Flush all shader outputs to the URB (includes the EOT write). */
   emit_urb_writes();

   calculate_cfg();

   optimize();

   /* Assign locations for push constants and URB inputs. */
   assign_curb_setup();
   assign_tes_urb_setup();

   fixup_3src_null_dest();
   emit_dummy_memory_fence_before_eot();

   /* Wa_14015360517 */
   emit_dummy_mov_instruction();

   allocate_registers(true /* allow_spilling */);

   return !failed;
}
|
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
|
/**
 * Compile a geometry shader from NIR to native code.
 *
 * Sets up the control-data-bits bookkeeping used when emitting vertices,
 * translates the shader, ends the thread, then optimizes and allocates
 * registers.
 *
 * Returns false if compilation failed at any point.
 */
bool
fs_visitor::run_gs()
{
   assert(stage == MESA_SHADER_GEOMETRY);

   /* Set up the GS-specific thread payload layout. */
   payload_ = new gs_thread_payload(*this);

   /* Allocate a VGRF for the final emitted-vertex count. */
   this->final_gs_vertex_count = vgrf(glsl_type::uint_type);

   if (gs_compile->control_data_header_size_bits > 0) {
      /* Create a VGRF to store accumulated control data bits. */
      this->control_data_bits = vgrf(glsl_type::uint_type);

      /* If we're outputting more than 32 control data bits, then EmitVertex()
       * will set control_data_bits to 0 after emitting the first vertex.
       * Otherwise, we need to initialize it to 0 here.
       */
      if (gs_compile->control_data_header_size_bits <= 32) {
         const fs_builder bld = fs_builder(this, dispatch_width).at_end();
         const fs_builder abld = bld.annotate("initialize control data bits");
         abld.MOV(this->control_data_bits, brw_imm_ud(0u));
      }
   }

   /* Translate the NIR shader into our IR. */
   emit_nir_code();

   /* End the thread (URB write tagged with EOT where possible). */
   emit_gs_thread_end();

   if (failed)
      return false;

   calculate_cfg();

   optimize();

   /* Assign locations for push constants and URB inputs. */
   assign_curb_setup();
   assign_gs_urb_setup();

   fixup_3src_null_dest();
   emit_dummy_memory_fence_before_eot();

   /* Wa_14015360517 */
   emit_dummy_mov_instruction();

   allocate_registers(true /* allow_spilling */);

   return !failed;
}
|
|
|
|
|
|
|
2017-10-25 16:50:11 +03:00
|
|
|
|
/* From the SKL PRM, Volume 16, Workarounds:
|
|
|
|
|
|
*
|
|
|
|
|
|
* 0877 3D Pixel Shader Hang possible when pixel shader dispatched with
|
|
|
|
|
|
* only header phases (R0-R2)
|
|
|
|
|
|
*
|
|
|
|
|
|
* WA: Enable a non-header phase (e.g. push constant) when dispatch would
|
|
|
|
|
|
* have been header only.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Instead of enabling push constants one can alternatively enable one of the
|
|
|
|
|
|
* inputs. Here one simply chooses "layer" which shouldn't impose much
|
|
|
|
|
|
* overhead.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static void
|
2021-03-29 15:40:04 -07:00
|
|
|
|
gfx9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
|
2017-10-25 16:50:11 +03:00
|
|
|
|
{
|
|
|
|
|
|
if (wm_prog_data->num_varying_inputs)
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
|
|
if (wm_prog_data->base.curb_read_length)
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
|
|
wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
|
|
|
|
|
|
wm_prog_data->num_varying_inputs = 1;
|
2018-12-11 18:45:43 +01:00
|
|
|
|
|
|
|
|
|
|
brw_compute_urb_setup_index(wm_prog_data);
|
2017-10-25 16:50:11 +03:00
|
|
|
|
}
|
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
/**
 * Compile a fragment shader from NIR to native code.
 *
 * @param allow_spilling  whether register allocation may spill to scratch
 * @param do_rep_send     if set, emit the fixed SIMD16 replicated-data
 *                        clear shader instead of compiling the NIR program
 *
 * Returns false if compilation failed at any point.
 */
bool
fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
{
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
   const fs_builder bld = fs_builder(this, dispatch_width).at_end();

   assert(stage == MESA_SHADER_FRAGMENT);

   payload_ = new fs_thread_payload(*this, source_depth_to_render_target,
                                    runtime_check_aads_emit);

   if (do_rep_send) {
      /* Replicated-data fast-clear path: fixed shader, SIMD16 only. */
      assert(dispatch_width == 16);
      emit_repclear_shader();
   } else {
      /* Interpolation setup is only needed if something is actually read:
       * an input varying, gl_FragCoord, or a non-coherent FB-fetch output.
       */
      if (nir->info.inputs_read > 0 ||
          BITSET_TEST(nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) ||
          (nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
         if (devinfo->ver < 6)
            emit_interpolation_setup_gfx4();
         else
            emit_interpolation_setup_gfx6();
      }

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (wm_prog_data->uses_kill) {
         /* The mask is seeded one 16-wide group at a time. */
         const unsigned lower_width = MIN2(dispatch_width, 16);
         for (unsigned i = 0; i < dispatch_width / lower_width; i++) {
            /* On Gfx6+ the dispatch mask lives in g1.7 (g2.7 for the
             * second SIMD16 half); older platforms read it from g0.0.
             */
            const fs_reg dispatch_mask =
               devinfo->ver >= 6 ? brw_vec1_grf((i ? 2 : 1), 7) :
               brw_vec1_grf(0, 0);
            bld.exec_all().group(1, 0)
               .MOV(brw_sample_mask_reg(bld.group(lower_width, i)),
                    retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
         }
      }

      if (nir->info.writes_memory)
         wm_prog_data->has_side_effects = true;

      /* Translate the NIR shader into our IR. */
      emit_nir_code();

      if (failed)
         return false;

      if (wm_key->emit_alpha_test)
         emit_alpha_test();

      emit_fb_writes();

      calculate_cfg();

      optimize();

      assign_curb_setup();

      /* Gfx9 can hang on header-only PS dispatch; force one varying. */
      if (devinfo->ver == 9)
         gfx9_ps_header_only_workaround(wm_prog_data);

      assign_urb_setup();

      fixup_3src_null_dest();
      emit_dummy_memory_fence_before_eot();

      /* Wa_14015360517 */
      emit_dummy_mov_instruction();

      allocate_registers(allow_spilling);
   }

   return !failed;
}
|
2010-08-26 12:12:00 -07:00
|
|
|
|
|
2014-08-30 19:57:39 -07:00
|
|
|
|
/**
 * Compile a compute (or compute-like) shader from NIR to native code.
 *
 * @param allow_spilling  whether register allocation may spill to scratch
 *
 * Returns false if compilation failed at any point.
 */
bool
fs_visitor::run_cs(bool allow_spilling)
{
   assert(gl_shader_stage_is_compute(stage));
   assert(devinfo->ver >= 7);
   const fs_builder bld = fs_builder(this, dispatch_width).at_end();

   /* Set up the CS-specific thread payload layout. */
   payload_ = new cs_thread_payload(*this);

   /* Haswell with shared local memory needs the SLM index copied into
    * the state register before any SLM access.
    */
   if (devinfo->platform == INTEL_PLATFORM_HSW && prog_data->total_shared > 0) {
      /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
      const fs_builder abld = bld.exec_all().group(1, 0);
      abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
               suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
   }

   /* Translate the NIR shader into our IR. */
   emit_nir_code();

   if (failed)
      return false;

   /* Emit the message that terminates the compute thread. */
   emit_cs_terminate();

   calculate_cfg();

   optimize();

   /* Assign locations for push constants. */
   assign_curb_setup();

   fixup_3src_null_dest();
   emit_dummy_memory_fence_before_eot();

   /* Wa_14015360517 */
   emit_dummy_mov_instruction();

   allocate_registers(allow_spilling);

   return !failed;
}
|
|
|
|
|
|
|
2020-10-21 14:46:50 -05:00
|
|
|
|
/**
 * Compile a bindless (ray-tracing) shader stage — raygen through callable.
 *
 * Follows the same pipeline as run_cs(): translate NIR, optimize the LIR,
 * and allocate registers.  Returns false if compilation failed.
 */
bool
fs_visitor::run_bs(bool allow_spilling)
{
   assert(stage >= MESA_SHADER_RAYGEN && stage <= MESA_SHADER_CALLABLE);

   payload_ = new bs_thread_payload(*this);

   /* NIR -> LIR translation; sets `failed` on error. */
   emit_nir_code();

   if (failed)
      return false;

   /* TODO(RT): Perhaps rename this? */
   emit_cs_terminate();

   calculate_cfg();

   optimize();

   assign_curb_setup();

   fixup_3src_null_dest();

   emit_dummy_memory_fence_before_eot();

   /* Wa_14015360517 */
   emit_dummy_mov_instruction();

   /* May also set `failed`, hence the final !failed. */
   allocate_registers(allow_spilling);

   return !failed;
}
|
|
|
|
|
|
|
2021-10-29 12:27:45 -07:00
|
|
|
|
/**
 * Compile a task shader.
 *
 * Identical pipeline to run_cs() except that a URB fence is emitted before
 * thread termination (task/mesh outputs go through the URB).
 * Returns false if compilation failed.
 */
bool
fs_visitor::run_task(bool allow_spilling)
{
   assert(stage == MESA_SHADER_TASK);

   payload_ = new task_mesh_thread_payload(*this);

   /* NIR -> LIR translation; sets `failed` on error. */
   emit_nir_code();

   if (failed)
      return false;

   /* Fence outstanding URB writes before the thread ends. */
   emit_urb_fence();

   emit_cs_terminate();

   calculate_cfg();

   optimize();

   assign_curb_setup();

   fixup_3src_null_dest();

   emit_dummy_memory_fence_before_eot();

   /* Wa_14015360517 */
   emit_dummy_mov_instruction();

   /* May also set `failed`, hence the final !failed. */
   allocate_registers(allow_spilling);

   return !failed;
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Compile a mesh shader.
 *
 * Mirrors run_task() — same thread payload type and same pipeline,
 * including the URB fence before thread termination.
 * Returns false if compilation failed.
 */
bool
fs_visitor::run_mesh(bool allow_spilling)
{
   assert(stage == MESA_SHADER_MESH);

   payload_ = new task_mesh_thread_payload(*this);

   /* NIR -> LIR translation; sets `failed` on error. */
   emit_nir_code();

   if (failed)
      return false;

   /* Fence outstanding URB writes before the thread ends. */
   emit_urb_fence();

   emit_cs_terminate();

   calculate_cfg();

   optimize();

   assign_curb_setup();

   fixup_3src_null_dest();

   emit_dummy_memory_fence_before_eot();

   /* Wa_14015360517 */
   emit_dummy_mov_instruction();

   /* May also set `failed`, hence the final !failed. */
   allocate_registers(allow_spilling);

   return !failed;
}
|
|
|
|
|
|
|
2019-04-11 14:12:58 -05:00
|
|
|
|
static bool
|
2023-08-12 16:17:15 -04:00
|
|
|
|
is_used_in_not_interp_frag_coord(nir_def *def)
|
2019-04-11 14:12:58 -05:00
|
|
|
|
{
|
2023-04-06 13:19:31 -04:00
|
|
|
|
nir_foreach_use_including_if(src, def) {
|
2023-08-14 09:58:47 -04:00
|
|
|
|
if (nir_src_is_if(src))
|
2023-04-06 13:19:31 -04:00
|
|
|
|
return true;
|
|
|
|
|
|
|
2023-08-14 09:58:47 -04:00
|
|
|
|
if (nir_src_parent_instr(src)->type != nir_instr_type_intrinsic)
|
2019-04-11 14:12:58 -05:00
|
|
|
|
return true;
|
|
|
|
|
|
|
2023-08-14 09:58:47 -04:00
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(nir_src_parent_instr(src));
|
2019-07-18 09:59:44 -05:00
|
|
|
|
if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
|
2019-04-11 14:12:58 -05:00
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-10-08 16:01:44 -07:00
|
|
|
|
/**
 * Return a bitfield where bit n is set if barycentric interpolation mode n
 * (see enum brw_barycentric_mode) is needed by the fragment shader.
 *
 * We examine the load_barycentric intrinsics rather than looking at input
 * variables so that we catch interpolateAtCentroid() messages too, which
 * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
 */
static unsigned
brw_compute_barycentric_interp_modes(const struct intel_device_info *devinfo,
                                     const nir_shader *shader)
{
   unsigned barycentric_interp_modes = 0;

   nir_foreach_function_impl(impl, shader) {
      nir_foreach_block(block, impl) {
         nir_foreach_instr(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            /* Only the load_barycentric_* family is of interest. */
            switch (intrin->intrinsic) {
            case nir_intrinsic_load_barycentric_pixel:
            case nir_intrinsic_load_barycentric_centroid:
            case nir_intrinsic_load_barycentric_sample:
            case nir_intrinsic_load_barycentric_at_sample:
            case nir_intrinsic_load_barycentric_at_offset:
               break;
            default:
               continue;
            }

            /* Ignore WPOS; it doesn't require interpolation. */
            if (!is_used_in_not_interp_frag_coord(&intrin->def))
               continue;

            nir_intrinsic_op bary_op = intrin->intrinsic;
            enum brw_barycentric_mode bary =
               brw_barycentric_mode(intrin);

            barycentric_interp_modes |= 1 << bary;

            /* On platforms needing the unlit-centroid workaround, a
             * centroid mode additionally requests the corresponding
             * pixel mode (presumably so the workaround can substitute
             * it — confirm against the workaround implementation).
             */
            if (devinfo->needs_unlit_centroid_workaround &&
                bary_op == nir_intrinsic_load_barycentric_centroid)
               barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
         }
      }
   }

   return barycentric_interp_modes;
}
|
|
|
|
|
|
|
2016-04-05 18:19:34 -07:00
|
|
|
|
static void
|
|
|
|
|
|
brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
|
2016-07-07 00:47:18 -07:00
|
|
|
|
const nir_shader *shader)
|
2016-04-05 18:19:34 -07:00
|
|
|
|
{
|
|
|
|
|
|
prog_data->flat_inputs = 0;
|
|
|
|
|
|
|
2020-07-18 18:24:25 -05:00
|
|
|
|
nir_foreach_shader_in_variable(var, shader) {
|
2022-02-24 17:09:25 +01:00
|
|
|
|
/* flat shading */
|
|
|
|
|
|
if (var->data.interpolation != INTERP_MODE_FLAT)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
2022-02-24 17:06:33 +01:00
|
|
|
|
if (var->data.per_primitive)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
2018-07-31 05:31:47 -07:00
|
|
|
|
unsigned slots = glsl_count_attribute_slots(var->type, false);
|
|
|
|
|
|
for (unsigned s = 0; s < slots; s++) {
|
|
|
|
|
|
int input_index = prog_data->urb_setup[var->data.location + s];
|
2016-04-05 18:19:34 -07:00
|
|
|
|
|
2022-02-24 17:09:25 +01:00
|
|
|
|
if (input_index >= 0)
|
2018-07-31 05:31:47 -07:00
|
|
|
|
prog_data->flat_inputs |= 1 << input_index;
|
|
|
|
|
|
}
|
2016-04-05 18:19:34 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-10-08 16:01:44 -07:00
|
|
|
|
/* Translate the shader's depth-layout qualifier into a BRW_PSCDEPTH_*
 * mode.  Shaders that don't write FRAG_RESULT_DEPTH get PSCDEPTH_OFF.
 */
static uint8_t
computed_depth_mode(const nir_shader *shader)
{
   /* No depth write at all -> computed depth disabled. */
   if (!(shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)))
      return BRW_PSCDEPTH_OFF;

   switch (shader->info.fs.depth_layout) {
   case FRAG_DEPTH_LAYOUT_NONE:
   case FRAG_DEPTH_LAYOUT_ANY:
      return BRW_PSCDEPTH_ON;
   case FRAG_DEPTH_LAYOUT_GREATER:
      return BRW_PSCDEPTH_ON_GE;
   case FRAG_DEPTH_LAYOUT_LESS:
      return BRW_PSCDEPTH_ON_LE;
   case FRAG_DEPTH_LAYOUT_UNCHANGED:
      return BRW_PSCDEPTH_OFF;
   }

   return BRW_PSCDEPTH_OFF;
}
|
|
|
|
|
|
|
i965: Move load_interpolated_input/barycentric_* intrinsics to the top.
Currently, i965 interpolates all FS inputs at the top of the program.
This has advantages and disadvantages, but I'd like to keep that policy
while reworking this code. We can consider changing it independently.
The next patch will make the compiler generate PLN instructions "on the
fly", when it encounters an input load intrinsic, rather than doing it
for all inputs at the start of the program.
To emulate this behavior, we introduce an ugly pass to move all NIR
load_interpolated_input and payload-based (not interpolator message)
load_barycentric_* intrinsics to the shader's start block.
This helps avoid regressions in shader-db for cases such as:
if (...) {
...load some input...
} else {
...load that same input...
}
which CSE can't handle, because there's no dominance relationship
between the two loads. Because the start block dominates all others,
we can CSE all inputs and emit PLNs exactly once, as we did before.
Ideally, global value numbering would eliminate these redundant loads,
while not forcing them all the way to the start block. When that lands,
we should consider dropping this hacky pass.
Again, this pass currently does nothing, as i965 doesn't generate these
intrinsics yet. But it will shortly, and I figured I'd separate this
code as it's relatively self-contained.
v2: Dramatically simplify pass - instead of creating new instructions,
just remove/re-insert their list nodes (suggested by Jason Ekstrand).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisforbes@google.com> [v1]
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-17 18:44:58 -07:00
|
|
|
|
/**
 * Move load_interpolated_input with simple (payload-based) barycentric modes
 * to the top of the program so we don't emit multiple PLNs for the same input.
 *
 * This works around CSE not being able to handle non-dominating cases
 * such as:
 *
 * if (...) {
 *    interpolate input
 * } else {
 *    interpolate the same exact input
 * }
 *
 * This should be replaced by global value numbering someday.
 */
bool
brw_nir_move_interpolation_to_top(nir_shader *nir)
{
   bool progress = false;

   nir_foreach_function_impl(impl, nir) {
      nir_block *top = nir_start_block(impl);
      /* Insertion point: before the first instruction of the start block. */
      nir_cursor cursor = nir_before_instr(nir_block_first_instr(top));
      bool impl_progress = false;

      /* Walk every block after the start block; instructions already in
       * `top` never need moving.
       */
      for (nir_block *block = nir_block_cf_tree_next(top);
           block != NULL;
           block = nir_block_cf_tree_next(block)) {

         /* _safe variant: we re-link instructions while iterating. */
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
               continue;
            /* src[0] is the barycentric coordinate intrinsic feeding
             * this load.
             */
            nir_intrinsic_instr *bary_intrinsic =
               nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
            nir_intrinsic_op op = bary_intrinsic->intrinsic;

            /* Leave interpolateAtSample/Offset() where they are. */
            if (op == nir_intrinsic_load_barycentric_at_sample ||
                op == nir_intrinsic_load_barycentric_at_offset)
               continue;

            /* Move the barycentric source, the offset source (src[1]),
             * and the load itself — in that order, so defs still
             * dominate their uses after the move.
             */
            nir_instr *move[3] = {
               &bary_intrinsic->instr,
               intrin->src[1].ssa->parent_instr,
               instr
            };

            for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
               if (move[i]->block != top) {
                  nir_instr_move(cursor, move[i]);
                  impl_progress = true;
               }
            }
         }
      }

      progress = progress || impl_progress;

      /* Moving instructions does not add/remove blocks, so block index
       * and dominance survive; untouched impls keep all metadata.
       */
      nir_metadata_preserve(impl, impl_progress ? (nir_metadata_block_index |
                                                   nir_metadata_dominance)
                                                : nir_metadata_all);
   }

   return progress;
}
|
|
|
|
|
|
|
2021-05-17 15:25:26 -07:00
|
|
|
|
static void
|
2023-07-01 23:36:19 +03:00
|
|
|
|
brw_nir_populate_wm_prog_data(nir_shader *shader,
|
2021-04-05 13:19:39 -07:00
|
|
|
|
const struct intel_device_info *devinfo,
|
2019-07-18 09:23:47 -05:00
|
|
|
|
const struct brw_wm_prog_key *key,
|
2021-05-18 11:05:33 -07:00
|
|
|
|
struct brw_wm_prog_data *prog_data,
|
|
|
|
|
|
const struct brw_mue_map *mue_map)
|
2019-07-18 09:23:47 -05:00
|
|
|
|
{
|
|
|
|
|
|
/* key->alpha_test_func means simulating alpha testing via discards,
|
|
|
|
|
|
* so the shader definitely kills pixels.
|
|
|
|
|
|
*/
|
|
|
|
|
|
prog_data->uses_kill = shader->info.fs.uses_discard ||
|
2022-03-15 17:15:17 -07:00
|
|
|
|
shader->info.fs.uses_demote ||
|
|
|
|
|
|
key->emit_alpha_test;
|
2019-07-18 09:23:47 -05:00
|
|
|
|
prog_data->uses_omask = !key->ignore_sample_mask_out &&
|
|
|
|
|
|
(shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK));
|
2022-05-13 16:54:26 -05:00
|
|
|
|
prog_data->color_outputs_written = key->color_outputs_valid;
|
2019-07-18 09:23:47 -05:00
|
|
|
|
prog_data->computed_depth_mode = computed_depth_mode(shader);
|
|
|
|
|
|
prog_data->computed_stencil =
|
|
|
|
|
|
shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
|
|
|
|
|
|
|
2021-11-19 16:32:24 -06:00
|
|
|
|
prog_data->sample_shading =
|
|
|
|
|
|
shader->info.fs.uses_sample_shading ||
|
|
|
|
|
|
shader->info.outputs_read;
|
|
|
|
|
|
|
2021-11-19 13:44:35 -06:00
|
|
|
|
assert(key->multisample_fbo != BRW_NEVER ||
|
|
|
|
|
|
key->persample_interp == BRW_NEVER);
|
2021-11-19 16:34:19 -06:00
|
|
|
|
|
|
|
|
|
|
prog_data->persample_dispatch = key->persample_interp;
|
2021-11-19 13:44:35 -06:00
|
|
|
|
if (prog_data->sample_shading)
|
2021-11-19 16:32:24 -06:00
|
|
|
|
prog_data->persample_dispatch = BRW_ALWAYS;
|
2019-07-18 09:23:47 -05:00
|
|
|
|
|
2021-11-19 13:44:35 -06:00
|
|
|
|
/* We can only persample dispatch if we have a multisample FBO */
|
|
|
|
|
|
prog_data->persample_dispatch = MIN2(prog_data->persample_dispatch,
|
|
|
|
|
|
key->multisample_fbo);
|
|
|
|
|
|
|
2022-03-09 15:31:34 +02:00
|
|
|
|
/* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If
|
|
|
|
|
|
* persample_dispatch & multisample_fbo are not dynamic, Anv should be able
|
|
|
|
|
|
* to definitively tell whether alpha_to_coverage is on or off.
|
|
|
|
|
|
*/
|
|
|
|
|
|
prog_data->alpha_to_coverage = key->alpha_to_coverage;
|
|
|
|
|
|
assert(prog_data->alpha_to_coverage != BRW_SOMETIMES ||
|
|
|
|
|
|
prog_data->persample_dispatch == BRW_SOMETIMES);
|
|
|
|
|
|
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver >= 6) {
|
2019-10-24 17:31:18 -05:00
|
|
|
|
prog_data->uses_sample_mask =
|
2021-01-19 17:14:28 -08:00
|
|
|
|
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_MASK_IN);
|
2019-10-24 17:31:18 -05:00
|
|
|
|
|
|
|
|
|
|
/* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "MSDISPMODE_PERSAMPLE is required in order to select
|
|
|
|
|
|
* POSOFFSET_SAMPLE"
|
|
|
|
|
|
*
|
|
|
|
|
|
* So we can only really get sample positions if we are doing real
|
|
|
|
|
|
* per-sample dispatch. If we need gl_SamplePosition and we don't have
|
|
|
|
|
|
* persample dispatch, we hard-code it to 0.5.
|
|
|
|
|
|
*/
|
2023-05-10 08:16:59 +03:00
|
|
|
|
prog_data->uses_pos_offset =
|
|
|
|
|
|
prog_data->persample_dispatch != BRW_NEVER &&
|
|
|
|
|
|
(BITSET_TEST(shader->info.system_values_read,
|
|
|
|
|
|
SYSTEM_VALUE_SAMPLE_POS) ||
|
|
|
|
|
|
BITSET_TEST(shader->info.system_values_read,
|
|
|
|
|
|
SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
|
2019-10-24 17:31:18 -05:00
|
|
|
|
}
|
|
|
|
|
|
|
2019-07-18 09:23:47 -05:00
|
|
|
|
prog_data->has_render_target_reads = shader->info.outputs_read != 0ull;
|
|
|
|
|
|
|
|
|
|
|
|
prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
|
|
|
|
|
|
prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
|
|
|
|
|
|
prog_data->inner_coverage = shader->info.fs.inner_coverage;
|
|
|
|
|
|
|
|
|
|
|
|
prog_data->barycentric_interp_modes =
|
|
|
|
|
|
brw_compute_barycentric_interp_modes(devinfo, shader);
|
2022-03-16 19:26:54 -07:00
|
|
|
|
|
|
|
|
|
|
/* From the BDW PRM documentation for 3DSTATE_WM:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "MSDISPMODE_PERSAMPLE is required in order to select Perspective
|
|
|
|
|
|
* Sample or Non- perspective Sample barycentric coordinates."
|
|
|
|
|
|
*
|
|
|
|
|
|
* So cleanup any potentially set sample barycentric mode when not in per
|
|
|
|
|
|
* sample dispatch.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (prog_data->persample_dispatch == BRW_NEVER) {
|
|
|
|
|
|
prog_data->barycentric_interp_modes &=
|
|
|
|
|
|
~BITFIELD_BIT(BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2022-07-07 12:24:38 +03:00
|
|
|
|
prog_data->uses_nonperspective_interp_modes |=
|
|
|
|
|
|
(prog_data->barycentric_interp_modes &
|
|
|
|
|
|
BRW_BARYCENTRIC_NONPERSPECTIVE_BITS) != 0;
|
2019-07-18 09:23:47 -05:00
|
|
|
|
|
2023-05-30 18:10:53 +02:00
|
|
|
|
/* The current VK_EXT_graphics_pipeline_library specification requires
|
|
|
|
|
|
* coarse to specified at compile time. But per sample interpolation can be
|
|
|
|
|
|
* dynamic. So we should never be in a situation where coarse &
|
|
|
|
|
|
* persample_interp are both respectively true & BRW_ALWAYS.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Coarse will dynamically turned off when persample_interp is active.
|
|
|
|
|
|
*/
|
|
|
|
|
|
assert(!key->coarse_pixel || key->persample_interp != BRW_ALWAYS);
|
|
|
|
|
|
|
2021-11-19 16:32:24 -06:00
|
|
|
|
prog_data->coarse_pixel_dispatch =
|
|
|
|
|
|
brw_sometimes_invert(prog_data->persample_dispatch);
|
|
|
|
|
|
if (!key->coarse_pixel ||
|
|
|
|
|
|
prog_data->uses_omask ||
|
|
|
|
|
|
prog_data->sample_shading ||
|
|
|
|
|
|
prog_data->uses_sample_mask ||
|
|
|
|
|
|
(prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
|
|
|
|
|
|
prog_data->computed_stencil) {
|
|
|
|
|
|
prog_data->coarse_pixel_dispatch = BRW_NEVER;
|
|
|
|
|
|
}
|
2020-10-22 13:23:06 +03:00
|
|
|
|
|
2023-07-01 23:36:19 +03:00
|
|
|
|
/* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
|
|
|
|
|
|
* Message Descriptor :
|
|
|
|
|
|
*
|
|
|
|
|
|
* "Message Type. Specifies the type of message being sent when
|
|
|
|
|
|
* pixel-rate evaluation is requested :
|
|
|
|
|
|
*
|
|
|
|
|
|
* Format = U2
|
|
|
|
|
|
* 0: Per Message Offset (eval_snapped with immediate offset)
|
|
|
|
|
|
* 1: Sample Position Offset (eval_sindex)
|
|
|
|
|
|
* 2: Centroid Position Offset (eval_centroid)
|
|
|
|
|
|
* 3: Per Slot Offset (eval_snapped with register offset)
|
|
|
|
|
|
*
|
|
|
|
|
|
* Message Type. Specifies the type of message being sent when
|
|
|
|
|
|
* coarse-rate evaluation is requested :
|
|
|
|
|
|
*
|
|
|
|
|
|
* Format = U2
|
|
|
|
|
|
* 0: Coarse to Pixel Mapping Message (internal message)
|
|
|
|
|
|
* 1: Reserved
|
|
|
|
|
|
* 2: Coarse Centroid Position (eval_centroid)
|
|
|
|
|
|
* 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
|
|
|
|
|
|
*
|
|
|
|
|
|
* The Sample Position Offset is marked as reserved for coarse rate
|
|
|
|
|
|
* evaluation and leads to hangs if we try to use it. So disable coarse
|
|
|
|
|
|
* pixel shading if we have any intrinsic that will result in a pixel
|
|
|
|
|
|
* interpolater message at sample.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (brw_nir_pulls_at_sample(shader))
|
|
|
|
|
|
prog_data->coarse_pixel_dispatch = BRW_NEVER;
|
|
|
|
|
|
|
2019-06-07 18:17:36 -05:00
|
|
|
|
/* We choose to always enable VMask prior to XeHP, as it would cause
|
|
|
|
|
|
* us to lose out on the eliminate_find_live_channel() optimization.
|
|
|
|
|
|
*/
|
|
|
|
|
|
prog_data->uses_vmask = devinfo->verx10 < 125 ||
|
|
|
|
|
|
shader->info.fs.needs_quad_helper_invocations ||
|
2023-11-02 15:38:46 +01:00
|
|
|
|
shader->info.uses_wide_subgroup_intrinsics ||
|
2021-11-19 16:32:24 -06:00
|
|
|
|
prog_data->coarse_pixel_dispatch != BRW_NEVER;
|
2019-06-07 18:17:36 -05:00
|
|
|
|
|
2020-10-29 15:10:59 +02:00
|
|
|
|
prog_data->uses_src_w =
|
|
|
|
|
|
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD);
|
|
|
|
|
|
prog_data->uses_src_depth =
|
|
|
|
|
|
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
|
2021-11-19 16:32:24 -06:00
|
|
|
|
prog_data->coarse_pixel_dispatch != BRW_ALWAYS;
|
2020-10-29 15:10:59 +02:00
|
|
|
|
prog_data->uses_depth_w_coefficients =
|
|
|
|
|
|
BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD) &&
|
2021-11-19 16:32:24 -06:00
|
|
|
|
prog_data->coarse_pixel_dispatch != BRW_NEVER;
|
2020-10-29 15:10:59 +02:00
|
|
|
|
|
2021-05-18 11:05:33 -07:00
|
|
|
|
calculate_urb_setup(devinfo, key, prog_data, shader, mue_map);
|
2019-07-18 09:23:47 -05:00
|
|
|
|
brw_compute_flat_inputs(prog_data, shader);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2017-02-28 18:11:33 -08:00
|
|
|
|
/**
|
2021-03-29 15:40:04 -07:00
|
|
|
|
* Pre-gfx6, the register file of the EUs was shared between threads,
|
2017-02-28 18:11:33 -08:00
|
|
|
|
* and each thread used some subset allocated on a 16-register block
|
|
|
|
|
|
* granularity. The unit states wanted these block counts.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static inline int
|
|
|
|
|
|
brw_register_blocks(int reg_count)
|
|
|
|
|
|
{
|
|
|
|
|
|
return ALIGN(reg_count, 16) / 16 - 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2012-11-20 16:21:27 -08:00
|
|
|
|
const unsigned *
|
2021-03-22 22:13:09 -07:00
|
|
|
|
brw_compile_fs(const struct brw_compiler *compiler,
|
|
|
|
|
|
struct brw_compile_fs_params *params)
|
2011-03-11 19:19:01 -08:00
|
|
|
|
{
|
2023-07-14 02:10:20 +03:00
|
|
|
|
struct nir_shader *nir = params->base.nir;
|
2021-03-22 22:13:09 -07:00
|
|
|
|
const struct brw_wm_prog_key *key = params->key;
|
|
|
|
|
|
struct brw_wm_prog_data *prog_data = params->prog_data;
|
|
|
|
|
|
bool allow_spilling = params->allow_spilling;
|
2021-03-23 11:38:28 -07:00
|
|
|
|
const bool debug_enabled =
|
2023-07-14 02:10:20 +03:00
|
|
|
|
brw_should_print_shader(nir, params->base.debug_flag ?
|
|
|
|
|
|
params->base.debug_flag : DEBUG_WM);
|
2021-03-22 22:13:09 -07:00
|
|
|
|
|
2020-11-10 13:11:31 -09:00
|
|
|
|
prog_data->base.stage = MESA_SHADER_FRAGMENT;
|
2021-10-26 16:39:08 +03:00
|
|
|
|
prog_data->base.ray_queries = nir->info.ray_queries;
|
2022-02-28 15:13:07 +02:00
|
|
|
|
prog_data->base.total_scratch = 0;
|
2020-11-10 13:11:31 -09:00
|
|
|
|
|
2021-04-05 13:19:39 -07:00
|
|
|
|
const struct intel_device_info *devinfo = compiler->devinfo;
|
2021-03-29 14:41:58 -07:00
|
|
|
|
const unsigned max_subgroup_size = compiler->devinfo->ver >= 6 ? 32 : 16;
|
2019-02-22 10:48:39 -06:00
|
|
|
|
|
2023-05-17 17:09:06 +02:00
|
|
|
|
brw_nir_apply_key(nir, compiler, &key->base, max_subgroup_size);
|
2020-09-04 18:43:35 +02:00
|
|
|
|
brw_nir_lower_fs_inputs(nir, devinfo, key);
|
|
|
|
|
|
brw_nir_lower_fs_outputs(nir);
|
i965: Move Gen4-5 interpolation stuff to brw_wm_prog_data.
This fixes glxgears rendering, which had surprisingly been broken since
late October! Specifically, commit 91d61fbf7cb61a44adcaae51ee08ad0dd6b.
glxgears uses glShadeModel(GL_FLAT) when drawing the main portion of the
gears, then uses glShadeModel(GL_SMOOTH) for drawing the Gouraud-shaded
inner portion of the gears. This results in the same fragment program
having two different state-dependent interpolation maps: one where
gl_Color is flat, and another where it's smooth.
The problem is that there's only one gen4_fragment_program, so it can't
store both. Each FS compile would trash the last one. But, the FS
compiles are cached, so the first one would store FLAT, and the second
would see a matching program in the cache and never bother to compile
one with SMOOTH. (Clearing the program cache on every draw made it
render correctly.)
Instead, move it to brw_wm_prog_data, where we can keep a copy for
every specialization of the program. The only downside is bloating
the structure a bit, but we can tighten that up a bit if we need to.
This also lets us kill gen4_fragment_program entirely!
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
2017-01-13 14:29:52 -08:00
|
|
|
|
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver < 6)
|
2021-03-22 22:13:09 -07:00
|
|
|
|
brw_setup_vue_interpolation(params->vue_map, nir, prog_data);
|
i965: Move Gen4-5 interpolation stuff to brw_wm_prog_data.
This fixes glxgears rendering, which had surprisingly been broken since
late October! Specifically, commit 91d61fbf7cb61a44adcaae51ee08ad0dd6b.
glxgears uses glShadeModel(GL_FLAT) when drawing the main portion of the
gears, then uses glShadeModel(GL_SMOOTH) for drawing the Gouraud-shaded
inner portion of the gears. This results in the same fragment program
having two different state-dependent interpolation maps: one where
gl_Color is flat, and another where it's smooth.
The problem is that there's only one gen4_fragment_program, so it can't
store both. Each FS compile would trash the last one. But, the FS
compiles are cached, so the first one would store FLAT, and the second
would see a matching program in the cache and never bother to compile
one with SMOOTH. (Clearing the program cache on every draw made it
render correctly.)
Instead, move it to brw_wm_prog_data, where we can keep a copy for
every specialization of the program. The only downside is bloating
the structure a bit, but we can tighten that up a bit if we need to.
This also lets us kill gen4_fragment_program entirely!
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Timothy Arceri <timothy.arceri@collabora.com>
2017-01-13 14:29:52 -08:00
|
|
|
|
|
2019-09-27 16:28:11 -07:00
|
|
|
|
/* From the SKL PRM, Volume 7, "Alpha Coverage":
|
|
|
|
|
|
* "If Pixel Shader outputs oMask, AlphaToCoverage is disabled in
|
|
|
|
|
|
* hardware, regardless of the state setting for this feature."
|
|
|
|
|
|
*/
|
2022-03-09 15:31:34 +02:00
|
|
|
|
if (devinfo->ver > 6 && key->alpha_to_coverage != BRW_NEVER) {
|
2019-09-27 16:28:11 -07:00
|
|
|
|
/* Run constant fold optimization in order to get the correct source
|
|
|
|
|
|
* offset to determine render target 0 store instruction in
|
|
|
|
|
|
* emit_alpha_to_coverage pass.
|
|
|
|
|
|
*/
|
2023-01-11 11:15:27 -08:00
|
|
|
|
NIR_PASS(_, nir, nir_opt_constant_folding);
|
|
|
|
|
|
NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage, key, prog_data);
|
2019-09-27 16:28:11 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2023-01-11 11:15:27 -08:00
|
|
|
|
NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
|
2023-05-17 16:44:17 +02:00
|
|
|
|
brw_postprocess_nir(nir, compiler, debug_enabled,
|
2022-06-21 18:06:04 -07:00
|
|
|
|
key->base.robust_flags);
|
2015-11-11 10:04:43 -08:00
|
|
|
|
|
2021-05-18 11:05:33 -07:00
|
|
|
|
brw_nir_populate_wm_prog_data(nir, compiler->devinfo, key, prog_data,
|
|
|
|
|
|
params->mue_map);
|
2019-07-18 09:15:15 -05:00
|
|
|
|
|
2022-11-08 14:14:37 -08:00
|
|
|
|
std::unique_ptr<fs_visitor> v8, v16, v32;
|
2016-04-26 19:45:41 -07:00
|
|
|
|
cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;
|
2020-04-02 17:30:06 -07:00
|
|
|
|
float throughput = 0;
|
2020-05-19 14:37:44 -07:00
|
|
|
|
bool has_spilled = false;
|
2016-04-28 12:40:14 -07:00
|
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
|
v8 = std::make_unique<fs_visitor>(compiler, ¶ms->base, &key->base,
|
2022-11-08 14:14:37 -08:00
|
|
|
|
&prog_data->base, nir, 8,
|
2023-07-14 02:10:20 +03:00
|
|
|
|
params->base.stats != NULL,
|
2022-11-08 14:14:37 -08:00
|
|
|
|
debug_enabled);
|
2020-04-02 17:16:45 -07:00
|
|
|
|
if (!v8->run_fs(allow_spilling, false /* do_rep_send */)) {
|
2023-07-14 02:10:20 +03:00
|
|
|
|
params->base.error_str = ralloc_strdup(params->base.mem_ctx, v8->fail_msg);
|
2012-11-20 16:21:27 -08:00
|
|
|
|
return NULL;
|
2023-01-21 12:49:44 +01:00
|
|
|
|
} else if (INTEL_SIMD(FS, 8)) {
|
2020-04-02 17:16:45 -07:00
|
|
|
|
simd8_cfg = v8->cfg;
|
2022-07-19 16:44:26 -07:00
|
|
|
|
|
|
|
|
|
|
assert(v8->payload().num_regs % reg_unit(devinfo) == 0);
|
|
|
|
|
|
prog_data->base.dispatch_grf_start_reg = v8->payload().num_regs / reg_unit(devinfo);
|
|
|
|
|
|
|
2020-04-02 17:16:45 -07:00
|
|
|
|
prog_data->reg_blocks_8 = brw_register_blocks(v8->grf_used);
|
2020-04-02 17:30:06 -07:00
|
|
|
|
const performance &perf = v8->performance_analysis.require();
|
|
|
|
|
|
throughput = MAX2(throughput, perf.throughput);
|
2020-05-19 14:37:44 -07:00
|
|
|
|
has_spilled = v8->spilled_any_registers;
|
|
|
|
|
|
allow_spilling = false;
|
2016-04-28 12:40:14 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2021-03-29 15:40:04 -07:00
|
|
|
|
/* Limit dispatch width to simd8 with dual source blending on gfx8.
|
2020-05-20 01:02:52 +02:00
|
|
|
|
* See: https://gitlab.freedesktop.org/mesa/mesa/-/issues/1917
|
2019-12-02 16:54:30 +02:00
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (devinfo->ver == 8 && prog_data->dual_src_blend &&
|
2023-01-21 12:49:44 +01:00
|
|
|
|
INTEL_SIMD(FS, 8)) {
|
2021-03-22 22:13:09 -07:00
|
|
|
|
assert(!params->use_rep_send);
|
2021-03-29 15:40:04 -07:00
|
|
|
|
v8->limit_dispatch_width(8, "gfx8 workaround: "
|
2020-04-02 17:16:45 -07:00
|
|
|
|
"using SIMD8 when dual src blending.\n");
|
2019-12-02 16:54:30 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
2020-10-29 15:17:16 +02:00
|
|
|
|
if (key->coarse_pixel) {
|
|
|
|
|
|
if (prog_data->dual_src_blend) {
|
|
|
|
|
|
v8->limit_dispatch_width(8, "SIMD16 coarse pixel shading cannot"
|
|
|
|
|
|
" use SIMD8 messages.\n");
|
|
|
|
|
|
}
|
|
|
|
|
|
v8->limit_dispatch_width(16, "SIMD32 not supported with coarse"
|
|
|
|
|
|
" pixel shading.\n");
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2021-07-02 12:47:38 +03:00
|
|
|
|
if (nir->info.ray_queries > 0)
|
|
|
|
|
|
v8->limit_dispatch_width(16, "SIMD32 with ray queries.\n");
|
|
|
|
|
|
|
2020-05-19 14:37:44 -07:00
|
|
|
|
if (!has_spilled &&
|
|
|
|
|
|
v8->max_dispatch_width >= 16 &&
|
2023-01-21 12:49:44 +01:00
|
|
|
|
(INTEL_SIMD(FS, 16) || params->use_rep_send)) {
|
2016-04-28 12:40:14 -07:00
|
|
|
|
/* Try a SIMD16 compile */
|
2023-07-14 02:10:20 +03:00
|
|
|
|
v16 = std::make_unique<fs_visitor>(compiler, ¶ms->base, &key->base,
|
2022-11-08 14:14:37 -08:00
|
|
|
|
&prog_data->base, nir, 16,
|
2023-07-14 02:10:20 +03:00
|
|
|
|
params->base.stats != NULL,
|
2022-11-08 14:14:37 -08:00
|
|
|
|
debug_enabled);
|
|
|
|
|
|
v16->import_uniforms(v8.get());
|
2021-03-22 22:13:09 -07:00
|
|
|
|
if (!v16->run_fs(allow_spilling, params->use_rep_send)) {
|
2023-07-14 02:10:20 +03:00
|
|
|
|
brw_shader_perf_log(compiler, params->base.log_data,
|
2021-10-03 15:58:36 +03:00
|
|
|
|
"SIMD16 shader failed to compile: %s\n",
|
2021-07-29 14:27:57 -07:00
|
|
|
|
v16->fail_msg);
|
2016-04-28 12:40:14 -07:00
|
|
|
|
} else {
|
2020-04-02 17:16:45 -07:00
|
|
|
|
simd16_cfg = v16->cfg;
|
2022-07-19 16:44:26 -07:00
|
|
|
|
|
|
|
|
|
|
assert(v16->payload().num_regs % reg_unit(devinfo) == 0);
|
|
|
|
|
|
prog_data->dispatch_grf_start_reg_16 = v16->payload().num_regs / reg_unit(devinfo);
|
|
|
|
|
|
|
2020-04-02 17:16:45 -07:00
|
|
|
|
prog_data->reg_blocks_16 = brw_register_blocks(v16->grf_used);
|
2020-04-02 17:30:06 -07:00
|
|
|
|
const performance &perf = v16->performance_analysis.require();
|
|
|
|
|
|
throughput = MAX2(throughput, perf.throughput);
|
2020-05-19 14:37:44 -07:00
|
|
|
|
has_spilled = v16->spilled_any_registers;
|
|
|
|
|
|
allow_spilling = false;
|
2012-07-12 12:48:58 -07:00
|
|
|
|
}
|
2011-03-11 19:19:01 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2020-05-31 09:27:28 -07:00
|
|
|
|
const bool simd16_failed = v16 && !simd16_cfg;
|
|
|
|
|
|
|
2016-04-26 19:45:41 -07:00
|
|
|
|
/* Currently, the compiler only supports SIMD32 on SNB+ */
|
2020-05-19 14:37:44 -07:00
|
|
|
|
if (!has_spilled &&
|
2021-03-22 22:13:09 -07:00
|
|
|
|
v8->max_dispatch_width >= 32 && !params->use_rep_send &&
|
2021-03-29 14:41:58 -07:00
|
|
|
|
devinfo->ver >= 6 && !simd16_failed &&
|
2023-01-21 12:49:44 +01:00
|
|
|
|
INTEL_SIMD(FS, 32)) {
|
2016-04-26 19:45:41 -07:00
|
|
|
|
/* Try a SIMD32 compile */
|
2023-07-14 02:10:20 +03:00
|
|
|
|
v32 = std::make_unique<fs_visitor>(compiler, ¶ms->base, &key->base,
|
2022-11-08 14:14:37 -08:00
|
|
|
|
&prog_data->base, nir, 32,
|
2023-07-14 02:10:20 +03:00
|
|
|
|
params->base.stats != NULL,
|
2022-11-08 14:14:37 -08:00
|
|
|
|
debug_enabled);
|
|
|
|
|
|
v32->import_uniforms(v8.get());
|
2020-04-02 17:16:45 -07:00
|
|
|
|
if (!v32->run_fs(allow_spilling, false)) {
|
2023-07-14 02:10:20 +03:00
|
|
|
|
brw_shader_perf_log(compiler, params->base.log_data,
|
2021-10-03 15:58:36 +03:00
|
|
|
|
"SIMD32 shader failed to compile: %s\n",
|
2021-07-29 14:27:57 -07:00
|
|
|
|
v32->fail_msg);
|
2016-04-26 19:45:41 -07:00
|
|
|
|
} else {
|
2020-04-02 17:30:06 -07:00
|
|
|
|
const performance &perf = v32->performance_analysis.require();
|
|
|
|
|
|
|
2023-08-10 14:12:24 -04:00
|
|
|
|
if (!INTEL_DEBUG(DEBUG_DO32) && throughput >= perf.throughput) {
|
2023-07-14 02:10:20 +03:00
|
|
|
|
brw_shader_perf_log(compiler, params->base.log_data,
|
2021-07-29 14:27:57 -07:00
|
|
|
|
"SIMD32 shader inefficient\n");
|
2020-04-02 17:30:06 -07:00
|
|
|
|
} else {
|
|
|
|
|
|
simd32_cfg = v32->cfg;
|
2022-07-19 16:44:26 -07:00
|
|
|
|
|
|
|
|
|
|
assert(v32->payload().num_regs % reg_unit(devinfo) == 0);
|
|
|
|
|
|
prog_data->dispatch_grf_start_reg_32 = v32->payload().num_regs / reg_unit(devinfo);
|
|
|
|
|
|
|
2020-04-02 17:30:06 -07:00
|
|
|
|
prog_data->reg_blocks_32 = brw_register_blocks(v32->grf_used);
|
|
|
|
|
|
throughput = MAX2(throughput, perf.throughput);
|
|
|
|
|
|
}
|
2016-04-26 19:45:41 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-04-28 12:40:14 -07:00
|
|
|
|
/* When the caller requests a repclear shader, they want SIMD16-only */
|
2021-03-22 22:13:09 -07:00
|
|
|
|
if (params->use_rep_send)
|
2016-04-28 12:40:14 -07:00
|
|
|
|
simd8_cfg = NULL;
|
|
|
|
|
|
|
|
|
|
|
|
/* Prior to Iron Lake, the PS had a single shader offset with a jump table
|
|
|
|
|
|
* at the top to select the shader. We've never implemented that.
|
|
|
|
|
|
* Instead, we just give them exactly one shader and we pick the widest one
|
|
|
|
|
|
* available.
|
|
|
|
|
|
*/
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (compiler->devinfo->ver < 5) {
|
2016-04-26 19:45:41 -07:00
|
|
|
|
if (simd32_cfg || simd16_cfg)
|
|
|
|
|
|
simd8_cfg = NULL;
|
|
|
|
|
|
if (simd32_cfg)
|
|
|
|
|
|
simd16_cfg = NULL;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* If computed depth is enabled SNB only allows SIMD8. */
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (compiler->devinfo->ver == 6 &&
|
2016-04-26 19:45:41 -07:00
|
|
|
|
prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF)
|
|
|
|
|
|
assert(simd16_cfg == NULL && simd32_cfg == NULL);
|
2016-04-28 12:40:14 -07:00
|
|
|
|
|
2021-03-29 14:41:58 -07:00
|
|
|
|
if (compiler->devinfo->ver <= 5 && !simd8_cfg) {
|
2018-05-17 23:49:29 -07:00
|
|
|
|
/* Iron lake and earlier only have one Dispatch GRF start field. Make
|
|
|
|
|
|
* the data available in the base prog data struct for convenience.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (simd16_cfg) {
|
|
|
|
|
|
prog_data->base.dispatch_grf_start_reg =
|
|
|
|
|
|
prog_data->dispatch_grf_start_reg_16;
|
2016-04-26 19:45:41 -07:00
|
|
|
|
} else if (simd32_cfg) {
|
|
|
|
|
|
prog_data->base.dispatch_grf_start_reg =
|
|
|
|
|
|
prog_data->dispatch_grf_start_reg_32;
|
2018-05-17 23:49:29 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
|
fs_generator g(compiler, ¶ms->base, &prog_data->base,
|
2020-04-02 17:16:45 -07:00
|
|
|
|
v8->runtime_check_aads_emit, MESA_SHADER_FRAGMENT);
|
2014-10-27 19:40:47 -07:00
|
|
|
|
|
2021-03-23 11:12:40 -07:00
|
|
|
|
if (unlikely(debug_enabled)) {
|
2023-07-14 02:10:20 +03:00
|
|
|
|
g.enable_debug(ralloc_asprintf(params->base.mem_ctx,
|
|
|
|
|
|
"%s fragment shader %s",
|
2020-09-04 18:43:35 +02:00
|
|
|
|
nir->info.label ?
|
|
|
|
|
|
nir->info.label : "unnamed",
|
|
|
|
|
|
nir->info.name));
|
2014-10-27 19:40:47 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
|
struct brw_compile_stats *stats = params->base.stats;
|
2023-03-19 15:03:33 +02:00
|
|
|
|
uint32_t max_dispatch_width = 0;
|
2021-03-22 22:13:09 -07:00
|
|
|
|
|
2016-04-28 12:40:14 -07:00
|
|
|
|
if (simd8_cfg) {
|
2016-04-28 15:37:39 -07:00
|
|
|
|
prog_data->dispatch_8 = true;
|
2020-03-26 16:27:32 -07:00
|
|
|
|
g.generate_code(simd8_cfg, 8, v8->shader_stats,
|
|
|
|
|
|
v8->performance_analysis.require(), stats);
|
2019-04-23 23:19:56 -05:00
|
|
|
|
stats = stats ? stats + 1 : NULL;
|
2023-03-19 15:03:33 +02:00
|
|
|
|
max_dispatch_width = 8;
|
2018-05-17 23:49:29 -07:00
|
|
|
|
}
|
2016-04-28 15:37:39 -07:00
|
|
|
|
|
2018-05-17 23:49:29 -07:00
|
|
|
|
if (simd16_cfg) {
|
2016-04-28 15:37:39 -07:00
|
|
|
|
prog_data->dispatch_16 = true;
|
2020-03-26 16:27:32 -07:00
|
|
|
|
prog_data->prog_offset_16 = g.generate_code(
|
|
|
|
|
|
simd16_cfg, 16, v16->shader_stats,
|
|
|
|
|
|
v16->performance_analysis.require(), stats);
|
2019-04-23 23:19:56 -05:00
|
|
|
|
stats = stats ? stats + 1 : NULL;
|
2023-03-19 15:03:33 +02:00
|
|
|
|
max_dispatch_width = 16;
|
2016-04-28 12:40:14 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2016-04-26 19:45:41 -07:00
|
|
|
|
if (simd32_cfg) {
|
|
|
|
|
|
prog_data->dispatch_32 = true;
|
2020-03-26 16:27:32 -07:00
|
|
|
|
prog_data->prog_offset_32 = g.generate_code(
|
|
|
|
|
|
simd32_cfg, 32, v32->shader_stats,
|
|
|
|
|
|
v32->performance_analysis.require(), stats);
|
2019-04-23 23:19:56 -05:00
|
|
|
|
stats = stats ? stats + 1 : NULL;
|
2023-03-19 15:03:33 +02:00
|
|
|
|
max_dispatch_width = 32;
|
2016-04-26 19:45:41 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2023-07-14 02:10:20 +03:00
|
|
|
|
for (struct brw_compile_stats *s = params->base.stats; s != NULL && s != stats; s++)
|
2023-03-19 15:03:33 +02:00
|
|
|
|
s->max_dispatch_width = max_dispatch_width;
|
|
|
|
|
|
|
2020-09-04 18:43:35 +02:00
|
|
|
|
g.add_const_data(nir->constant_data, nir->constant_data_size);
|
2018-02-26 16:34:55 -08:00
|
|
|
|
return g.get_assembly();
|
2010-08-26 12:12:00 -07:00
|
|
|
|
}
|
2011-05-16 15:10:26 -07:00
|
|
|
|
|
2020-03-20 21:02:06 -07:00
|
|
|
|
unsigned
|
|
|
|
|
|
brw_cs_push_const_total_size(const struct brw_cs_prog_data *cs_prog_data,
|
|
|
|
|
|
unsigned threads)
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(cs_prog_data->push.per_thread.size % REG_SIZE == 0);
|
|
|
|
|
|
assert(cs_prog_data->push.cross_thread.size % REG_SIZE == 0);
|
|
|
|
|
|
return cs_prog_data->push.per_thread.size * threads +
|
|
|
|
|
|
cs_prog_data->push.cross_thread.size;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-22 21:46:28 -07:00
|
|
|
|
static void
|
|
|
|
|
|
fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
|
|
|
|
|
|
{
|
|
|
|
|
|
block->dwords = dwords;
|
|
|
|
|
|
block->regs = DIV_ROUND_UP(dwords, 8);
|
|
|
|
|
|
block->size = block->regs * 32;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
static void
|
2021-04-05 13:19:39 -07:00
|
|
|
|
cs_fill_push_const_info(const struct intel_device_info *devinfo,
|
2016-05-22 21:46:28 -07:00
|
|
|
|
struct brw_cs_prog_data *cs_prog_data)
|
|
|
|
|
|
{
|
2016-09-08 23:48:51 -07:00
|
|
|
|
const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
|
2022-08-30 00:47:32 -07:00
|
|
|
|
int subgroup_id_index = brw_get_subgroup_id_param_index(devinfo, prog_data);
|
2021-05-14 18:04:46 +02:00
|
|
|
|
bool cross_thread_supported = devinfo->verx10 >= 75;
|
2016-05-22 21:46:28 -07:00
|
|
|
|
|
|
|
|
|
|
/* The thread ID should be stored in the last param dword */
|
2017-08-24 11:40:31 -07:00
|
|
|
|
assert(subgroup_id_index == -1 ||
|
|
|
|
|
|
subgroup_id_index == (int)prog_data->nr_params - 1);
|
2016-05-22 21:46:28 -07:00
|
|
|
|
|
|
|
|
|
|
unsigned cross_thread_dwords, per_thread_dwords;
|
|
|
|
|
|
if (!cross_thread_supported) {
|
|
|
|
|
|
cross_thread_dwords = 0u;
|
2016-05-22 22:31:06 -07:00
|
|
|
|
per_thread_dwords = prog_data->nr_params;
|
2017-08-24 11:40:31 -07:00
|
|
|
|
} else if (subgroup_id_index >= 0) {
|
2016-05-22 21:46:28 -07:00
|
|
|
|
/* Fill all but the last register with cross-thread payload */
|
2017-08-24 11:40:31 -07:00
|
|
|
|
cross_thread_dwords = 8 * (subgroup_id_index / 8);
|
2016-05-22 21:46:28 -07:00
|
|
|
|
per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
|
|
|
|
|
|
assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* Fill all data using cross-thread payload */
|
|
|
|
|
|
cross_thread_dwords = prog_data->nr_params;
|
|
|
|
|
|
per_thread_dwords = 0u;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
|
|
|
|
|
|
fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
|
|
|
|
|
|
|
|
|
|
|
|
assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
|
|
|
|
|
|
cs_prog_data->push.per_thread.size == 0);
|
|
|
|
|
|
assert(cs_prog_data->push.cross_thread.dwords +
|
|
|
|
|
|
cs_prog_data->push.per_thread.dwords ==
|
|
|
|
|
|
prog_data->nr_params);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2020-04-28 13:09:27 -07:00
|
|
|
|
static bool
|
2020-07-29 17:50:03 -07:00
|
|
|
|
filter_simd(const nir_instr *instr, const void * /* options */)
|
2020-04-28 13:09:27 -07:00
|
|
|
|
{
|
|
|
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
switch (nir_instr_as_intrinsic(instr)->intrinsic) {
|
|
|
|
|
|
case nir_intrinsic_load_simd_width_intel:
|
|
|
|
|
|
case nir_intrinsic_load_subgroup_id:
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2023-08-12 16:17:15 -04:00
|
|
|
|
/* Lowering callback for brw_nir_lower_simd(): replaces SIMD-width queries
 * with the chosen dispatch width, and folds the subgroup ID to zero when the
 * whole workgroup fits in a single thread.  Returns NULL to leave the
 * instruction untouched.
 */
static nir_def *
lower_simd(nir_builder *b, nir_instr *instr, void *options)
{
   uintptr_t simd_width = (uintptr_t)options;

   switch (nir_instr_as_intrinsic(instr)->intrinsic) {
   case nir_intrinsic_load_simd_width_intel:
      return nir_imm_int(b, simd_width);

   case nir_intrinsic_load_subgroup_id:
      /* If the whole workgroup fits in one thread, we can lower subgroup_id
       * to a constant zero.
       */
      if (!b->shader->info.workgroup_size_variable) {
         const unsigned wg_size = b->shader->info.workgroup_size[0] *
                                  b->shader->info.workgroup_size[1] *
                                  b->shader->info.workgroup_size[2];
         if (wg_size <= simd_width)
            return nir_imm_int(b, 0);
      }
      return NULL;

   default:
      return NULL;
   }
}
|
|
|
|
|
|
|
2022-07-18 18:35:34 +02:00
|
|
|
|
bool
|
2020-04-28 13:09:27 -07:00
|
|
|
|
brw_nir_lower_simd(nir_shader *nir, unsigned dispatch_width)
|
|
|
|
|
|
{
|
2022-07-18 18:35:34 +02:00
|
|
|
|
return nir_shader_lower_instructions(nir, filter_simd, lower_simd,
|
2020-04-28 13:09:27 -07:00
|
|
|
|
(void *)(uintptr_t)dispatch_width);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-09-04 16:35:34 -07:00
|
|
|
|
/**
 * Compile a compute shader.
 *
 * Attempts to compile the shader at each SIMD width (8/16/32) that the
 * SIMD-selection heuristics request, then selects one (or, for variable
 * workgroup sizes, keeps every successful width) and emits native code for
 * all selected widths into a single assembly blob.
 *
 * Returns the generated assembly, or NULL on failure (in which case
 * params->base.error_str is set).
 */
const unsigned *
brw_compile_cs(const struct brw_compiler *compiler,
               struct brw_compile_cs_params *params)
{
   const nir_shader *nir = params->base.nir;
   const struct brw_cs_prog_key *key = params->key;
   struct brw_cs_prog_data *prog_data = params->prog_data;

   /* A caller-provided debug flag overrides the default DEBUG_CS gate. */
   const bool debug_enabled =
      brw_should_print_shader(nir, params->base.debug_flag ?
                                   params->base.debug_flag : DEBUG_CS);

   prog_data->base.stage = MESA_SHADER_COMPUTE;
   prog_data->base.total_shared = nir->info.shared_size;
   prog_data->base.ray_queries = nir->info.ray_queries;
   prog_data->base.total_scratch = 0;

   /* For variable workgroup sizes local_size is left unset; the actual size
    * is only known at dispatch time.
    */
   if (!nir->info.workgroup_size_variable) {
      prog_data->local_size[0] = nir->info.workgroup_size[0];
      prog_data->local_size[1] = nir->info.workgroup_size[1];
      prog_data->local_size[2] = nir->info.workgroup_size[2];
   }

   brw_simd_selection_state simd_state{
      .devinfo = compiler->devinfo,
      .prog_data = prog_data,
      .required_width = brw_required_dispatch_width(&nir->info),
   };

   /* One visitor per SIMD width: index 0 -> SIMD8, 1 -> SIMD16, 2 -> SIMD32. */
   std::unique_ptr<fs_visitor> v[3];

   for (unsigned simd = 0; simd < 3; simd++) {
      if (!brw_simd_should_compile(simd_state, simd))
         continue;

      const unsigned dispatch_width = 8u << simd;

      /* Clone so each width gets its own NIR to specialize and lower. */
      nir_shader *shader = nir_shader_clone(params->base.mem_ctx, nir);
      brw_nir_apply_key(shader, compiler, &key->base,
                        dispatch_width);

      NIR_PASS(_, shader, brw_nir_lower_simd, dispatch_width);

      /* Clean up after the local index and ID calculations. */
      NIR_PASS(_, shader, nir_opt_constant_folding);
      NIR_PASS(_, shader, nir_opt_dce);

      brw_postprocess_nir(shader, compiler, debug_enabled,
                          key->base.robust_flags);

      v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
                                             &key->base,
                                             &prog_data->base,
                                             shader, dispatch_width,
                                             params->base.stats != NULL,
                                             debug_enabled);

      /* Reuse the uniform layout of the first successfully compiled width so
       * all variants share push-constant setup.
       */
      const int first = brw_simd_first_compiled(simd_state);
      if (first >= 0)
         v[simd]->import_uniforms(v[first].get());

      /* Only allow spilling if no narrower width compiled yet (or we must
       * keep every width because the workgroup size is variable).
       */
      const bool allow_spilling = first < 0 || nir->info.workgroup_size_variable;

      if (v[simd]->run_cs(allow_spilling)) {
         cs_fill_push_const_info(compiler->devinfo, prog_data);

         brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
      } else {
         /* Record the failure message per width for the combined error below. */
         simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx, v[simd]->fail_msg);
         if (simd > 0) {
            brw_shader_perf_log(compiler, params->base.log_data,
                                "SIMD%u shader failed to compile: %s\n",
                                dispatch_width, v[simd]->fail_msg);
         }
      }
   }

   const int selected_simd = brw_simd_select(simd_state);
   if (selected_simd < 0) {
      /* No width compiled; report every per-width failure at once. */
      params->base.error_str =
         ralloc_asprintf(params->base.mem_ctx,
                         "Can't compile shader: "
                         "SIMD8 '%s', SIMD16 '%s' and SIMD32 '%s'.\n",
                         simd_state.error[0], simd_state.error[1],
                         simd_state.error[2]);
      return NULL;
   }

   assert(selected_simd < 3);
   fs_visitor *selected = v[selected_simd].get();

   /* With a fixed workgroup size only the chosen width is shipped; for a
    * variable size prog_mask keeps every width marked compiled above.
    */
   if (!nir->info.workgroup_size_variable)
      prog_data->prog_mask = 1 << selected_simd;

   fs_generator g(compiler, &params->base, &prog_data->base,
                  selected->runtime_check_aads_emit, MESA_SHADER_COMPUTE);
   if (unlikely(debug_enabled)) {
      char *name = ralloc_asprintf(params->base.mem_ctx,
                                   "%s compute shader %s",
                                   nir->info.label ?
                                   nir->info.label : "unnamed",
                                   nir->info.name);
      g.enable_debug(name);
   }

   /* Widest compiled variant; recomputed per-iteration below so each stats
    * entry records the max width still available at that point.
    */
   uint32_t max_dispatch_width = 8u << (util_last_bit(prog_data->prog_mask) - 1);

   struct brw_compile_stats *stats = params->base.stats;
   for (unsigned simd = 0; simd < 3; simd++) {
      if (prog_data->prog_mask & (1u << simd)) {
         assert(v[simd]);
         prog_data->prog_offset[simd] =
            g.generate_code(v[simd]->cfg, 8u << simd, v[simd]->shader_stats,
                            v[simd]->performance_analysis.require(), stats);
         if (stats)
            stats->max_dispatch_width = max_dispatch_width;
         /* Advance to the next caller-provided stats slot, if any. */
         stats = stats ? stats + 1 : NULL;
         max_dispatch_width = 8u << simd;
      }
   }

   g.add_const_data(nir->constant_data, nir->constant_data_size);

   return g.get_assembly();
}
|
2016-09-15 21:43:18 -07:00
|
|
|
|
|
2021-04-28 10:54:53 -07:00
|
|
|
|
struct brw_cs_dispatch_info
|
|
|
|
|
|
brw_cs_get_dispatch_info(const struct intel_device_info *devinfo,
|
|
|
|
|
|
const struct brw_cs_prog_data *prog_data,
|
|
|
|
|
|
const unsigned *override_local_size)
|
|
|
|
|
|
{
|
|
|
|
|
|
struct brw_cs_dispatch_info info = {};
|
|
|
|
|
|
|
|
|
|
|
|
const unsigned *sizes =
|
|
|
|
|
|
override_local_size ? override_local_size :
|
|
|
|
|
|
prog_data->local_size;
|
|
|
|
|
|
|
2022-11-08 01:24:36 -08:00
|
|
|
|
const int simd = brw_simd_select_for_workgroup_size(devinfo, prog_data, sizes);
|
2021-10-11 07:49:40 -07:00
|
|
|
|
assert(simd >= 0 && simd < 3);
|
|
|
|
|
|
|
2021-04-28 10:54:53 -07:00
|
|
|
|
info.group_size = sizes[0] * sizes[1] * sizes[2];
|
2021-10-11 07:49:40 -07:00
|
|
|
|
info.simd_size = 8u << simd;
|
2021-04-28 10:54:53 -07:00
|
|
|
|
info.threads = DIV_ROUND_UP(info.group_size, info.simd_size);
|
|
|
|
|
|
|
|
|
|
|
|
const uint32_t remainder = info.group_size & (info.simd_size - 1);
|
|
|
|
|
|
if (remainder > 0)
|
|
|
|
|
|
info.right_mask = ~0u >> (32 - remainder);
|
|
|
|
|
|
else
|
|
|
|
|
|
info.right_mask = ~0u >> (32 - info.simd_size);
|
|
|
|
|
|
|
|
|
|
|
|
return info;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2020-09-04 12:40:06 -05:00
|
|
|
|
/**
 * Compile a single bindless (ray-tracing) shader at SIMD8 and/or SIMD16,
 * select one variant, and append its code to the shared generator \p g.
 *
 * Returns the selected dispatch width, or 0 on failure (in which case
 * params->base.error_str is set).  When \p prog_offset is non-NULL the
 * code's byte offset within the generator's assembly is stored there;
 * otherwise the code is required to land at offset 0.
 */
static uint8_t
compile_single_bs(const struct brw_compiler *compiler,
                  struct brw_compile_bs_params *params,
                  const struct brw_bs_prog_key *key,
                  struct brw_bs_prog_data *prog_data,
                  nir_shader *shader,
                  fs_generator *g,
                  struct brw_compile_stats *stats,
                  int *prog_offset)
{
   const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT);

   prog_data->base.stage = shader->info.stage;
   /* Accumulate the max across every shader compiled into this prog_data
    * (the main shader and all resume shaders share one stack size).
    */
   prog_data->max_stack_size = MAX2(prog_data->max_stack_size,
                                    shader->scratch_size);

   const unsigned max_dispatch_width = 16;
   brw_nir_apply_key(shader, compiler, &key->base, max_dispatch_width);
   brw_postprocess_nir(shader, compiler, debug_enabled,
                       key->base.robust_flags);

   brw_simd_selection_state simd_state{
      .devinfo = compiler->devinfo,
      .prog_data = prog_data,

      /* Since divergence is a lot more likely in RT than compute, it makes
       * sense to limit ourselves to SIMD8 for now.
       */
      .required_width = 8,
   };

   /* One visitor per SIMD width: index 0 -> SIMD8, 1 -> SIMD16. */
   std::unique_ptr<fs_visitor> v[2];

   for (unsigned simd = 0; simd < ARRAY_SIZE(v); simd++) {
      if (!brw_simd_should_compile(simd_state, simd))
         continue;

      const unsigned dispatch_width = 8u << simd;

      v[simd] = std::make_unique<fs_visitor>(compiler, &params->base,
                                             &key->base,
                                             &prog_data->base, shader,
                                             dispatch_width,
                                             stats != NULL,
                                             debug_enabled);

      /* Only the first width to compile may spill; later widths are a
       * bonus and not worth the spill cost.
       */
      const bool allow_spilling = !brw_simd_any_compiled(simd_state);
      if (v[simd]->run_bs(allow_spilling)) {
         brw_simd_mark_compiled(simd_state, simd, v[simd]->spilled_any_registers);
      } else {
         /* Record the failure per width for the combined error below. */
         simd_state.error[simd] = ralloc_strdup(params->base.mem_ctx,
                                                v[simd]->fail_msg);
         if (simd > 0) {
            brw_shader_perf_log(compiler, params->base.log_data,
                                "SIMD%u shader failed to compile: %s",
                                dispatch_width, v[simd]->fail_msg);
         }
      }
   }

   const int selected_simd = brw_simd_select(simd_state);
   if (selected_simd < 0) {
      params->base.error_str =
         ralloc_asprintf(params->base.mem_ctx,
                         "Can't compile shader: "
                         "SIMD8 '%s' and SIMD16 '%s'.\n",
                         simd_state.error[0], simd_state.error[1]);
      return 0;
   }

   assert(selected_simd < int(ARRAY_SIZE(v)));
   fs_visitor *selected = v[selected_simd].get();
   assert(selected);

   const unsigned dispatch_width = selected->dispatch_width;

   int offset = g->generate_code(selected->cfg, dispatch_width, selected->shader_stats,
                                 selected->performance_analysis.require(), stats);
   if (prog_offset)
      *prog_offset = offset;
   else
      assert(offset == 0);

   return dispatch_width;
}
|
|
|
|
|
|
|
|
|
|
|
|
uint64_t
|
|
|
|
|
|
brw_bsr(const struct intel_device_info *devinfo,
|
|
|
|
|
|
uint32_t offset, uint8_t simd_size, uint8_t local_arg_offset)
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(offset % 64 == 0);
|
|
|
|
|
|
assert(simd_size == 8 || simd_size == 16);
|
|
|
|
|
|
assert(local_arg_offset % 8 == 0);
|
|
|
|
|
|
|
|
|
|
|
|
return offset |
|
2022-01-31 12:43:04 +00:00
|
|
|
|
SET_BITS(simd_size == 8, 4, 4) |
|
2020-09-04 12:40:06 -05:00
|
|
|
|
SET_BITS(local_arg_offset / 8, 2, 0);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
 * Compile a bindless (ray-tracing) shader plus its resume shaders into one
 * assembly blob, building the resume shader binding table (SBT) of packed
 * BSR handles as it goes.
 *
 * Returns the generated assembly, or NULL if any shader failed to compile.
 */
const unsigned *
brw_compile_bs(const struct brw_compiler *compiler,
               struct brw_compile_bs_params *params)
{
   nir_shader *shader = params->base.nir;
   struct brw_bs_prog_data *prog_data = params->prog_data;
   unsigned num_resume_shaders = params->num_resume_shaders;
   nir_shader **resume_shaders = params->resume_shaders;
   const bool debug_enabled = brw_should_print_shader(shader, DEBUG_RT);

   prog_data->base.stage = shader->info.stage;
   prog_data->base.ray_queries = shader->info.ray_queries;
   prog_data->base.total_scratch = 0;

   /* compile_single_bs() raises this via MAX2 for each shader it compiles. */
   prog_data->max_stack_size = 0;
   prog_data->num_resume_shaders = num_resume_shaders;

   /* One generator shared by the main shader and every resume shader, so
    * they all land in a single assembly blob.
    */
   fs_generator g(compiler, &params->base, &prog_data->base,
                  false, shader->info.stage);
   if (unlikely(debug_enabled)) {
      char *name = ralloc_asprintf(params->base.mem_ctx,
                                   "%s %s shader %s",
                                   shader->info.label ?
                                   shader->info.label : "unnamed",
                                   gl_shader_stage_name(shader->info.stage),
                                   shader->info.name);
      g.enable_debug(name);
   }

   /* The main shader must be first (prog_offset == 0 is asserted by
    * compile_single_bs when no offset pointer is passed).
    */
   prog_data->simd_size =
      compile_single_bs(compiler, params, params->key, prog_data,
                        shader, &g, params->base.stats, NULL);
   if (prog_data->simd_size == 0)
      return NULL;

   uint64_t *resume_sbt = ralloc_array(params->base.mem_ctx,
                                       uint64_t, num_resume_shaders);
   for (unsigned i = 0; i < num_resume_shaders; i++) {
      if (INTEL_DEBUG(DEBUG_RT)) {
         char *name = ralloc_asprintf(params->base.mem_ctx,
                                      "%s %s resume(%u) shader %s",
                                      shader->info.label ?
                                      shader->info.label : "unnamed",
                                      gl_shader_stage_name(shader->info.stage),
                                      i, shader->info.name);
         g.enable_debug(name);
      }

      /* TODO: Figure out shader stats etc. for resume shaders */
      int offset = 0;
      uint8_t simd_size =
         compile_single_bs(compiler, params, params->key,
                           prog_data, resume_shaders[i], &g, NULL, &offset);
      if (simd_size == 0)
         return NULL;

      /* Resume shaders follow the main shader, so their offset is non-zero;
       * pack offset + SIMD width into the SBT entry.
       */
      assert(offset > 0);
      resume_sbt[i] = brw_bsr(compiler->devinfo, offset, simd_size, 0);
   }

   /* We only have one constant data so we want to make sure they're all the
    * same.
    */
   for (unsigned i = 0; i < num_resume_shaders; i++) {
      assert(resume_shaders[i]->constant_data_size ==
             shader->constant_data_size);
      assert(memcmp(resume_shaders[i]->constant_data,
                    shader->constant_data,
                    shader->constant_data_size) == 0);
   }

   g.add_const_data(shader->constant_data, shader->constant_data_size);
   g.add_resume_sbt(num_resume_shaders, resume_sbt);

   return g.get_assembly();
}
|
|
|
|
|
|
|
2016-09-15 21:43:18 -07:00
|
|
|
|
/**
 * Test the dispatch mask packing assumptions of
 * brw_stage_has_packed_dispatch(). Call this from e.g. the top of
 * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
 * executed with an unexpected dispatch mask.
 */
static UNUSED void
brw_fs_test_dispatch_packing(const fs_builder &bld)
{
   const gl_shader_stage stage = bld.shader->stage;
   /* Fragment shaders may use the vector mask (VMask) instead of the
    * dispatch mask, depending on the prog_data flag.
    */
   const bool uses_vmask =
      stage == MESA_SHADER_FRAGMENT &&
      brw_wm_prog_data(bld.shader->stage_prog_data)->uses_vmask;

   if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
                                     bld.shader->stage_prog_data)) {
      const fs_builder ubld = bld.exec_all().group(1, 0);
      const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
      const fs_reg mask = uses_vmask ? brw_vmask_reg() : brw_dmask_reg();

      /* tmp = mask & (mask + 1): zero iff the mask is of the packed form
       * 2^n-1 (all set bits contiguous from bit 0).
       */
      ubld.ADD(tmp, mask, brw_imm_ud(1));
      ubld.AND(tmp, mask, tmp);

      /* This will loop forever if the dispatch mask doesn't have the expected
       * form '2^n-1', in which case tmp will be non-zero.
       */
      bld.emit(BRW_OPCODE_DO);
      bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
      set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
   }
}
|
2020-01-14 12:22:47 -08:00
|
|
|
|
|
|
|
|
|
|
unsigned
|
|
|
|
|
|
fs_visitor::workgroup_size() const
|
|
|
|
|
|
{
|
2021-05-18 10:01:49 -07:00
|
|
|
|
assert(gl_shader_stage_uses_workgroup(stage));
|
2020-01-14 12:22:47 -08:00
|
|
|
|
const struct brw_cs_prog_data *cs = brw_cs_prog_data(prog_data);
|
|
|
|
|
|
return cs->local_size[0] * cs->local_size[1] * cs->local_size[2];
|
|
|
|
|
|
}
|
2023-06-20 14:42:02 -07:00
|
|
|
|
|
|
|
|
|
|
bool brw_should_print_shader(const nir_shader *shader, uint64_t debug_flag)
|
|
|
|
|
|
{
|
|
|
|
|
|
return INTEL_DEBUG(debug_flag) && (!shader->info.internal || NIR_DEBUG(PRINT_INTERNAL));
|
2023-06-21 07:51:00 -07:00
|
|
|
|
}
|
2023-11-21 07:49:02 -08:00
|
|
|
|
|
|
|
|
|
|
namespace brw {
|
|
|
|
|
|
fs_reg
|
|
|
|
|
|
fetch_payload_reg(const brw::fs_builder &bld, uint8_t regs[2],
|
|
|
|
|
|
brw_reg_type type)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (!regs[0])
|
|
|
|
|
|
return fs_reg();
|
|
|
|
|
|
|
|
|
|
|
|
if (bld.dispatch_width() > 16) {
|
|
|
|
|
|
const fs_reg tmp = bld.vgrf(type);
|
|
|
|
|
|
const brw::fs_builder hbld = bld.exec_all().group(16, 0);
|
|
|
|
|
|
const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
|
|
|
|
|
|
fs_reg components[2];
|
|
|
|
|
|
assert(m <= 2);
|
|
|
|
|
|
|
|
|
|
|
|
for (unsigned g = 0; g < m; g++)
|
|
|
|
|
|
components[g] = retype(brw_vec8_grf(regs[g], 0), type);
|
|
|
|
|
|
|
|
|
|
|
|
hbld.LOAD_PAYLOAD(tmp, components, m, 0);
|
|
|
|
|
|
|
|
|
|
|
|
return tmp;
|
|
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
return fs_reg(retype(brw_vec8_grf(regs[0], 0), type));
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
fs_reg
|
|
|
|
|
|
fetch_barycentric_reg(const brw::fs_builder &bld, uint8_t regs[2])
|
|
|
|
|
|
{
|
|
|
|
|
|
if (!regs[0])
|
|
|
|
|
|
return fs_reg();
|
|
|
|
|
|
|
|
|
|
|
|
const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
|
|
|
|
|
|
const brw::fs_builder hbld = bld.exec_all().group(8, 0);
|
|
|
|
|
|
const unsigned m = bld.dispatch_width() / hbld.dispatch_width();
|
|
|
|
|
|
fs_reg *const components = new fs_reg[2 * m];
|
|
|
|
|
|
|
|
|
|
|
|
for (unsigned c = 0; c < 2; c++) {
|
|
|
|
|
|
for (unsigned g = 0; g < m; g++)
|
|
|
|
|
|
components[c * m + g] = offset(brw_vec8_grf(regs[g / 2], 0),
|
|
|
|
|
|
hbld, c + 2 * (g % 2));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
hbld.LOAD_PAYLOAD(tmp, components, 2 * m, 0);
|
|
|
|
|
|
|
|
|
|
|
|
delete[] components;
|
|
|
|
|
|
return tmp;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
|
check_dynamic_msaa_flag(const fs_builder &bld,
|
|
|
|
|
|
const struct brw_wm_prog_data *wm_prog_data,
|
|
|
|
|
|
enum brw_wm_msaa_flags flag)
|
|
|
|
|
|
{
|
|
|
|
|
|
fs_inst *inst = bld.AND(bld.null_reg_ud(),
|
|
|
|
|
|
dynamic_msaa_flags(wm_prog_data),
|
|
|
|
|
|
brw_imm_ud(flag));
|
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NZ;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|