/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

#include "main/macros.h"
#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_nir.h"
#include "brw_vec4_gs_visitor.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "dev/gen_debug.h"
#include "compiler/glsl_types.h"
#include "compiler/nir/nir_builder.h"
#include "program/prog_parameter.h"
#include "util/u_math.h"

using namespace brw;

static unsigned get_lowered_simd_width(const struct gen_device_info *devinfo,
                                       const fs_inst *inst);

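/**
 * Initialization shared by every fs_inst constructor: copy the source
 * array, record the opcode, destination and execution size, and derive
 * size_written from the destination region.
 */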
void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
              const fs_reg *src, unsigned sources)
{
   memset((void*)this, 0, sizeof(*this));

   this->src = new fs_reg[MAX2(sources, 3)];
   for (unsigned i = 0; i < sources; i++)
      this->src[i] = src[i];

   this->opcode = opcode;
   this->dst = dst;
   this->sources = sources;
   this->exec_size = exec_size;
   this->base_mrf = -1;

   assert(dst.file != IMM && dst.file != UNIFORM);

   assert(this->exec_size != 0);

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case VGRF:
   case ARF:
   case FIXED_GRF:
   case MRF:
   case ATTR:
      this->size_written = dst.component_size(exec_size);
      break;
   case BAD_FILE:
      this->size_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   }

   this->writes_accumulator = false;
}

fs_inst::fs_inst()
{
   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   init(opcode, exec_size, reg_undef, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
{
   init(opcode, exec_size, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, exec_size, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, exec_size, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, exec_size, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, exec_width, dst, src, sources);
}

fs_inst::fs_inst(const fs_inst &that)
{
   memcpy((void*)this, &that, sizeof(that));

   this->src = new fs_reg[MAX2(that.sources, 3)];

   for (unsigned i = 0; i < that.sources; i++)
      this->src[i] = that.src[i];
}

fs_inst::~fs_inst()
{
   delete[] this->src;
}

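/**
 * Resize the source array to \p num_sources, keeping whichever of the
 * existing sources still fit in the new array.
 */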
void
fs_inst::resize_sources(uint8_t num_sources)
{
   if (this->sources != num_sources) {
      fs_reg *src = new fs_reg[MAX2(num_sources, 3)];

      for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
         src[i] = this->src[i];

      delete[] this->src;
      this->src = src;
      this->sources = num_sources;
   }
}

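/**
 * Emit a pull constant load whose offset is not known to be uniform across
 * channels (e.g. a dynamically indexed UBO access), writing the result to
 * \p dst.
 */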
void
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
                                       const fs_reg &dst,
                                       const fs_reg &surf_index,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset)
{
   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable offset
    * and a portion done using fs_reg::offset, which means that if you have
    * GLSL using something like "uniform vec4 a[20]; gl_FragColor = a[i]",
    * we'll temporarily generate 4 vec4 loads from offset i * 4, and CSE can
    * later notice that those loads are all the same and eliminate the
    * redundant ones.
    */
   fs_reg vec4_offset = vgrf(glsl_type::uint_type);
   bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));

   /* The pull load message will load a vec4 (16 bytes). If we are loading
    * a double this means we are only loading 2 elements worth of data.
    * We also want to use a 32-bit data type for the dst of the load operation
    * so other parts of the driver don't get confused about the size of the
    * result.
    */
   fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
   fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
                            vec4_result, surf_index, vec4_offset);
   inst->size_written = 4 * vec4_result.component_size(inst->exec_size);

   shuffle_from_32bit_read(bld, dst, vec4_result,
                           (const_offset & 0xf) / type_sz(dst.type), 1);
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
void
fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
{
   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   const fs_builder ubld = bld.annotate("send dependency resolve")
                              .half(0);

   ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
}

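/**
 * Returns true if this instruction is (or will be generated as) a SEND
 * message whose payload lives in the GRF rather than in MRFs.
 */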
bool
fs_inst::is_send_from_grf() const
{
   switch (opcode) {
   case SHADER_OPCODE_SEND:
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
   case SHADER_OPCODE_URB_READ_SIMD8:
   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
   case SHADER_OPCODE_INTERLOCK:
   case SHADER_OPCODE_MEMORY_FENCE:
   case SHADER_OPCODE_BARRIER:
      return true;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return src[1].file == VGRF;
   case FS_OPCODE_FB_WRITE:
   case FS_OPCODE_FB_READ:
      return src[0].file == VGRF;
   default:
      if (is_tex())
         return src[0].file == VGRF;

      return false;
   }
}

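/**
 * Returns true if source \p arg carries control information for the
 * instruction (e.g. a surface index, sampler handle or SEND descriptor)
 * rather than per-channel data.
 */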
bool
fs_inst::is_control_source(unsigned arg) const
{
   switch (opcode) {
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
      return arg == 0;

   case SHADER_OPCODE_BROADCAST:
   case SHADER_OPCODE_SHUFFLE:
   case SHADER_OPCODE_QUAD_SWIZZLE:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_GET_BUFFER_SIZE:
      return arg == 1;

   case SHADER_OPCODE_MOV_INDIRECT:
   case SHADER_OPCODE_CLUSTER_BROADCAST:
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_LZ:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_CMS_W:
   case SHADER_OPCODE_TXF_UMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXL_LZ:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_SAMPLEINFO:
      return arg == 1 || arg == 2;

   case SHADER_OPCODE_SEND:
      return arg == 0 || arg == 1;

   default:
      return false;
   }
}

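/**
 * Returns true if source \p arg is the message payload (or part of it) of a
 * send-like instruction, as opposed to a descriptor or other scalar operand.
 */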
bool
fs_inst::is_payload(unsigned arg) const
{
   switch (opcode) {
   case FS_OPCODE_FB_WRITE:
   case FS_OPCODE_FB_READ:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
   case SHADER_OPCODE_URB_READ_SIMD8:
   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
   case VEC4_OPCODE_UNTYPED_ATOMIC:
   case VEC4_OPCODE_UNTYPED_SURFACE_READ:
   case VEC4_OPCODE_UNTYPED_SURFACE_WRITE:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case SHADER_OPCODE_INTERLOCK:
   case SHADER_OPCODE_MEMORY_FENCE:
   case SHADER_OPCODE_BARRIER:
      return arg == 0;

   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
      return arg == 1;

   case SHADER_OPCODE_SEND:
      return arg == 2 || arg == 3;

   default:
      if (is_tex())
         return arg == 0;
      else
         return false;
   }
}

/**
 * Returns true if this instruction's sources and destinations cannot
 * safely be the same register.
 *
 * In most cases, a register can be written over safely by the same
 * instruction that is its last use.  For a single instruction, the
 * sources are dereferenced before writing of the destination starts
 * (naturally).
 *
 * However, there are a few cases where this can be problematic:
 *
 * - Virtual opcodes that translate to multiple instructions in the
 *   code generator: if src == dst and one instruction writes the
 *   destination before a later instruction reads the source, then
 *   src will have been clobbered.
 *
 * - SIMD16 compressed instructions with certain regioning (see below).
 *
 * The register allocator uses this information to set up conflicts between
 * GRF sources and the destination.
 */
bool
fs_inst::has_source_and_destination_hazard() const
{
   switch (opcode) {
   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
      /* Multiple partial writes to the destination */
      return true;
   case SHADER_OPCODE_SHUFFLE:
      /* This instruction returns an arbitrary channel from the source and
       * gets split into smaller instructions in the generator.  It's possible
       * that one of the instructions will read from a channel corresponding
       * to an earlier instruction.
       */
   case SHADER_OPCODE_SEL_EXEC:
      /* This is implemented as
       *
       * mov(16)      g4<1>D      0D            { align1 WE_all 1H };
       * mov(16)      g4<1>D      g5<8,8,1>D    { align1 1H }
       *
       * Because the source is only read in the second instruction, the first
       * may stomp all over it.
       */
      return true;
   case SHADER_OPCODE_QUAD_SWIZZLE:
      switch (src[1].ud) {
      case BRW_SWIZZLE_XXXX:
      case BRW_SWIZZLE_YYYY:
      case BRW_SWIZZLE_ZZZZ:
      case BRW_SWIZZLE_WWWW:
      case BRW_SWIZZLE_XXZZ:
      case BRW_SWIZZLE_YYWW:
      case BRW_SWIZZLE_XYXY:
      case BRW_SWIZZLE_ZWZW:
         /* These can be implemented as a single Align1 region on all
          * platforms, so there's never a hazard between source and
          * destination.  C.f. fs_generator::generate_quad_swizzle().
          */
         return false;
      default:
         return !is_uniform(src[0]);
      }
   default:
      /* The SIMD16 compressed instruction
       *
       * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
       *
       * is actually decoded in hardware as:
       *
       * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
       * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
       *
       * Which is safe.  However, if we have uniform accesses
       * happening, we get into trouble:
       *
       * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
       * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
       *
       * Now our destination for the first instruction overwrote the
       * second instruction's src0, and we get garbage for those 8
       * pixels.  There's a similar issue for the pre-gen6
       * pixel_x/pixel_y, which are registers of 16-bit values and thus
       * would get stomped by the first decode as well.
       */
      if (exec_size == 16) {
         for (int i = 0; i < sources; i++) {
            if (src[i].file == VGRF && (src[i].stride == 0 ||
                                        src[i].type == BRW_REGISTER_TYPE_UW ||
                                        src[i].type == BRW_REGISTER_TYPE_W ||
                                        src[i].type == BRW_REGISTER_TYPE_UB ||
                                        src[i].type == BRW_REGISTER_TYPE_B)) {
               return true;
            }
         }
      }
      return false;
   }
}

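/**
 * Returns true if this is a LOAD_PAYLOAD whose sources are consecutive
 * slices of a single VGRF, making the instruction equivalent to a straight
 * copy of that register.
 */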
bool
fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
{
   if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
      return false;

   fs_reg reg = this->src[0];
   if (reg.file != VGRF || reg.offset != 0 || reg.stride != 1)
      return false;

   if (grf_alloc.sizes[reg.nr] * REG_SIZE != this->size_written)
      return false;

   for (int i = 0; i < this->sources; i++) {
      reg.type = this->src[i].type;
      if (!this->src[i].equals(reg))
         return false;

      if (i < this->header_size) {
         reg.offset += REG_SIZE;
      } else {
         reg = horiz_offset(reg, this->exec_size);
      }
   }

   return true;
}

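/**
 * Returns true if the hardware supports source modifiers (negation and
 * absolute value) on the sources of this instruction.
 */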
bool
fs_inst::can_do_source_mods(const struct gen_device_info *devinfo) const
{
   if (devinfo->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   /* From GEN:BUG:1604601757:
    *
    * "When multiplying a DW and any lower precision integer, source modifier
    *  is not supported."
    */
   if (devinfo->gen >= 12 && (opcode == BRW_OPCODE_MUL ||
                              opcode == BRW_OPCODE_MAD)) {
      const brw_reg_type exec_type = get_exec_type(this);
      const unsigned min_type_sz = opcode == BRW_OPCODE_MAD ?
         MIN2(type_sz(src[1].type), type_sz(src[2].type)) :
         MIN2(type_sz(src[0].type), type_sz(src[1].type));

      if (brw_reg_type_is_integer(exec_type) &&
          type_sz(exec_type) >= 4 &&
          type_sz(exec_type) != min_type_sz)
         return false;
   }

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

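/**
 * Returns true if a conditional modifier can safely be added to this
 * instruction, including the accumulator restriction on negated unsigned
 * sources described below.
 */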
bool
fs_inst::can_do_cmod()
{
   if (!backend_instruction::can_do_cmod())
      return false;

   /* The accumulator result appears to get used for the conditional modifier
    * generation.  When negating a UD value, there is a 33rd bit generated for
    * the sign in the accumulator value, so now you can't check, for example,
    * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
    */
   for (unsigned i = 0; i < sources; i++) {
      if (type_is_unsigned_int(src[i].type) && src[i].negate)
         return false;
   }

   return true;
}

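/**
 * Returns true if the destination and source types of this instruction can
 * be changed without altering its behaviour, which is only the case for raw
 * MOVs and predicated SELs with no saturate, type conversion or source
 * modifiers.
 */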
bool
fs_inst::can_change_types() const
{
   return dst.type == src[0].type &&
          !src[0].abs && !src[0].negate && !saturate &&
          (opcode == BRW_OPCODE_MOV ||
           (opcode == BRW_OPCODE_SEL &&
            dst.type == src[1].type &&
            predicate != BRW_PREDICATE_NONE &&
            !src[1].abs && !src[1].negate));
}

void
fs_reg::init()
{
   memset((void*)this, 0, sizeof(*this));
   type = BRW_REGISTER_TYPE_UD;
   stride = 1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

fs_reg::fs_reg(struct ::brw_reg reg) :
   backend_reg(reg)
{
   this->offset = 0;
   this->stride = 1;
   if (this->file == IMM &&
       (this->type != BRW_REGISTER_TYPE_V &&
        this->type != BRW_REGISTER_TYPE_UV &&
        this->type != BRW_REGISTER_TYPE_VF)) {
      this->stride = 0;
   }
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (this->backend_reg::equals(r) &&
           stride == r.stride);
}

bool
fs_reg::negative_equals(const fs_reg &r) const
{
   return (this->backend_reg::negative_equals(r) &&
           stride == r.stride);
}

bool
fs_reg::is_contiguous() const
{
   return stride == 1;
}

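/**
 * Returns the size in bytes occupied by a single component of this register
 * across \p width channels.
 */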
unsigned
fs_reg::component_size(unsigned width) const
{
   const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
                            hstride == 0 ? 0 :
                            1 << (hstride - 1));
   return MAX2(width * stride, 1) * type_sz(type);
}

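/**
 * Returns the number of 32-bit scalar slots a value of the given GLSL type
 * occupies in the scalar backend; bindless sampler and image handles count
 * as two slots.
 */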
extern "C" int
|
2019-03-29 12:39:48 +11:00
|
|
|
|
type_size_scalar(const struct glsl_type *type, bool bindless)
|
2010-08-15 18:58:58 -07:00
|
|
|
|
{
|
|
|
|
|
|
unsigned int size, i;
|
|
|
|
|
|
|
|
|
|
|
|
switch (type->base_type) {
|
|
|
|
|
|
case GLSL_TYPE_UINT:
|
|
|
|
|
|
case GLSL_TYPE_INT:
|
|
|
|
|
|
case GLSL_TYPE_FLOAT:
|
|
|
|
|
|
case GLSL_TYPE_BOOL:
|
2010-08-27 10:44:04 -07:00
|
|
|
|
return type->components();
|
2017-07-01 08:06:45 +02:00
|
|
|
|
case GLSL_TYPE_UINT16:
|
|
|
|
|
|
case GLSL_TYPE_INT16:
|
|
|
|
|
|
case GLSL_TYPE_FLOAT16:
|
|
|
|
|
|
return DIV_ROUND_UP(type->components(), 2);
|
2018-01-25 07:59:06 -05:00
|
|
|
|
case GLSL_TYPE_UINT8:
|
|
|
|
|
|
case GLSL_TYPE_INT8:
|
|
|
|
|
|
return DIV_ROUND_UP(type->components(), 4);
|
2015-07-28 15:51:40 -07:00
|
|
|
|
case GLSL_TYPE_DOUBLE:
|
2016-06-20 14:54:28 +10:00
|
|
|
|
case GLSL_TYPE_UINT64:
|
|
|
|
|
|
case GLSL_TYPE_INT64:
|
2015-07-28 15:51:40 -07:00
|
|
|
|
return type->components() * 2;
|
2010-08-15 18:58:58 -07:00
|
|
|
|
case GLSL_TYPE_ARRAY:
|
2019-03-29 12:39:48 +11:00
|
|
|
|
return type_size_scalar(type->fields.array, bindless) * type->length;
|
2010-08-15 18:58:58 -07:00
|
|
|
|
case GLSL_TYPE_STRUCT:
|
2019-03-21 15:02:38 -07:00
|
|
|
|
case GLSL_TYPE_INTERFACE:
|
2010-08-15 18:58:58 -07:00
|
|
|
|
size = 0;
|
|
|
|
|
|
for (i = 0; i < type->length; i++) {
|
2019-03-29 12:39:48 +11:00
|
|
|
|
size += type_size_scalar(type->fields.structure[i].type, bindless);
|
2010-08-15 18:58:58 -07:00
|
|
|
|
}
|
|
|
|
|
|
return size;
|
|
|
|
|
|
case GLSL_TYPE_SAMPLER:
|
2018-08-16 16:23:10 -05:00
|
|
|
|
case GLSL_TYPE_IMAGE:
|
2019-03-29 12:39:48 +11:00
|
|
|
|
if (bindless)
|
|
|
|
|
|
return type->components() * 2;
|
|
|
|
|
|
case GLSL_TYPE_ATOMIC_UINT:
|
2018-08-16 16:23:10 -05:00
|
|
|
|
/* Samplers, atomics, and images take up no register space, since
|
|
|
|
|
|
* they're baked in at link time.
|
|
|
|
|
|
*/
|
2013-10-20 12:35:47 -07:00
|
|
|
|
return 0;
|
2015-07-21 14:22:11 +10:00
|
|
|
|
case GLSL_TYPE_SUBROUTINE:
|
|
|
|
|
|
return 1;
|
2012-12-11 12:56:03 -08:00
|
|
|
|
case GLSL_TYPE_VOID:
|
|
|
|
|
|
case GLSL_TYPE_ERROR:
|
2016-02-09 18:17:06 -08:00
|
|
|
|
case GLSL_TYPE_FUNCTION:
|
2014-06-29 14:54:01 -07:00
|
|
|
|
unreachable("not reached");
|
2010-08-15 18:58:58 -07:00
|
|
|
|
}
|
2012-12-11 12:56:03 -08:00
|
|
|
|
|
|
|
|
|
|
return 0;
|
2010-08-15 18:58:58 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2015-02-26 23:51:27 -08:00
|
|
|
|
/**
|
|
|
|
|
|
* Create a MOV to read the timestamp register.
|
|
|
|
|
|
*
|
|
|
|
|
|
* The caller is responsible for emitting the MOV. The return value is
|
|
|
|
|
|
* the destination of the MOV, with extra parameters set.
|
|
|
|
|
|
*/
|
2012-11-27 14:10:52 -08:00
|
|
|
|
fs_reg
|
2015-06-03 20:43:09 +03:00
|
|
|
|
fs_visitor::get_timestamp(const fs_builder &bld)
|
2012-11-27 14:10:52 -08:00
|
|
|
|
{
|
2015-04-15 18:00:05 -07:00
|
|
|
|
assert(devinfo->gen >= 7);
|
2012-11-27 14:10:52 -08:00
|
|
|
|
|
2014-10-31 11:12:30 -07:00
|
|
|
|
fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
|
2012-11-27 14:10:52 -08:00
|
|
|
|
BRW_ARF_TIMESTAMP,
|
|
|
|
|
|
0),
|
|
|
|
|
|
BRW_REGISTER_TYPE_UD));
|
|
|
|
|
|
|
2015-10-26 17:09:25 -07:00
|
|
|
|
fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
|
2012-11-27 14:10:52 -08:00
|
|
|
|
|
2014-10-31 11:12:30 -07:00
|
|
|
|
/* We want to read the 3 fields we care about even if it's not enabled in
|
|
|
|
|
|
* the dispatch.
|
2012-11-27 14:10:52 -08:00
|
|
|
|
*/
|
2015-06-18 12:24:27 -07:00
|
|
|
|
bld.group(4, 0).exec_all().MOV(dst, ts);
|
2012-11-27 14:10:52 -08:00
|
|
|
|
|
|
|
|
|
|
return dst;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
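/**
 * Capture the timestamp at the start of the shader for the shader time
 * profiling mechanism; emit_shader_time_end() reads it again and
 * accumulates the difference into the shader time buffer.
 */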
void
fs_visitor::emit_shader_time_begin()
{
   /* We want only the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    */
   shader_start_time = component(
      get_timestamp(bld.annotate("shader time start")), 0);
}

void
fs_visitor::emit_shader_time_end()
{
   /* Insert our code just before the final SEND with EOT. */
   exec_node *end = this->instructions.get_tail();
   assert(end && ((fs_inst *) end)->eot);
   const fs_builder ibld = bld.annotate("shader time end")
                              .exec_all().at(NULL, end);
   const fs_reg timestamp = get_timestamp(ibld);

   /* We only use the low 32 bits of the timestamp - see
    * emit_shader_time_begin().
    *
    * We could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   const fs_reg shader_end_time = component(timestamp, 0);

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   const fs_reg reset = component(timestamp, 2);
   set_condmod(BRW_CONDITIONAL_Z,
               ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
   ibld.IF(BRW_PREDICATE_NORMAL);

   fs_reg start = shader_start_time;
   start.negate = true;
   const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1),
                                        BRW_REGISTER_TYPE_UD),
                                 0);
   const fs_builder cbld = ibld.group(1, 0);
   cbld.group(1, 0).ADD(diff, start, shader_end_time);

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   cbld.ADD(diff, diff, brw_imm_ud(-2u));
   SHADER_TIME_ADD(cbld, 0, diff);
   SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
   ibld.emit(BRW_OPCODE_ELSE);
   SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
   ibld.emit(BRW_OPCODE_ENDIF);
}

void
fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
                            int shader_time_subindex,
                            fs_reg value)
{
   int index = shader_time_index * 3 + shader_time_subindex;
   struct brw_reg offset = brw_imm_d(index * BRW_SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = vgrf(glsl_type::uvec2_type);
   else
      payload = vgrf(glsl_type::uint_type);

   bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
}

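/**
 * va_list counterpart of fail(): marks the compile as failed, records a
 * readable message in fail_msg, and prints it when debugging is enabled.
 */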
void
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
|
fs_visitor::vfail(const char *format, va_list va)
|
2011-03-13 13:43:05 -07:00
|
|
|
|
{
|
2011-05-16 15:10:26 -07:00
|
|
|
|
char *msg;
|
2011-03-13 13:43:05 -07:00
|
|
|
|
|
2011-05-16 15:10:26 -07:00
|
|
|
|
if (failed)
|
|
|
|
|
|
return;
|
2011-03-13 13:43:05 -07:00
|
|
|
|
|
2011-05-16 15:10:26 -07:00
|
|
|
|
failed = true;
|
|
|
|
|
|
|
|
|
|
|
|
msg = ralloc_vasprintf(mem_ctx, format, va);
|
2015-02-18 17:43:07 -08:00
|
|
|
|
msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
|
2011-05-16 15:10:26 -07:00
|
|
|
|
|
|
|
|
|
|
this->fail_msg = msg;
|
|
|
|
|
|
|
2015-02-18 17:43:07 -08:00
|
|
|
|
if (debug_enabled) {
|
2011-06-10 15:26:02 -03:00
|
|
|
|
fprintf(stderr, "%s", msg);
|
2011-03-13 13:43:05 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::fail(const char *format, ...)
|
|
|
|
|
|
{
|
|
|
|
|
|
va_list va;
|
|
|
|
|
|
|
|
|
|
|
|
va_start(va, format);
|
|
|
|
|
|
vfail(format, va);
|
|
|
|
|
|
va_end(va);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
2016-05-18 14:39:52 -07:00
|
|
|
|
* Mark this program as impossible to compile with dispatch width greater
|
|
|
|
|
|
* than n.
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
|
*
|
|
|
|
|
|
* During the SIMD8 compile (which happens first), we can detect and flag
|
2016-05-18 14:39:52 -07:00
|
|
|
|
* things that are unsupported in SIMD16+ mode, so the compiler can skip the
|
|
|
|
|
|
* SIMD16+ compile altogether.
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
|
*
|
2016-05-18 14:39:52 -07:00
|
|
|
|
* During a compile of dispatch width greater than n (if one happens anyway),
|
|
|
|
|
|
* this just calls fail().
|
i965: Accurately bail on SIMD16 compiles.
Ideally, we'd like to never even attempt the SIMD16 compile if we could
know ahead of time that it won't succeed---it's purely a waste of time.
This is especially important for state-based recompiles, which happen at
draw time.
The fragment shader compiler has a number of checks like:
if (dispatch_width == 16)
fail("...some reason...");
This patch introduces a new no16() function which replaces the above
pattern. In the SIMD8 compile, it sets a "SIMD16 will never work" flag.
Then, brw_wm_fs_emit can check that flag, skip the SIMD16 compile, and
issue a helpful performance warning if INTEL_DEBUG=perf is set. (In
SIMD16 mode, no16() calls fail(), for safety's sake.)
The great part is that this is not a heuristic---if the flag is set, we
know with 100% certainty that the SIMD16 compile would fail. (It might
fail anyway if we run out of registers, but it's always worth trying.)
v2: Fix missing va_end in early-return case (caught by Ilia Mirkin).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisf@ijw.co.nz> [v1]
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com> [v1]
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-07 00:49:45 -08:00
|
|
|
|
*/
|
|
|
|
|
|
void
|
2016-05-18 14:39:52 -07:00
|
|
|
|
fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
{
   if (dispatch_width > n) {
      fail("%s", msg);
   } else {
      max_dispatch_width = n;
      compiler->shader_perf_log(log_data,
                                "Shader dispatch width limited to SIMD%d: %s",
                                n, msg);
   }
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write() const
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           (this->exec_size * type_sz(this->dst.type)) < 32 ||
           !this->dst.is_contiguous() ||
           this->dst.offset % REG_SIZE != 0);
}

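/**
 * Returns the number of logical vector components read from source i of
 * this instruction.
 */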
unsigned
fs_inst::components_read(unsigned i) const
{
   /* Return zero if the source is not present. */
   if (src[i].file == BAD_FILE)
      return 0;

   switch (opcode) {
   case FS_OPCODE_LINTERP:
      if (i == 0)
         return 2;
      else
         return 1;

   case FS_OPCODE_PIXEL_X:
   case FS_OPCODE_PIXEL_Y:
      assert(i == 0);
      return 2;

   case FS_OPCODE_FB_WRITE_LOGICAL:
      assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
      /* First/second FB write color. */
      if (i < 2)
         return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
      else
         return 1;

   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXD_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
   case SHADER_OPCODE_TXF_UMS_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
      assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
             src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
      /* Texture coordinates. */
      if (i == TEX_LOGICAL_SRC_COORDINATE)
         return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
      /* Texture derivatives. */
      else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
               opcode == SHADER_OPCODE_TXD_LOGICAL)
         return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
      /* Texture offset. */
      else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
         return 2;
      /* MCS */
      else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
         return 2;
      else
         return 1;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM);
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source (ignored for reads). */
      else if (i == SURFACE_LOGICAL_SRC_DATA)
         return 0;
      else
         return 1;

   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source. */
      else if (i == SURFACE_LOGICAL_SRC_DATA)
         return src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
      else
         return 1;

   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      assert(src[2].file == IMM);
      return 1;

   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      assert(src[2].file == IMM);
      return i == 1 ? src[2].ud : 1;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
      assert(src[2].file == IMM);
      if (i == 1) {
         /* Data source */
         const unsigned op = src[2].ud;
         switch (op) {
         case BRW_AOP_INC:
         case BRW_AOP_DEC:
         case BRW_AOP_PREDEC:
            return 0;
         case BRW_AOP_CMPWR:
            return 2;
         default:
            return 1;
         }
      } else {
         return 1;
      }

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      assert(src[2].file == IMM);
      if (i == 1) {
         /* Data source */
         const unsigned op = src[2].ud;
         return op == BRW_AOP_FCMPWR ? 2 : 1;
      } else {
         return 1;
      }

   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      /* Scattered logical opcodes use the following params:
       * src[0] Surface coordinates
       * src[1] Surface operation source (ignored for reads)
       * src[2] Surface
       * src[3] IMM with always 1 dimension.
       * src[4] IMM with arg bitsize for scattered read/write 8, 16, 32
       */
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      return i == SURFACE_LOGICAL_SRC_DATA ? 0 : 1;

   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      return 1;

   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source. */
      else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_CMPWR)
         return 2;
      else if (i == SURFACE_LOGICAL_SRC_DATA &&
               (op == BRW_AOP_INC || op == BRW_AOP_DEC || op == BRW_AOP_PREDEC))
         return 0;
      else
         return 1;
   }
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return (i == 0 ? 2 : 1);

   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL: {
      assert(src[SURFACE_LOGICAL_SRC_IMM_DIMS].file == IMM &&
             src[SURFACE_LOGICAL_SRC_IMM_ARG].file == IMM);
      const unsigned op = src[SURFACE_LOGICAL_SRC_IMM_ARG].ud;
      /* Surface coordinates. */
      if (i == SURFACE_LOGICAL_SRC_ADDRESS)
         return src[SURFACE_LOGICAL_SRC_IMM_DIMS].ud;
      /* Surface operation source. */
      else if (i == SURFACE_LOGICAL_SRC_DATA && op == BRW_AOP_FCMPWR)
         return 2;
      else
         return 1;
   }

   default:
      return 1;
   }
}

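/**
 * Returns the number of bytes read from source arg of this instruction.
 */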
unsigned
fs_inst::size_read(int arg) const
{
   switch (opcode) {
   case SHADER_OPCODE_SEND:
      if (arg == 2) {
         return mlen * REG_SIZE;
      } else if (arg == 3) {
         return ex_mlen * REG_SIZE;
      }
      break;

   case FS_OPCODE_FB_WRITE:
   case FS_OPCODE_REP_FB_WRITE:
      if (arg == 0) {
         if (base_mrf >= 0)
            return src[0].file == BAD_FILE ? 0 : 2 * REG_SIZE;
         else
            return mlen * REG_SIZE;
      }
      break;

   case FS_OPCODE_FB_READ:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
   case SHADER_OPCODE_URB_READ_SIMD8:
   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
      if (arg == 0)
         return mlen * REG_SIZE;
      break;

   case FS_OPCODE_SET_SAMPLE_ID:
      if (arg == 1)
         return 1;
      break;

   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
      /* The payload is actually stored in src1 */
      if (arg == 1)
         return mlen * REG_SIZE;
      break;

   case FS_OPCODE_LINTERP:
      if (arg == 1)
         return 16;
      break;

   case SHADER_OPCODE_LOAD_PAYLOAD:
      if (arg < this->header_size)
         return REG_SIZE;
      break;

   case CS_OPCODE_CS_TERMINATE:
   case SHADER_OPCODE_BARRIER:
      return REG_SIZE;

   case SHADER_OPCODE_MOV_INDIRECT:
      if (arg == 0) {
         assert(src[2].file == IMM);
         return src[2].ud;
      }
      break;

   default:
      if (is_tex() && arg == 0 && src[0].file == VGRF)
         return mlen * REG_SIZE;
      break;
   }

   switch (src[arg].file) {
   case UNIFORM:
   case IMM:
      return components_read(arg) * type_sz(src[arg].type);
   case BAD_FILE:
   case ARF:
   case FIXED_GRF:
   case VGRF:
   case ATTR:
      return components_read(arg) * src[arg].component_size(exec_size);
   case MRF:
      unreachable("MRF registers are not allowed as sources");
   }
   return 0;
}

namespace {
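   /* Return the number of channels combined together by the given
    * predication mode (e.g. ANY4H and ALL4H evaluate groups of four
    * channels at a time).
    */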
   unsigned
   predicate_width(brw_predicate predicate)
   {
      switch (predicate) {
      case BRW_PREDICATE_NONE: return 1;
      case BRW_PREDICATE_NORMAL: return 1;
      case BRW_PREDICATE_ALIGN1_ANY2H: return 2;
      case BRW_PREDICATE_ALIGN1_ALL2H: return 2;
      case BRW_PREDICATE_ALIGN1_ANY4H: return 4;
      case BRW_PREDICATE_ALIGN1_ALL4H: return 4;
      case BRW_PREDICATE_ALIGN1_ANY8H: return 8;
      case BRW_PREDICATE_ALIGN1_ALL8H: return 8;
      case BRW_PREDICATE_ALIGN1_ANY16H: return 16;
      case BRW_PREDICATE_ALIGN1_ALL16H: return 16;
      case BRW_PREDICATE_ALIGN1_ANY32H: return 32;
      case BRW_PREDICATE_ALIGN1_ALL32H: return 32;
      default: unreachable("Unsupported predicate");
      }
   }

   /* Return the subset of flag registers that an instruction could
    * potentially read or write based on the execution controls and flag
    * subregister number of the instruction.
    */
   unsigned
   flag_mask(const fs_inst *inst, unsigned width)
   {
      assert(util_is_power_of_two_nonzero(width));
      const unsigned start = (inst->flag_subreg * 16 + inst->group) &
                             ~(width - 1);
      const unsigned end = start + ALIGN(inst->exec_size, width);
      return ((1 << DIV_ROUND_UP(end, 8)) - 1) & ~((1 << (start / 8)) - 1);
   }

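   /* Return a mask with the n least significant bits set, or all bits set
    * if n is at least the bit width of unsigned.
    */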
   unsigned
   bit_mask(unsigned n)
   {
      return (n >= CHAR_BIT * sizeof(bit_mask(n)) ? ~0u : (1u << n) - 1);
   }

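   /* Return the subset of flag registers covered by an sz-byte access to
    * the ARF register r, or zero if r doesn't live in the ARF file.
    */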
   unsigned
   flag_mask(const fs_reg &r, unsigned sz)
   {
      if (r.file == ARF) {
         const unsigned start = (r.nr - BRW_ARF_FLAG) * 4 + r.subnr;
         const unsigned end = start + sz;
         return bit_mask(end) & ~bit_mask(start);
      } else {
         return 0;
      }
   }
}

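/**
 * Returns a bitmask of the flag register bytes this instruction could read,
 * based on its predication mode and any flag registers used as sources.
 */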
unsigned
fs_inst::flags_read(const gen_device_info *devinfo) const
{
   if (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
       predicate == BRW_PREDICATE_ALIGN1_ALLV) {
      /* The vertical predication modes combine corresponding bits from
       * f0.0 and f1.0 on Gen7+, and f0.0 and f0.1 on older hardware.
       */
      const unsigned shift = devinfo->gen >= 7 ? 4 : 2;
      return flag_mask(this, 1) << shift | flag_mask(this, 1);
   } else if (predicate) {
      return flag_mask(this, predicate_width(predicate));
   } else {
      unsigned mask = 0;
      for (int i = 0; i < sources; i++) {
         mask |= flag_mask(src[i], size_read(i));
      }
      return mask;
   }
}

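/**
 * Returns a bitmask of the flag register bytes this instruction could write,
 * either through a conditional modifier, an opcode with an implicit flag
 * update, or a flag register destination.
 */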
unsigned
fs_inst::flags_written() const
{
   if ((conditional_mod && (opcode != BRW_OPCODE_SEL &&
                            opcode != BRW_OPCODE_CSEL &&
                            opcode != BRW_OPCODE_IF &&
                            opcode != BRW_OPCODE_WHILE)) ||
       opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL ||
       opcode == FS_OPCODE_FB_WRITE) {
      return flag_mask(this, 1);
   } else {
      return flag_mask(dst, size_written);
   }
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(const fs_inst *inst) const
{
   if (inst->mlen == 0)
      return 0;

   if (inst->base_mrf == -1)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
   case SHADER_OPCODE_SAMPLEINFO:
      return 1;
   case FS_OPCODE_FB_WRITE:
   case FS_OPCODE_REP_FB_WRITE:
      return inst->src[0].file == BAD_FILE ? 0 : 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
      return inst->mlen;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return inst->mlen;
   default:
      unreachable("not reached");
   }
}

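/**
 * Allocates a new virtual GRF sized to hold a value of the given GLSL type
 * at the current dispatch width.
 */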
fs_reg
fs_visitor::vgrf(const glsl_type *const type)
{
   int reg_width = dispatch_width / 8;
   return fs_reg(VGRF,
                 alloc.allocate(type_size_scalar(type, false) * reg_width),
                 brw_type_for_base_type(type));
}

fs_reg::fs_reg(enum brw_reg_file file, int nr)
{
   init();
   this->file = file;
   this->nr = nr;
   this->type = BRW_REGISTER_TYPE_F;
   this->stride = (file == UNIFORM ? 0 : 1);
}

fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
{
   init();
   this->file = file;
   this->nr = nr;
   this->type = type;
   this->stride = (file == UNIFORM ? 0 : 1);
}

/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   this->push_constant_loc = v->push_constant_loc;
   this->pull_constant_loc = v->pull_constant_loc;
   this->uniforms = v->uniforms;
   this->subgroup_id = v->subgroup_id;
}

void
fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
{
   assert(stage == MESA_SHADER_FRAGMENT);

   /* gl_FragCoord.x */
   bld.MOV(wpos, this->pixel_x);
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.y */
   bld.MOV(wpos, this->pixel_y);
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.z */
   if (devinfo->gen >= 6) {
      bld.MOV(wpos, fetch_payload_reg(bld, payload.source_depth_reg));
   } else {
      bld.emit(FS_OPCODE_LINTERP, wpos,
               this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
               component(interp_reg(VARYING_SLOT_POS, 2), 0));
   }
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   bld.MOV(wpos, this->wpos_w);
}

enum brw_barycentric_mode
brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
{
   /* Barycentric modes don't make sense for flat inputs. */
   assert(mode != INTERP_MODE_FLAT);

   unsigned bary;
   switch (op) {
   case nir_intrinsic_load_barycentric_pixel:
   case nir_intrinsic_load_barycentric_at_offset:
      bary = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
      break;
   case nir_intrinsic_load_barycentric_centroid:
      bary = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
      break;
   case nir_intrinsic_load_barycentric_sample:
   case nir_intrinsic_load_barycentric_at_sample:
      bary = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
      break;
   default:
      unreachable("invalid intrinsic");
   }

   if (mode == INTERP_MODE_NOPERSPECTIVE)
      bary += 3;

   return (enum brw_barycentric_mode) bary;
}

/**
 * Turn one of the two CENTROID barycentric modes into PIXEL mode.
 */
static enum brw_barycentric_mode
centroid_to_pixel(enum brw_barycentric_mode bary)
{
   assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
          bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
   return (enum brw_barycentric_mode) ((unsigned) bary - 1);
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation()
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));

   if (devinfo->gen >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (~0/true or 0/false).
       *
       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
       * this task in only one instruction:
       * - a negation source modifier will flip the bit; and
       * - a W -> D type conversion will sign extend the bit into the high
       *   word of the destination.
       *
       * An ASR 15 fills the low word of the destination.
       */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
      g0.negate = true;

      bld.ASR(*reg, g0, brw_imm_d(15));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (1/true or 0/false).
       *
       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
       * the negation source modifier to flip it. Unfortunately the SHR
       * instruction only operates on UD (or D with an abs source modifier)
       * sources without negation.
       *
       * Instead, use ASR (which will give ~0/true or 0/false).
       */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
      g1_6.negate = true;

      bld.ASR(*reg, g1_6, brw_imm_d(31));
   }

   return reg;
}

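/**
 * Converts one component of the integer sample position read from the thread
 * payload into a float in the range [0, 1], or writes 0.5 when the shader is
 * not dispatched per-sample.
 */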
void
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
   assert(dst.type == BRW_REGISTER_TYPE_F);

   if (wm_prog_data->persample_dispatch) {
      /* Convert int_sample_pos to floating point */
      bld.MOV(dst, int_sample_pos);
      /* Scale to the range [0, 1] */
      bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
   } else {
      /* From ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SamplePosition will always be
       *  (0.5, 0.5).
       */
      bld.MOV(dst, brw_imm_f(0.5f));
   }
}

fs_reg *
fs_visitor::emit_samplepos_setup()
{
   assert(devinfo->gen >= 6);

   const fs_builder abld = bld.annotate("compute sample position");
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
   fs_reg pos = *reg;
   fs_reg int_sample_x = vgrf(glsl_type::int_type);
   fs_reg int_sample_y = vgrf(glsl_type::int_type);

   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
    * mode will be enabled.
    *
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
    * R31.1:0         Position Offset X/Y for Slot[3:0]
    * R31.3:2         Position Offset X/Y for Slot[7:4]
    * .....
    *
    * The X, Y sample positions come in as bytes in thread payload. So, read
    * the positions using vstride=16, width=8, hstride=2.
    */
   const fs_reg sample_pos_reg =
      fetch_payload_reg(abld, payload.sample_pos_reg, BRW_REGISTER_TYPE_W);

   /* Compute gl_SamplePosition.x */
   abld.MOV(int_sample_x, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 0));
   compute_sample_position(offset(pos, abld, 0), int_sample_x);

   /* Compute gl_SamplePosition.y */
   abld.MOV(int_sample_y, subscript(sample_pos_reg, BRW_REGISTER_TYPE_B, 1));
   compute_sample_position(offset(pos, abld, 1), int_sample_y);
   return reg;
}

fs_reg *
fs_visitor::emit_sampleid_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   assert(devinfo->gen >= 6);

   const fs_builder abld = bld.annotate("compute sample id");
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uint_type));

   if (!key->multisample_fbo) {
      /* As per GL_ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SampleID will always be zero."
       */
      abld.MOV(*reg, brw_imm_d(0));
   } else if (devinfo->gen >= 8) {
      /* Sample ID comes in as 4-bit numbers in g1.0:
       *
       *    15:12 Slot 3 SampleID (only used in SIMD16)
       *     11:8 Slot 2 SampleID (only used in SIMD16)
       *      7:4 Slot 1 SampleID
       *      3:0 Slot 0 SampleID
       *
       * Each slot corresponds to four channels, so we want to replicate each
       * half-byte value to 4 channels in a row:
       *
       *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
       *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
       *
       *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
       *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
       *
       * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
       * channels to read the first byte (7:0), and the second group of 8
       * channels to read the second byte (15:8).  Then, we shift right by
       * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
       * values into place.  Finally, we AND with 0xf to keep the low nibble.
       *
       *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
       *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
       *
       * TODO: These payload bits exist on Gen7 too, but they appear to always
       *       be zero, so this code fails to work.  We should find out why.
       */
      const fs_reg tmp = abld.vgrf(BRW_REGISTER_TYPE_UW);

      for (unsigned i = 0; i < DIV_ROUND_UP(dispatch_width, 16); i++) {
         const fs_builder hbld = abld.group(MIN2(16, dispatch_width), i);
         hbld.SHR(offset(tmp, hbld, i),
                  stride(retype(brw_vec1_grf(1 + i, 0), BRW_REGISTER_TYPE_UB),
                         1, 8, 0),
                  brw_imm_v(0x44440000));
      }

      abld.AND(*reg, tmp, brw_imm_w(0xf));
   } else {
      const fs_reg t1 = component(abld.vgrf(BRW_REGISTER_TYPE_UD), 0);
      const fs_reg t2 = abld.vgrf(BRW_REGISTER_TYPE_UW);

      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
       * 8x multisampling, subspan 0 will represent sample N (where N
       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
       * 7. We can find the value of N by looking at R0.0 bits 7:6
       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
       * (since samples are always delivered in pairs). That is, we
       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
       * populating a temporary variable with the sequence (0, 1, 2, 3),
       * and then reading from it using vstride=1, width=4, hstride=0.
       * These computations hold good for 4x multisampling as well.
       *
       * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
       * the first four slots are sample 0 of subspan 0; the next four
       * are sample 1 of subspan 0; the third group is sample 0 of
       * subspan 1, and finally sample 1 of subspan 1.
       */

      /* SKL+ has an extra bit for the Starting Sample Pair Index to
       * accommodate 16x MSAA.
       */
      abld.exec_all().group(1, 0)
          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
               brw_imm_ud(0xc0));
      abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));

      /* This works for SIMD8-SIMD16.  It also works for SIMD32 but only if we
       * can assume 4x MSAA.  Disallow it on IVB+
       *
       * FINISHME: One day, we could come up with a way to do this that
       * actually works on gen7.
       */
      if (devinfo->gen >= 7)
         limit_dispatch_width(16, "gl_SampleId is unsupported in SIMD32 on gen7");
      abld.exec_all().group(8, 0).MOV(t2, brw_imm_v(0x32103210));

      /* This special instruction takes care of setting vstride=1,
       * width=4, hstride=0 of t2 during an ADD instruction.
       */
      abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
   }

   return reg;
}

fs_reg *
|
|
|
|
|
|
fs_visitor::emit_samplemaskin_setup()
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(stage == MESA_SHADER_FRAGMENT);
|
2016-09-08 23:48:51 -07:00
|
|
|
|
struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
|
i965: Fix gl_SampleMaskIn[] in per-sample shading mode.
The coverage mask is not sufficient - in per-sample mode, we also need
to AND with a mask representing the samples being processed by the
current fragment shader invocation.
Fixes 18 dEQP-GLES31.functional.shaders.sample_variables tests:
sample_mask_in.bit_count_per_sample.multisample_{rbo,texture}_{1,2,4,8}
sample_mask_in.bit_count_per_two_samples.multisample_{rbo,texture}_{4,8}
sample_mask_in.bits_unique_per_sample.multisample_{rbo,texture}_{1,2,4,8}
sample_mask_in.bits_unique_per_two_samples.multisample_{rbo,texture}_{4,8}
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2016-04-05 20:14:22 -07:00
|
|
|
|
assert(devinfo->gen >= 6);
|
|
|
|
|
|
|
|
|
|
|
|
fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
|
|
|
|
|
|
|
2017-01-13 15:23:48 -08:00
|
|
|
|
fs_reg coverage_mask =
|
|
|
|
|
|
fetch_payload_reg(bld, payload.sample_mask_in_reg, BRW_REGISTER_TYPE_D);
|
i965: Fix gl_SampleMaskIn[] in per-sample shading mode.
The coverage mask is not sufficient - in per-sample mode, we also need
to AND with a mask representing the samples being processed by the
current fragment shader invocation.
Fixes 18 dEQP-GLES31.functional.shaders.sample_variables tests:
sample_mask_in.bit_count_per_sample.multisample_{rbo,texture}_{1,2,4,8}
sample_mask_in.bit_count_per_two_samples.multisample_{rbo,texture}_{4,8}
sample_mask_in.bits_unique_per_sample.multisample_{rbo,texture}_{1,2,4,8}
sample_mask_in.bits_unique_per_two_samples.multisample_{rbo,texture}_{4,8}
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2016-04-05 20:14:22 -07:00
|
|
|
|
|
2016-05-09 17:48:24 -07:00
|
|
|
|
if (wm_prog_data->persample_dispatch) {
|
i965: Fix gl_SampleMaskIn[] in per-sample shading mode.
The coverage mask is not sufficient - in per-sample mode, we also need
to AND with a mask representing the samples being processed by the
current fragment shader invocation.
Fixes 18 dEQP-GLES31.functional.shaders.sample_variables tests:
sample_mask_in.bit_count_per_sample.multisample_{rbo,texture}_{1,2,4,8}
sample_mask_in.bit_count_per_two_samples.multisample_{rbo,texture}_{4,8}
sample_mask_in.bits_unique_per_sample.multisample_{rbo,texture}_{1,2,4,8}
sample_mask_in.bits_unique_per_two_samples.multisample_{rbo,texture}_{4,8}
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Matt Turner <mattst88@gmail.com>
2016-04-05 20:14:22 -07:00
|
|
|
|
/* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
|
|
|
|
|
|
* and a mask representing which sample is being processed by the
|
|
|
|
|
|
* current shader invocation.
|
|
|
|
|
|
*
|
|
|
|
|
|
* From the OES_sample_variables specification:
|
|
|
|
|
|
* "When per-sample shading is active due to the use of a fragment input
|
|
|
|
|
|
* qualified by "sample" or due to the use of the gl_SampleID or
|
|
|
|
|
|
* gl_SamplePosition variables, only the bit for the current sample is
|
|
|
|
|
|
* set in gl_SampleMaskIn."
|
|
|
|
|
|
*/
|
|
|
|
|
|
const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
|
|
|
|
|
|
|
|
|
|
|
|
if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
|
|
|
|
|
|
nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
|
|
|
|
|
|
|
|
|
|
|
|
fs_reg one = vgrf(glsl_type::int_type);
|
|
|
|
|
|
fs_reg enabled_mask = vgrf(glsl_type::int_type);
|
|
|
|
|
|
abld.MOV(one, brw_imm_d(1));
|
|
|
|
|
|
abld.SHL(enabled_mask, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
|
|
|
|
|
|
abld.AND(*reg, enabled_mask, coverage_mask);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* In per-pixel mode, the coverage mask is sufficient. */
|
|
|
|
|
|
*reg = coverage_mask;
|
|
|
|
|
|
}
|
|
|
|
|
|
return reg;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
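/* Worked example of the per-sample path above (illustrative numbers, not
 * taken from the original source): with 4x MSAA and a fully covered pixel,
 * the payload coverage mask is 0xF.  The invocation shading sample 2
 * computes enabled_mask = 1 << 2 = 0x4, so gl_SampleMaskIn becomes
 * 0xF & 0x4 = 0x4, i.e. only the bit for the current sample, matching the
 * OES_sample_variables language quoted above.
 */
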
fs_reg
fs_visitor::resolve_source_modifiers(const fs_reg &src)
{
   if (!src.abs && !src.negate)
      return src;

   fs_reg temp = bld.vgrf(src.type);
   bld.MOV(temp, src);

   return temp;
}

void
fs_visitor::emit_discard_jump()
{
   assert(brw_wm_prog_data(this->prog_data)->uses_kill);

   /* For performance, after a discard, jump to the end of the
    * shader if all relevant channels have been discarded.
    */
   fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
   discard_jump->flag_subreg = 1;

   discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
   discard_jump->predicate_inverse = true;
}

void
fs_visitor::emit_gs_thread_end()
{
   assert(stage == MESA_SHADER_GEOMETRY);

   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);

   if (gs_compile->control_data_header_size_bits > 0) {
      emit_gs_control_data_bits(this->final_gs_vertex_count);
   }

   const fs_builder abld = bld.annotate("thread end");
   fs_inst *inst;

   if (gs_prog_data->static_vertex_count != -1) {
      foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
         if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
            prev->eot = true;

            /* Delete now dead instructions. */
            foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
               if (dead == prev)
                  break;
               dead->remove();
            }
            return;
         } else if (prev->is_control_flow() || prev->has_side_effects()) {
            break;
         }
      }

      fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
      abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
      inst->mlen = 1;
   } else {
      fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
      fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
      sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
      sources[1] = this->final_gs_vertex_count;
      abld.LOAD_PAYLOAD(payload, sources, 2, 2);
      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
      inst->mlen = 2;
   }
   inst->eot = true;
   inst->offset = 0;
}

void
fs_visitor::assign_curb_setup()
{
   unsigned uniform_push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);

   unsigned ubo_push_length = 0;
   unsigned ubo_push_start[4];
   for (int i = 0; i < 4; i++) {
      ubo_push_start[i] = 8 * (ubo_push_length + uniform_push_length);
      ubo_push_length += stage_prog_data->ubo_ranges[i].length;
   }

   prog_data->curb_read_length = uniform_push_length + ubo_push_length;

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].nr + inst->src[i].offset / 4;
            int constant_nr;
            if (inst->src[i].nr >= UBO_START) {
               /* constant_nr is in 32-bit units, the rest are in bytes */
               constant_nr = ubo_push_start[inst->src[i].nr - UBO_START] +
                             inst->src[i].offset / 4;
            } else if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = push_constant_loc[uniform_nr];
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                *  values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);
            brw_reg.abs = inst->src[i].abs;
            brw_reg.negate = inst->src[i].negate;

            assert(inst->src[i].stride == 0);
            inst->src[i] = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].offset % 4);
         }
      }
   }

   /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
   this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
}

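/* Worked example of the constant_nr mapping above (illustrative numbers,
 * not from the original source): a push constant at constant_nr == 10
 * lands in GRF (payload.num_regs + 10 / 8) at subregister 10 % 8, i.e.
 * the third dword of the second CURB register after the thread payload.
 */
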
static void
calculate_urb_setup(const struct gen_device_info *devinfo,
                    const struct brw_wm_prog_key *key,
                    struct brw_wm_prog_data *prog_data,
                    const nir_shader *nir)
{
   memset(prog_data->urb_setup, -1,
          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (devinfo->gen >= 6) {
      if (util_bitcount64(nir->info.inputs_read &
                          BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               prog_data->urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                             key->input_slots_valid,
                             nir->info.separate_shader);

         int first_slot =
            brw_compute_first_urb_slot_required(nir->info.inputs_read,
                                                &prev_stage_vue_map);

         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            if (varying != BRW_VARYING_SLOT_PAD &&
                (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               prog_data->urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (key->input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               prog_data->urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   prog_data->num_varying_inputs = urb_next;
}

void
fs_visitor::assign_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);

   int urb_start = payload.num_regs + prog_data->base.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == ATTR) {
            /* ATTR regs in the FS are in units of logical scalar inputs each
             * of which consumes half of a GRF register.
             */
            assert(inst->src[i].offset < REG_SIZE / 2);
            const unsigned grf = urb_start + inst->src[i].nr / 2;
            const unsigned offset = (inst->src[i].nr % 2) * (REG_SIZE / 2) +
                                    inst->src[i].offset;
            const unsigned width = inst->src[i].stride == 0 ?
                                   1 : MIN2(inst->exec_size, 8);
            struct brw_reg reg = stride(
               byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                           offset),
               width * inst->src[i].stride,
               width, inst->src[i].stride);
            reg.abs = inst->src[i].abs;
            reg.negate = inst->src[i].negate;
            inst->src[i] = reg;
         }
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
}

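/* Worked example of the ATTR mapping above (illustrative numbers, not from
 * the original source): a fragment shader input with inst->src[i].nr == 3
 * reads from grf == urb_start + 1 (3 / 2) at a byte offset of 16, because
 * the odd-numbered logical input occupies the second half of that GRF,
 * plus any sub-input offset the instruction carries.
 */
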
void
fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
{
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file == ATTR) {
         int grf = payload.num_regs +
                   prog_data->curb_read_length +
                   inst->src[i].nr +
                   inst->src[i].offset / REG_SIZE;

         /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
          *
          * VertStride must be used to cross GRF register boundaries. This
          * rule implies that elements within a 'Width' cannot cross GRF
          * boundaries.
          *
          * So, for registers that are large enough, we have to split the exec
          * size in two and trust the compression state to sort it out.
          */
         unsigned total_size = inst->exec_size *
                               inst->src[i].stride *
                               type_sz(inst->src[i].type);

         assert(total_size <= 2 * REG_SIZE);
         const unsigned exec_size =
            (total_size <= REG_SIZE) ? inst->exec_size : inst->exec_size / 2;

         unsigned width = inst->src[i].stride == 0 ? 1 : exec_size;
         struct brw_reg reg =
            stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                               inst->src[i].offset % REG_SIZE),
                   exec_size * inst->src[i].stride,
                   width, inst->src[i].stride);
         reg.abs = inst->src[i].abs;
         reg.negate = inst->src[i].negate;

         inst->src[i] = reg;
      }
   }
}

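/* Worked example of the exec-size split above (illustrative numbers, not
 * from the original source): a SIMD16 source of a 32-bit type with stride 1
 * covers 16 * 1 * 4 = 64 bytes = 2 * REG_SIZE, so exec_size is halved to 8
 * and the region becomes <8;8,1>, letting the compressed instruction cross
 * the GRF boundary without violating the VertStride rule quoted above.
 */
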
void
fs_visitor::assign_vs_urb_setup()
{
   struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);

   assert(stage == MESA_SHADER_VERTEX);

   /* Each attribute is 4 regs. */
   this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;

   assert(vs_prog_data->base.urb_read_length <= 15);

   /* Rewrite all ATTR file references to the hw grf that they land in. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

void
fs_visitor::assign_tcs_urb_setup()
{
   assert(stage == MESA_SHADER_TESS_CTRL);

   /* Rewrite all ATTR file references to HW_REGs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

void
fs_visitor::assign_tes_urb_setup()
{
   assert(stage == MESA_SHADER_TESS_EVAL);

   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);

   first_non_payload_grf += 8 * vue_prog_data->urb_read_length;

   /* Rewrite all ATTR file references to HW_REGs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      convert_attr_sources_to_hw_regs(inst);
   }
}

void
fs_visitor::assign_gs_urb_setup()
{
   assert(stage == MESA_SHADER_GEOMETRY);

   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);

   first_non_payload_grf +=
      8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      /* Rewrite all ATTR file references to GRFs. */
      convert_attr_sources_to_hw_regs(inst);
   }
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   /* Compact the register file so we eliminate dead vgrfs.  This
    * only defines split points for live registers, so if we have
    * too large dead registers they will hit assertions later.
    */
   compact_virtual_grfs();

   int num_vars = this->alloc.count;

   /* Count the total number of registers */
   int reg_count = 0;
   int vgrf_to_reg[num_vars];
   for (int i = 0; i < num_vars; i++) {
      vgrf_to_reg[i] = reg_count;
      reg_count += alloc.sizes[i];
   }

   /* An array of "split points".  For each register slot, this indicates
    * if this slot can be separated from the previous slot.  Every time an
    * instruction uses multiple elements of a register (as a source or
    * destination), we mark the used slots as inseparable.  Then we go
    * through and split the registers into the smallest pieces we can.
    */
   bool *split_points = new bool[reg_count];
   memset(split_points, 0, reg_count * sizeof(*split_points));

   /* Mark all used registers as fully splittable */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF) {
         int reg = vgrf_to_reg[inst->dst.nr];
         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
            split_points[reg + j] = true;
      }

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            int reg = vgrf_to_reg[inst->src[i].nr];
            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
               split_points[reg + j] = true;
         }
      }
   }

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      /* We fix up undef instructions later */
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         /* UNDEF instructions are currently only used to undef entire
          * registers.  We need this invariant later when we split them.
          */
         assert(inst->dst.file == VGRF);
         assert(inst->dst.offset == 0);
         assert(inst->size_written == alloc.sizes[inst->dst.nr] * REG_SIZE);
         continue;
      }

      if (inst->dst.file == VGRF) {
         int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         for (unsigned j = 1; j < regs_written(inst); j++)
            split_points[reg + j] = false;
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
            for (unsigned j = 1; j < regs_read(inst, i); j++)
               split_points[reg + j] = false;
         }
      }
   }

   int *new_virtual_grf = new int[reg_count];
   int *new_reg_offset = new int[reg_count];

   int reg = 0;
   for (int i = 0; i < num_vars; i++) {
      /* The first one should always be 0 as a quick sanity check. */
      assert(split_points[reg] == false);

      /* j = 0 case */
      new_reg_offset[reg] = 0;
      reg++;
      int offset = 1;

      /* j > 0 case */
      for (unsigned j = 1; j < alloc.sizes[i]; j++) {
         /* If this is a split point, reset the offset to 0 and allocate a
          * new virtual GRF for the previous offset many registers
          */
         if (split_points[reg]) {
            assert(offset <= MAX_VGRF_SIZE);
            int grf = alloc.allocate(offset);
            for (int k = reg - offset; k < reg; k++)
               new_virtual_grf[k] = grf;
            offset = 0;
         }
         new_reg_offset[reg] = offset;
         offset++;
         reg++;
      }

      /* The last one gets the original register number */
      assert(offset <= MAX_VGRF_SIZE);
      alloc.sizes[i] = offset;
      for (int k = reg - offset; k < reg; k++)
         new_virtual_grf[k] = i;
   }
   assert(reg == reg_count);

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      if (inst->opcode == SHADER_OPCODE_UNDEF) {
         const fs_builder ibld(this, block, inst);
         assert(inst->size_written % REG_SIZE == 0);
         unsigned reg_offset = 0;
         while (reg_offset < inst->size_written / REG_SIZE) {
            reg = vgrf_to_reg[inst->dst.nr] + reg_offset;
            ibld.UNDEF(fs_reg(VGRF, new_virtual_grf[reg], inst->dst.type));
            reg_offset += alloc.sizes[new_virtual_grf[reg]];
         }
         inst->remove(block);
         continue;
      }

      if (inst->dst.file == VGRF) {
         reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
         inst->dst.nr = new_virtual_grf[reg];
         inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
                            inst->dst.offset % REG_SIZE;
         assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF) {
            reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
            inst->src[i].nr = new_virtual_grf[reg];
            inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
                                  inst->src[i].offset % REG_SIZE;
            assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
         }
      }
   }
   invalidate_live_intervals();

   delete[] split_points;
   delete[] new_virtual_grf;
   delete[] new_reg_offset;
}

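/* Worked example of the split-point logic above (illustrative sizes, not
 * from the original source): for a 4-register VGRF that is only ever
 * written and read as two independent 2-register halves, the second pass
 * clears the split points inside each half but leaves the one at register
 * 2 set, so the VGRF is split into two 2-register VGRFs with
 * correspondingly shorter live intervals.
 */
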
/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
bool
fs_visitor::compact_virtual_grfs()
{
   bool progress = false;
   int *remap_table = new int[this->alloc.count];
   memset(remap_table, -1, this->alloc.count * sizeof(int));

   /* Mark which virtual GRFs are used. */
   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF)
         remap_table[inst->dst.nr] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            remap_table[inst->src[i].nr] = 0;
      }
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (unsigned i = 0; i < this->alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register.  This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         alloc.sizes[new_index] = alloc.sizes[i];
         invalidate_live_intervals();
         ++new_index;
      }
   }

   this->alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == VGRF)
         inst->dst.nr = remap_table[inst->dst.nr];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == VGRF)
            inst->src[i].nr = remap_table[inst->src[i].nr];
      }
   }

   /* Patch all the references to delta_xy, since they're used in register
    * allocation.  If they're unused, switch them to BAD_FILE so we don't
    * think some random VGRF is delta_xy.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
      if (delta_xy[i].file == VGRF) {
         if (remap_table[delta_xy[i].nr] != -1) {
            delta_xy[i].nr = remap_table[delta_xy[i].nr];
         } else {
            delta_xy[i].file = BAD_FILE;
         }
      }
   }

   delete[] remap_table;

   return progress;
}

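/* Worked example of the remapping above (illustrative counts, not from the
 * original source): with alloc.count == 4 and VGRF 1 never referenced,
 * remap_table becomes { 0, -1, 1, 2 }, alloc.count drops to 3, and every
 * remaining instruction operand is renumbered through the table.
 */
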
static int
get_subgroup_id_param_index(const brw_stage_prog_data *prog_data)
{
   if (prog_data->nr_params == 0)
      return -1;

   /* The local thread id is always the last parameter in the list */
   uint32_t last_param = prog_data->param[prog_data->nr_params - 1];
   if (last_param == BRW_PARAM_BUILTIN_SUBGROUP_ID)
      return prog_data->nr_params - 1;

   return -1;
}

/**
 * Struct for handling complex alignments.
 *
 * A complex alignment is stored as multiplier and an offset.  A value is
 * considered to be aligned if it is {offset} larger than a multiple of {mul}.
 * For instance, with an alignment of {8, 2}, cplx_align_apply would do the
 * following:
 *
 *  N  | cplx_align_apply({8, 2}, N)
 * ----+-----------------------------
 *  4  | 6
 *  6  | 6
 *  8  | 14
 * 10  | 14
 * 12  | 14
 * 14  | 14
 * 16  | 22
 */
struct cplx_align {
   unsigned mul:4;
   unsigned offset:4;
};

#define CPLX_ALIGN_MAX_MUL 8

static void
cplx_align_assert_sane(struct cplx_align a)
{
   assert(a.mul > 0 && util_is_power_of_two_nonzero(a.mul));
   assert(a.offset < a.mul);
}

/**
 * Combines two alignments to produce a least multiple of sorts.
 *
 * The returned alignment is the smallest (in terms of multiplier) such that
 * anything aligned to both a and b will be aligned to the new alignment.
 * This function will assert-fail if a and b are not compatible, i.e. if the
 * offset parameters are such that no common alignment is possible.
 */
static struct cplx_align
cplx_align_combine(struct cplx_align a, struct cplx_align b)
{
   cplx_align_assert_sane(a);
   cplx_align_assert_sane(b);

   /* Assert that the alignments agree. */
   assert((a.offset & (b.mul - 1)) == (b.offset & (a.mul - 1)));

   return a.mul > b.mul ? a : b;
}

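/* Worked example for cplx_align_combine above (illustrative values, not
 * from the original source): combining {4, 0} with {8, 4} first checks
 * that each offset agrees with the other modulo the other's multiplier
 * ((0 & 7) == (4 & 3)), then keeps the stricter alignment {8, 4}; anything
 * that is 4 more than a multiple of 8 is also a multiple of 4, so both
 * constraints remain satisfied.
 */
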
/**
 * Apply a complex alignment
 *
 * This function will return the smallest number greater than or equal to
 * offset that is aligned to align.
 */
static unsigned
cplx_align_apply(struct cplx_align align, unsigned offset)
{
   return ALIGN(offset - align.offset, align.mul) + align.offset;
}

#define UNIFORM_SLOT_SIZE 4

struct uniform_slot_info {
   /** True if the given uniform slot is live */
   unsigned is_live:1;

   /** True if this slot and the next slot must remain contiguous */
   unsigned contiguous:1;

   struct cplx_align align;
};

static void
mark_uniform_slots_read(struct uniform_slot_info *slots,
                        unsigned num_slots, unsigned alignment)
{
   assert(alignment > 0 && util_is_power_of_two_nonzero(alignment));
   assert(alignment <= CPLX_ALIGN_MAX_MUL);

   /* We can't align a slot to anything less than the slot size */
   alignment = MAX2(alignment, UNIFORM_SLOT_SIZE);

   struct cplx_align align = {alignment, 0};
   cplx_align_assert_sane(align);

   for (unsigned i = 0; i < num_slots; i++) {
      slots[i].is_live = true;
      if (i < num_slots - 1)
         slots[i].contiguous = true;

      align.offset = (i * UNIFORM_SLOT_SIZE) & (align.mul - 1);
      if (slots[i].align.mul == 0) {
         slots[i].align = align;
      } else {
         slots[i].align = cplx_align_combine(slots[i].align, align);
      }
   }
}

2015-08-18 17:04:53 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Assign UNIFORM file registers to either push constants or pull constants.
|
2012-11-08 16:06:24 -08:00
|
|
|
|
*
|
2015-08-18 17:04:53 -07:00
|
|
|
|
* We allow a fragment shader to have more than the specified minimum
|
|
|
|
|
|
* maximum number of fragment shader uniform components (64). If
|
|
|
|
|
|
* there are too many of these, they'd fill up all of register space.
|
|
|
|
|
|
* So, this will push some of them out to the pull constant buffer and
|
2015-12-08 17:34:38 -08:00
|
|
|
|
* update the program to load them.
|
2012-11-08 16:06:24 -08:00
|
|
|
|
*/
|
|
|
|
|
|
void
|
2015-08-18 17:04:53 -07:00
|
|
|
|
fs_visitor::assign_constant_locations()
|
2012-11-08 16:06:24 -08:00
|
|
|
|
{
|
2016-02-22 10:42:07 -08:00
|
|
|
|
/* Only the first compile gets to decide on locations. */
|
2017-08-21 18:42:41 -07:00
|
|
|
|
if (push_constant_loc) {
|
|
|
|
|
|
assert(pull_constant_loc);
|
2014-03-07 16:10:50 -08:00
|
|
|
|
return;
|
2017-08-21 18:42:41 -07:00
|
|
|
|
}
|
2014-03-07 16:10:50 -08:00
|
|
|
|
|
2017-12-02 22:32:59 -08:00
|
|
|
|
struct uniform_slot_info slots[uniforms];
|
|
|
|
|
|
memset(slots, 0, sizeof(slots));
|
2015-11-24 15:16:14 -08:00
|
|
|
|
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0 ; i < inst->sources; i++) {
|
2015-08-18 17:40:02 -07:00
|
|
|
|
if (inst->src[i].file != UNIFORM)
|
2012-11-08 16:06:24 -08:00
|
|
|
|
continue;
|
|
|
|
|
|
|
2017-12-02 22:32:59 -08:00
|
|
|
|
/* NIR tightly packs things so the uniform number might not be
|
|
|
|
|
|
* aligned (if we have a double right after a float, for instance).
|
|
|
|
|
|
* This is fine because the process of re-arranging them will ensure
|
|
|
|
|
|
* that things are properly aligned. The offset into that uniform,
|
|
|
|
|
|
* however, must be aligned.
|
|
|
|
|
|
*
|
|
|
|
|
|
* In Vulkan, we have explicit offsets but everything is crammed
|
|
|
|
|
|
* into a single "variable" so inst->src[i].nr will always be 0.
|
|
|
|
|
|
* Everything will be properly aligned relative to that one base.
|
|
|
|
|
|
*/
|
|
|
|
|
|
assert(inst->src[i].offset % type_sz(inst->src[i].type) == 0);
|
|
|
|
|
|
|
|
|
|
|
|
unsigned u = inst->src[i].nr +
|
|
|
|
|
|
inst->src[i].offset / UNIFORM_SLOT_SIZE;
|
2012-11-08 16:06:24 -08:00
|
|
|
|
|
2017-12-02 22:32:59 -08:00
|
|
|
|
if (u >= uniforms)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
unsigned slots_read;
|
2015-11-24 15:16:14 -08:00
|
|
|
|
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
|
2017-12-02 22:32:59 -08:00
|
|
|
|
slots_read = DIV_ROUND_UP(inst->src[2].ud, UNIFORM_SLOT_SIZE);
|
2015-08-18 17:40:02 -07:00
|
|
|
|
} else {
|
2017-12-02 22:32:59 -08:00
|
|
|
|
unsigned bytes_read = inst->components_read(i) *
|
|
|
|
|
|
type_sz(inst->src[i].type);
|
|
|
|
|
|
slots_read = DIV_ROUND_UP(bytes_read, UNIFORM_SLOT_SIZE);
|
2012-11-08 16:06:24 -08:00
|
|
|
|
}
|
2017-12-02 22:32:59 -08:00
|
|
|
|
|
|
|
|
|
|
assert(u + slots_read <= uniforms);
|
|
|
|
|
|
mark_uniform_slots_read(&slots[u], slots_read,
|
|
|
|
|
|
type_sz(inst->src[i].type));
|
2012-11-08 16:06:24 -08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2010-10-22 12:57:00 -07:00
|
|
|
|
|
2017-08-24 11:40:31 -07:00
|
|
|
|
int subgroup_id_index = get_subgroup_id_param_index(stage_prog_data);
|
2016-05-22 21:29:53 -07:00
|
|
|
|
|
2014-03-11 14:35:27 -07:00
|
|
|
|
/* Only allow 16 registers (128 uniform components) as push constants.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Just demote the end of the list. We could probably do better
|
2010-10-22 12:57:00 -07:00
|
|
|
|
* here, demoting things that are rarely used in the program first.
|
2014-05-19 08:51:12 -07:00
|
|
|
|
*
|
|
|
|
|
|
* If changing this value, note the limitation about total_regs in
|
|
|
|
|
|
* brw_curbe.c.
|
2010-10-22 12:57:00 -07:00
|
|
|
|
*/
|
2016-05-22 21:29:53 -07:00
|
|
|
|
unsigned int max_push_components = 16 * 8;
|
2017-08-24 11:40:31 -07:00
|
|
|
|
if (subgroup_id_index >= 0)
|
2016-05-22 21:29:53 -07:00
|
|
|
|
max_push_components--; /* Save a slot for the thread ID */
|
2015-12-08 17:34:38 -08:00
|
|
|
|
|
|
|
|
|
|
/* We push small arrays, but no bigger than 16 floats. This is big enough
|
|
|
|
|
|
* for a vec4 but hopefully not large enough to push out other stuff. We
|
|
|
|
|
|
* should probably use a better heuristic at some point.
|
|
|
|
|
|
*/
|
|
|
|
|
|
const unsigned int max_chunk_size = 16;
|
|
|
|
|
|
|
2014-03-11 14:35:27 -07:00
|
|
|
|
unsigned int num_push_constants = 0;
|
2015-11-24 15:16:14 -08:00
|
|
|
|
unsigned int num_pull_constants = 0;
|
2012-11-08 16:06:24 -08:00
|
|
|
|
|
2014-03-11 14:35:27 -07:00
|
|
|
|
push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
|
2015-11-24 15:16:14 -08:00
|
|
|
|
pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
|
2014-03-11 14:35:27 -07:00
|
|
|
|
|
2016-05-10 13:54:58 -07:00
|
|
|
|
/* Default to -1 meaning no location */
|
|
|
|
|
|
memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
|
|
|
|
|
|
memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
|
|
|
|
|
|
|
2015-12-08 17:34:38 -08:00
|
|
|
|
int chunk_start = -1;
|
2017-12-02 22:32:59 -08:00
|
|
|
|
struct cplx_align align;
|
2015-12-08 17:34:38 -08:00
|
|
|
|
for (unsigned u = 0; u < uniforms; u++) {
|
2017-12-02 22:32:59 -08:00
|
|
|
|
if (!slots[u].is_live) {
|
|
|
|
|
|
assert(chunk_start == -1);
|
2016-05-05 09:18:07 +02:00
|
|
|
|
continue;
|
2017-12-02 22:32:59 -08:00
|
|
|
|
}
|
2016-05-05 09:18:07 +02:00
|
|
|
|
|
2017-12-02 22:32:59 -08:00
|
|
|
|
/* Skip subgroup_id_index to put it in the last push register. */
|
|
|
|
|
|
if (subgroup_id_index == (int)u)
|
|
|
|
|
|
continue;
|
2014-03-11 14:35:27 -07:00
|
|
|
|
|
2017-12-02 22:32:59 -08:00
|
|
|
|
if (chunk_start == -1) {
|
|
|
|
|
|
chunk_start = u;
|
|
|
|
|
|
align = slots[u].align;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* Offset into the chunk */
|
|
|
|
|
|
unsigned chunk_offset = (u - chunk_start) * UNIFORM_SLOT_SIZE;
|
2015-12-08 17:34:38 -08:00
|
|
|
|
|
2017-12-02 22:32:59 -08:00
|
|
|
|
/* Shift the slot alignment down by the chunk offset so it is
|
|
|
|
|
|
* comparable with the base chunk alignment.
|
|
|
|
|
|
*/
|
|
|
|
|
|
struct cplx_align slot_align = slots[u].align;
|
|
|
|
|
|
slot_align.offset =
|
|
|
|
|
|
(slot_align.offset - chunk_offset) & (align.mul - 1);
|
2015-12-08 17:34:38 -08:00
|
|
|
|
|
2017-12-02 22:32:59 -08:00
|
|
|
|
align = cplx_align_combine(align, slot_align);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Sanity check the alignment */
|
|
|
|
|
|
cplx_align_assert_sane(align);
|
|
|
|
|
|
|
|
|
|
|
|
if (slots[u].contiguous)
|
2016-05-22 21:29:53 -07:00
|
|
|
|
continue;
|
|
|
|
|
|
|
2017-12-02 22:32:59 -08:00
|
|
|
|
/* Adjust the alignment to be in terms of slots, not bytes */
|
|
|
|
|
|
assert((align.mul & (UNIFORM_SLOT_SIZE - 1)) == 0);
|
|
|
|
|
|
assert((align.offset & (UNIFORM_SLOT_SIZE - 1)) == 0);
|
|
|
|
|
|
align.mul /= UNIFORM_SLOT_SIZE;
|
|
|
|
|
|
align.offset /= UNIFORM_SLOT_SIZE;
|
|
|
|
|
|
|
|
|
|
|
|
unsigned push_start_align = cplx_align_apply(align, num_push_constants);
|
|
|
|
|
|
unsigned chunk_size = u - chunk_start + 1;
|
2017-12-02 22:34:47 -08:00
|
|
|
|
if ((!compiler->supports_pull_constants && u < UBO_START) ||
|
2017-12-02 22:32:59 -08:00
|
|
|
|
(chunk_size < max_chunk_size &&
|
|
|
|
|
|
push_start_align + chunk_size <= max_push_components)) {
|
|
|
|
|
|
/* Align up the number of push constants */
|
|
|
|
|
|
num_push_constants = push_start_align;
|
|
|
|
|
|
for (unsigned i = 0; i < chunk_size; i++)
|
|
|
|
|
|
push_constant_loc[chunk_start + i] = num_push_constants++;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* We need to pull this one */
|
|
|
|
|
|
num_pull_constants = cplx_align_apply(align, num_pull_constants);
|
|
|
|
|
|
for (unsigned i = 0; i < chunk_size; i++)
|
|
|
|
|
|
pull_constant_loc[chunk_start + i] = num_pull_constants++;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Reset the chunk and start again */
|
|
|
|
|
|
chunk_start = -1;
|
2012-11-08 16:06:24 -08:00
|
|
|
|
}
|
2014-03-11 14:35:27 -07:00
|
|
|
|
|
2016-05-22 21:29:53 -07:00
|
|
|
|
/* Add the CS local thread ID uniform at the end of the push constants */
|
2017-08-24 11:40:31 -07:00
|
|
|
|
if (subgroup_id_index >= 0)
|
|
|
|
|
|
push_constant_loc[subgroup_id_index] = num_push_constants++;
|
2016-05-22 21:29:53 -07:00
|
|
|
|
|
2017-09-29 11:30:25 -07:00
|
|
|
|
/* As the uniforms are going to be reordered, stash the old array and
|
|
|
|
|
|
* create two new arrays for push/pull params.
|
2016-05-05 09:18:07 +02:00
|
|
|
|
*/
|
2017-09-29 11:30:25 -07:00
|
|
|
|
uint32_t *param = stage_prog_data->param;
|
2014-03-11 14:35:27 -07:00
|
|
|
|
stage_prog_data->nr_params = num_push_constants;
|
2017-11-01 08:02:34 -07:00
|
|
|
|
if (num_push_constants) {
|
2017-12-02 22:32:59 -08:00
|
|
|
|
stage_prog_data->param = rzalloc_array(mem_ctx, uint32_t,
|
|
|
|
|
|
num_push_constants);
|
2017-11-01 08:02:34 -07:00
|
|
|
|
} else {
|
|
|
|
|
|
stage_prog_data->param = NULL;
|
|
|
|
|
|
}
|
|
|
|
|
|
assert(stage_prog_data->nr_pull_params == 0);
|
|
|
|
|
|
assert(stage_prog_data->pull_param == NULL);
|
2017-09-29 11:30:25 -07:00
|
|
|
|
if (num_pull_constants > 0) {
|
|
|
|
|
|
stage_prog_data->nr_pull_params = num_pull_constants;
|
2017-12-02 22:32:59 -08:00
|
|
|
|
stage_prog_data->pull_param = rzalloc_array(mem_ctx, uint32_t,
|
|
|
|
|
|
num_pull_constants);
|
2017-09-29 11:30:25 -07:00
|
|
|
|
}
|
2014-03-11 14:35:27 -07:00
|
|
|
|
|
2016-11-29 05:20:20 -08:00
|
|
|
|
/* Now that we know how many regular uniforms we'll push, reduce the
|
|
|
|
|
|
* UBO push ranges so we don't exceed the 3DSTATE_CONSTANT limits.
|
|
|
|
|
|
*/
|
|
|
|
|
|
unsigned push_length = DIV_ROUND_UP(stage_prog_data->nr_params, 8);
|
|
|
|
|
|
for (int i = 0; i < 4; i++) {
|
|
|
|
|
|
struct brw_ubo_range *range = &prog_data->ubo_ranges[i];
|
|
|
|
|
|
|
|
|
|
|
|
if (push_length + range->length > 64)
|
|
|
|
|
|
range->length = 64 - push_length;
|
|
|
|
|
|
|
|
|
|
|
|
push_length += range->length;
|
|
|
|
|
|
}
|
|
|
|
|
|
assert(push_length <= 64);
|
|
|
|
|
|
|
2016-09-02 13:53:13 -07:00
|
|
|
|
/* Up until now, the param[] array has been indexed by reg + offset
|
2015-08-18 17:40:02 -07:00
|
|
|
|
* of UNIFORM registers. Move pull constants into pull_param[] and
|
|
|
|
|
|
* condense param[] to only contain the uniforms we chose to push.
|
|
|
|
|
|
*
|
|
|
|
|
|
    * NOTE: Because we are condensing the param[] array, we know that
|
|
|
|
|
|
* push_constant_loc[i] <= i and we can do it in one smooth loop without
|
|
|
|
|
|
* having to make a copy.
|
2014-03-11 14:35:27 -07:00
|
|
|
|
*/
|
|
|
|
|
|
for (unsigned int i = 0; i < uniforms; i++) {
|
2017-09-28 16:25:31 -07:00
|
|
|
|
uint32_t value = param[i];
|
2015-08-18 17:40:02 -07:00
|
|
|
|
if (pull_constant_loc[i] != -1) {
|
|
|
|
|
|
stage_prog_data->pull_param[pull_constant_loc[i]] = value;
|
|
|
|
|
|
} else if (push_constant_loc[i] != -1) {
|
|
|
|
|
|
stage_prog_data->param[push_constant_loc[i]] = value;
|
|
|
|
|
|
}
|
2014-03-11 14:35:27 -07:00
|
|
|
|
}
|
2016-05-05 09:18:07 +02:00
|
|
|
|
ralloc_free(param);
|
i965/fs: Split pull parameter decision making from mechanical demoting.
move_uniform_array_access_to_pull_constants() and setup_pull_constants()
both have two parts:
1. Decide which UNIFORM registers to demote to pull constants, and
assign locations.
2. Mechanically rewrite the instruction stream to pull the uniform
value into a temporary VGRF and use that, eliminating the UNIFORM
file access.
In order to support pull constants in SIMD16 mode, we will need to make
decisions exactly once, but rewrite both instruction streams.
Separating these two tasks will make this easier.
This patch introduces a new helper, demote_pull_constants(), which
takes care of rewriting the instruction stream, in both cases.
For the moment, a single invocation of demote_pull_constants can't
safely handle both reladdr and non-reladdr tasks, since the two callers
still use different names for uniforms due to remove_dead_constants()
remapping of things. So, we get an ugly boolean parameter saying
which to do. This will go away.
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Ian Romanick <ian.d.romanick@intel.com>
Reviewed-by: Eric Anholt <eric@anholt.net>
2014-03-10 13:14:03 -07:00
|
|
|
|
}
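/* To make the chunked push-vs-pull decision above easier to follow, here is
 * a minimal standalone sketch of the same idea with the alignment handling
 * stripped out.  The function name, the flat chunk representation and the
 * slot limit are assumptions made purely for illustration; they are not part
 * of the compiler.
 */
static void
example_partition_constants(const unsigned *chunk_size, unsigned num_chunks,
                            unsigned max_push_slots,
                            int *push_loc, int *pull_loc)
{
   unsigned num_push = 0, num_pull = 0, slot = 0;

   for (unsigned c = 0; c < num_chunks; c++) {
      /* Push the whole chunk while it still fits, otherwise pull it. */
      if (num_push + chunk_size[c] <= max_push_slots) {
         for (unsigned i = 0; i < chunk_size[c]; i++)
            push_loc[slot + i] = (int) num_push++;
      } else {
         for (unsigned i = 0; i < chunk_size[c]; i++)
            pull_loc[slot + i] = (int) num_pull++;
      }
      slot += chunk_size[c];
   }
}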
|
|
|
|
|
|
|
2017-06-02 09:54:31 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::get_pull_locs(const fs_reg &src,
|
|
|
|
|
|
unsigned *out_surf_index,
|
|
|
|
|
|
unsigned *out_pull_index)
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(src.file == UNIFORM);
|
|
|
|
|
|
|
2016-11-29 05:20:20 -08:00
|
|
|
|
if (src.nr >= UBO_START) {
|
|
|
|
|
|
const struct brw_ubo_range *range =
|
|
|
|
|
|
&prog_data->ubo_ranges[src.nr - UBO_START];
|
|
|
|
|
|
|
|
|
|
|
|
/* If this access is in our (reduced) range, use the push data. */
|
|
|
|
|
|
if (src.offset / 32 < range->length)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
*out_surf_index = prog_data->binding_table.ubo_start + range->block;
|
|
|
|
|
|
*out_pull_index = (32 * range->start + src.offset) / 4;
|
2019-09-09 22:21:17 -07:00
|
|
|
|
|
|
|
|
|
|
prog_data->has_ubo_pull = true;
|
2016-11-29 05:20:20 -08:00
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2017-06-02 09:54:31 -07:00
|
|
|
|
const unsigned location = src.nr + src.offset / 4;
|
|
|
|
|
|
|
|
|
|
|
|
if (location < uniforms && pull_constant_loc[location] != -1) {
|
|
|
|
|
|
         /* A regular uniform that was demoted to a pull constant */
|
|
|
|
|
|
*out_surf_index = stage_prog_data->binding_table.pull_constants_start;
|
|
|
|
|
|
*out_pull_index = pull_constant_loc[location];
|
2019-09-09 22:21:17 -07:00
|
|
|
|
|
|
|
|
|
|
prog_data->has_ubo_pull = true;
|
2017-06-02 09:54:31 -07:00
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
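/* Typical use, sketched with made-up variable names rather than actual
 * driver code: callers first ask for a pull location and leave the UNIFORM
 * access untouched when the value was pushed instead.
 *
 *    unsigned surf_index, pull_index;
 *    if (get_pull_locs(inst->src[i], &surf_index, &pull_index)) {
 *       ... emit a pull-constant load from surface surf_index at byte
 *           offset pull_index * 4 and rewrite the source to read it ...
 *    }
 */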
|
|
|
|
|
|
|
2014-03-10 13:14:03 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
|
|
|
|
|
|
* or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
2015-12-08 17:14:49 -08:00
|
|
|
|
fs_visitor::lower_constant_loads()
|
2014-03-10 13:14:03 -07:00
|
|
|
|
{
|
2017-06-02 09:54:31 -07:00
|
|
|
|
unsigned index, pull_index;
|
2015-11-24 15:12:20 -08:00
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
|
|
|
|
|
|
      /* Set up the annotation tracking for newly generated instructions. */
|
|
|
|
|
|
const fs_builder ibld(this, block, inst);
|
|
|
|
|
|
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2010-10-22 12:57:00 -07:00
|
|
|
|
if (inst->src[i].file != UNIFORM)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
2015-11-24 15:12:20 -08:00
|
|
|
|
/* We'll handle this case later */
|
|
|
|
|
|
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
2017-06-02 09:54:31 -07:00
|
|
|
|
if (!get_pull_locs(inst->src[i], &index, &pull_index))
|
2010-10-22 12:57:00 -07:00
|
|
|
|
continue;
|
|
|
|
|
|
|
i965/fs: Fix stride field for uniforms.
This fixes essentially the same problem as for immediates. Registers
of the UNIFORM file are typically accessed according to the formula:
read_uniform(r, channel_index, array_index) =
read_element(r, channel_index * 0 + array_index * 1)
Which matches the general direct addressing formula for stride=0:
read_direct(r, channel_index, array_index) =
read_element(r, channel_index * stride +
array_index * max{1, stride * width})
In either case if reladdr is present the access will be according to
the composition of two register regions, the first one determining the
per-channel array_index used for the second, like:
read_indirect(r, channel_index, array_index) =
read_direct(r, channel_index,
read(r.reladdr, channel_index, array_index))
where:
read(r, channel_index, array_index) = if r.reladdr == NULL
then read_direct(r, channel_index, array_index)
else read_indirect(r, channel_index, array_index)
In conclusion we can handle uniforms consistently with the other
register files if we set stride to zero. After lowering to a GRF
using VARYING_PULL_CONSTANT_LOAD in demote_pull_constant_loads() the
stride of the source is set to one again because the result of
VARYING_PULL_CONSTANT_LOAD is generally non-uniform.
Reviewed-by: Samuel Iglesias Gonsálvez <siglesias@igalia.com>
2015-07-13 15:29:39 +03:00
|
|
|
|
assert(inst->src[i].stride == 0);
|
|
|
|
|
|
|
2016-12-08 19:18:00 -08:00
|
|
|
|
const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
|
|
|
|
|
|
const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
|
|
|
|
|
|
const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
const unsigned base = pull_index * 4;
|
|
|
|
|
|
|
2015-11-24 15:12:20 -08:00
|
|
|
|
ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
|
2016-12-08 19:18:00 -08:00
|
|
|
|
dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
|
2010-10-22 12:57:00 -07:00
|
|
|
|
|
2014-03-10 13:14:03 -07:00
|
|
|
|
/* Rewrite the instruction to use the temporary VGRF. */
|
2015-10-26 17:09:25 -07:00
|
|
|
|
inst->src[i].file = VGRF;
|
2015-10-26 04:35:14 -07:00
|
|
|
|
inst->src[i].nr = dst.nr;
|
2016-12-08 19:18:00 -08:00
|
|
|
|
inst->src[i].offset = (base & (block_sz - 1)) +
|
|
|
|
|
|
inst->src[i].offset % 4;
|
2015-11-24 15:12:20 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
|
|
|
|
|
|
inst->src[0].file == UNIFORM) {
|
|
|
|
|
|
|
2017-06-02 09:54:31 -07:00
|
|
|
|
if (!get_pull_locs(inst->src[0], &index, &pull_index))
|
|
|
|
|
|
continue;
|
2015-11-24 15:12:20 -08:00
|
|
|
|
|
|
|
|
|
|
VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
|
|
|
|
|
|
brw_imm_ud(index),
|
|
|
|
|
|
inst->src[1],
|
|
|
|
|
|
pull_index * 4);
|
|
|
|
|
|
inst->remove(block);
|
2010-10-22 12:57:00 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2014-09-01 10:54:00 -07:00
|
|
|
|
invalidate_live_intervals();
|
2010-10-22 12:57:00 -07:00
|
|
|
|
}
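/* As an illustration of the rewrite above (register numbers are made up),
 * a direct uniform access such as
 *
 *    add(8)  vgrf5:F, vgrf3:F, u4:F
 *
 * ends up as a cacheline-aligned pull load into a temporary plus a read of
 * that temporary at the right sub-register offset:
 *
 *    uniform_pull_constant_load(16) vgrf9:UD, surface, aligned_base
 *    add(8)  vgrf5:F, vgrf3:F, vgrf9+16:F
 *
 * while MOV_INDIRECT sources are handled with VARYING_PULL_CONSTANT_LOAD
 * instead.
 */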
|
|
|
|
|
|
|
2011-07-22 16:45:15 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::opt_algebraic()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
2018-10-11 09:55:38 -07:00
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
2011-07-22 16:45:15 -07:00
|
|
|
|
switch (inst->opcode) {
|
2014-12-21 06:56:54 -08:00
|
|
|
|
case BRW_OPCODE_MOV:
|
2018-10-11 09:55:38 -07:00
|
|
|
|
if (!devinfo->has_64bit_types &&
|
|
|
|
|
|
(inst->dst.type == BRW_REGISTER_TYPE_DF ||
|
|
|
|
|
|
inst->dst.type == BRW_REGISTER_TYPE_UQ ||
|
|
|
|
|
|
inst->dst.type == BRW_REGISTER_TYPE_Q)) {
|
|
|
|
|
|
assert(inst->dst.type == inst->src[0].type);
|
|
|
|
|
|
assert(!inst->saturate);
|
|
|
|
|
|
assert(!inst->src[0].abs);
|
|
|
|
|
|
assert(!inst->src[0].negate);
|
|
|
|
|
|
const brw::fs_builder ibld(this, block, inst);
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->src[0].file == IMM) {
|
|
|
|
|
|
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
|
|
|
|
|
|
brw_imm_ud(inst->src[0].u64 >> 32));
|
|
|
|
|
|
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
|
|
|
|
|
|
brw_imm_ud(inst->src[0].u64));
|
|
|
|
|
|
} else {
|
|
|
|
|
|
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
|
|
|
|
|
|
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1));
|
|
|
|
|
|
ibld.MOV(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
|
|
|
|
|
|
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2018-06-22 08:34:03 -07:00
|
|
|
|
if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
|
|
|
|
|
|
inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
|
|
|
|
|
|
inst->dst.is_null() &&
|
|
|
|
|
|
(inst->src[0].abs || inst->src[0].negate)) {
|
|
|
|
|
|
inst->src[0].abs = false;
|
|
|
|
|
|
inst->src[0].negate = false;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-12-21 06:56:54 -08:00
|
|
|
|
if (inst->src[0].file != IMM)
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->saturate) {
|
2018-06-26 19:21:43 -07:00
|
|
|
|
/* Full mixed-type saturates don't happen. However, we can end up
|
|
|
|
|
|
* with things like:
|
|
|
|
|
|
*
|
|
|
|
|
|
* mov.sat(8) g21<1>DF -1F
|
|
|
|
|
|
*
|
|
|
|
|
|
* Other mixed-size-but-same-base-type cases may also be possible.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (inst->dst.type != inst->src[0].type &&
|
|
|
|
|
|
inst->dst.type != BRW_REGISTER_TYPE_DF &&
|
|
|
|
|
|
inst->src[0].type != BRW_REGISTER_TYPE_F)
|
2014-12-21 06:56:54 -08:00
|
|
|
|
assert(!"unimplemented: saturate mixed types");
|
|
|
|
|
|
|
2018-06-26 19:21:43 -07:00
|
|
|
|
if (brw_saturate_immediate(inst->src[0].type,
|
2015-11-19 21:51:37 -08:00
|
|
|
|
&inst->src[0].as_brw_reg())) {
|
2014-12-21 06:56:54 -08:00
|
|
|
|
inst->saturate = false;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
2011-07-22 16:45:15 -07:00
|
|
|
|
case BRW_OPCODE_MUL:
|
2018-11-27 09:43:12 +01:00
|
|
|
|
if (inst->src[1].file != IMM)
|
|
|
|
|
|
continue;
|
2011-07-22 16:45:15 -07:00
|
|
|
|
|
2018-11-27 09:43:12 +01:00
|
|
|
|
/* a * 1.0 = a */
|
|
|
|
|
|
if (inst->src[1].is_one()) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2011-07-22 16:45:15 -07:00
|
|
|
|
|
2015-02-04 18:08:30 -08:00
|
|
|
|
/* a * -1.0 = -a */
|
|
|
|
|
|
if (inst->src[1].is_negative_one()) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[0].negate = !inst->src[0].negate;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-11-09 17:27:52 -08:00
|
|
|
|
if (inst->src[0].file == IMM) {
|
|
|
|
|
|
assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
2015-10-24 14:55:57 -07:00
|
|
|
|
inst->src[0].f *= inst->src[1].f;
|
2014-11-09 17:27:52 -08:00
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2018-11-27 09:43:12 +01:00
|
|
|
|
break;
|
2012-09-20 11:06:07 +02:00
|
|
|
|
case BRW_OPCODE_ADD:
|
|
|
|
|
|
if (inst->src[1].file != IMM)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
2014-11-09 17:27:52 -08:00
|
|
|
|
if (inst->src[0].file == IMM) {
|
|
|
|
|
|
assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
2015-10-24 14:55:57 -07:00
|
|
|
|
inst->src[0].f += inst->src[1].f;
|
2014-11-09 17:27:52 -08:00
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2012-09-20 11:06:07 +02:00
|
|
|
|
break;
|
2013-10-27 19:34:48 -07:00
|
|
|
|
case BRW_OPCODE_OR:
|
2018-06-13 12:32:27 -07:00
|
|
|
|
if (inst->src[0].equals(inst->src[1]) ||
|
|
|
|
|
|
inst->src[1].is_zero()) {
|
2018-12-12 18:14:34 -08:00
|
|
|
|
/* On Gen8+, the OR instruction can have a source modifier that
|
|
|
|
|
|
* performs logical not on the operand. Cases of 'OR r0, ~r1, 0'
|
|
|
|
|
|
* or 'OR r0, ~r1, ~r1' should become a NOT instead of a MOV.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (inst->src[0].negate) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_NOT;
|
|
|
|
|
|
inst->src[0].negate = false;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
}
|
2013-10-27 19:34:48 -07:00
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
2015-01-05 13:51:03 -08:00
|
|
|
|
case BRW_OPCODE_CMP:
|
2018-06-22 08:34:03 -07:00
|
|
|
|
if ((inst->conditional_mod == BRW_CONDITIONAL_Z ||
|
|
|
|
|
|
inst->conditional_mod == BRW_CONDITIONAL_NZ) &&
|
|
|
|
|
|
inst->src[1].is_zero() &&
|
|
|
|
|
|
(inst->src[0].abs || inst->src[0].negate)) {
|
2015-01-05 13:51:03 -08:00
|
|
|
|
inst->src[0].abs = false;
|
|
|
|
|
|
inst->src[0].negate = false;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
2013-10-27 20:03:48 -07:00
|
|
|
|
case BRW_OPCODE_SEL:
|
2018-10-11 09:55:38 -07:00
|
|
|
|
if (!devinfo->has_64bit_types &&
|
|
|
|
|
|
(inst->dst.type == BRW_REGISTER_TYPE_DF ||
|
|
|
|
|
|
inst->dst.type == BRW_REGISTER_TYPE_UQ ||
|
|
|
|
|
|
inst->dst.type == BRW_REGISTER_TYPE_Q)) {
|
|
|
|
|
|
assert(inst->dst.type == inst->src[0].type);
|
|
|
|
|
|
assert(!inst->saturate);
|
|
|
|
|
|
assert(!inst->src[0].abs && !inst->src[0].negate);
|
|
|
|
|
|
assert(!inst->src[1].abs && !inst->src[1].negate);
|
|
|
|
|
|
const brw::fs_builder ibld(this, block, inst);
|
|
|
|
|
|
|
|
|
|
|
|
set_predicate(inst->predicate,
|
|
|
|
|
|
ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 0),
|
|
|
|
|
|
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
|
|
|
|
|
|
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0)));
|
|
|
|
|
|
set_predicate(inst->predicate,
|
|
|
|
|
|
ibld.SEL(subscript(inst->dst, BRW_REGISTER_TYPE_UD, 1),
|
|
|
|
|
|
subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
|
|
|
|
|
|
subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1)));
|
|
|
|
|
|
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
2014-04-18 10:01:41 -07:00
|
|
|
|
if (inst->src[0].equals(inst->src[1])) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
inst->predicate = BRW_PREDICATE_NONE;
|
|
|
|
|
|
inst->predicate_inverse = false;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
} else if (inst->saturate && inst->src[1].file == IMM) {
|
2013-10-27 20:03:48 -07:00
|
|
|
|
switch (inst->conditional_mod) {
|
|
|
|
|
|
case BRW_CONDITIONAL_LE:
|
|
|
|
|
|
case BRW_CONDITIONAL_L:
|
|
|
|
|
|
switch (inst->src[1].type) {
|
|
|
|
|
|
case BRW_REGISTER_TYPE_F:
|
2015-10-24 14:55:57 -07:00
|
|
|
|
if (inst->src[1].f >= 1.0f) {
|
2013-10-27 20:03:48 -07:00
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
2015-02-10 21:36:26 -08:00
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NONE;
|
2013-10-27 20:03:48 -07:00
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
2013-10-27 21:26:36 -07:00
|
|
|
|
case BRW_CONDITIONAL_GE:
|
|
|
|
|
|
case BRW_CONDITIONAL_G:
|
|
|
|
|
|
switch (inst->src[1].type) {
|
|
|
|
|
|
case BRW_REGISTER_TYPE_F:
|
2015-10-24 14:55:57 -07:00
|
|
|
|
if (inst->src[1].f <= 0.0f) {
|
2013-10-27 21:26:36 -07:00
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NONE;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2013-10-27 20:03:48 -07:00
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
2014-11-09 17:27:52 -08:00
|
|
|
|
case BRW_OPCODE_MAD:
|
2019-02-12 09:34:10 +01:00
|
|
|
|
if (inst->src[0].type != BRW_REGISTER_TYPE_F ||
|
|
|
|
|
|
inst->src[1].type != BRW_REGISTER_TYPE_F ||
|
|
|
|
|
|
inst->src[2].type != BRW_REGISTER_TYPE_F)
|
|
|
|
|
|
break;
|
2019-02-12 12:43:30 +01:00
|
|
|
|
if (inst->src[1].is_one()) {
|
2014-11-09 17:27:52 -08:00
|
|
|
|
inst->opcode = BRW_OPCODE_ADD;
|
|
|
|
|
|
inst->src[1] = inst->src[2];
|
|
|
|
|
|
inst->src[2] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
} else if (inst->src[2].is_one()) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_ADD;
|
|
|
|
|
|
inst->src[2] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
2015-02-19 14:52:24 +02:00
|
|
|
|
case SHADER_OPCODE_BROADCAST:
|
|
|
|
|
|
if (is_uniform(inst->src[0])) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->sources = 1;
|
|
|
|
|
|
inst->force_writemask_all = true;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
} else if (inst->src[1].file == IMM) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
2017-09-01 22:30:53 -07:00
|
|
|
|
/* It's possible that the selected component will be too large and
|
|
|
|
|
|
* overflow the register. This can happen if someone does a
|
|
|
|
|
|
* readInvocation() from GLSL or SPIR-V and provides an OOB
|
|
|
|
|
|
            * invocationIndex. If this happens and we somehow manage
|
|
|
|
|
|
* to constant fold it in and get here, then component() may cause
|
|
|
|
|
|
* us to start reading outside of the VGRF which will lead to an
|
|
|
|
|
|
* assert later. Instead, just let it wrap around if it goes over
|
|
|
|
|
|
* exec_size.
|
|
|
|
|
|
*/
|
|
|
|
|
|
const unsigned comp = inst->src[1].ud & (inst->exec_size - 1);
|
|
|
|
|
|
inst->src[0] = component(inst->src[0], comp);
|
2015-02-19 14:52:24 +02:00
|
|
|
|
inst->sources = 1;
|
|
|
|
|
|
inst->force_writemask_all = true;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
2017-08-29 09:21:32 -07:00
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_SHUFFLE:
|
|
|
|
|
|
if (is_uniform(inst->src[0])) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->sources = 1;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
} else if (inst->src[1].file == IMM) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[0] = component(inst->src[0],
|
|
|
|
|
|
inst->src[1].ud);
|
|
|
|
|
|
inst->sources = 1;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
2015-02-19 14:52:24 +02:00
|
|
|
|
|
2011-05-03 10:55:50 -07:00
|
|
|
|
default:
|
2011-07-22 16:45:15 -07:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-03-16 10:08:08 +02:00
|
|
|
|
/* Swap if src[0] is immediate. */
|
|
|
|
|
|
if (progress && inst->is_commutative()) {
|
|
|
|
|
|
if (inst->src[0].file == IMM) {
|
|
|
|
|
|
fs_reg tmp = inst->src[1];
|
|
|
|
|
|
inst->src[1] = inst->src[0];
|
|
|
|
|
|
inst->src[0] = tmp;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2011-07-22 16:45:15 -07:00
|
|
|
|
return progress;
|
|
|
|
|
|
}
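/* A small example of what opt_algebraic does (register numbers made up):
 *
 *    mul(8)       vgrf7:F, vgrf2:F, 1.0f
 *    cmp.nz.f0(8) null:F, (abs)vgrf3:F, 0f
 *
 * becomes
 *
 *    mov(8)       vgrf7:F, vgrf2:F
 *    cmp.nz.f0(8) null:F, vgrf3:F, 0f
 *
 * since multiplying by 1.0 is an identity and abs/negate cannot change the
 * outcome of an equality comparison against zero.
 */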
|
|
|
|
|
|
|
2015-04-23 16:56:53 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Optimize sample messages that have constant zero values for the trailing
|
|
|
|
|
|
* texture coordinates. We can just reduce the message length for these
|
|
|
|
|
|
* instructions instead of reserving a register for it. Trailing parameters
|
|
|
|
|
|
* that aren't sent default to zero anyway. This will cause the dead code
|
|
|
|
|
|
* eliminator to remove the MOV instruction that would otherwise be emitted to
|
|
|
|
|
|
* set up the zero value.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::opt_zero_samples()
|
|
|
|
|
|
{
|
|
|
|
|
|
/* Gen4 infers the texturing opcode based on the message length so we can't
|
|
|
|
|
|
* change it.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (devinfo->gen < 5)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
|
|
|
|
|
if (!inst->is_tex())
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
fs_inst *load_payload = (fs_inst *) inst->prev;
|
|
|
|
|
|
|
|
|
|
|
|
if (load_payload->is_head_sentinel() ||
|
|
|
|
|
|
load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
2015-05-08 16:13:52 +01:00
|
|
|
|
/* We don't want to remove the message header or the first parameter.
|
|
|
|
|
|
       * Removing the first parameter is not allowed; see the Haswell PRM
|
|
|
|
|
|
* volume 7, page 149:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "Parameter 0 is required except for the sampleinfo message, which
|
|
|
|
|
|
* has no parameter 0"
|
2015-04-23 16:56:53 -07:00
|
|
|
|
*/
|
2015-07-13 15:33:04 +03:00
|
|
|
|
while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
|
2015-03-24 10:17:32 -07:00
|
|
|
|
load_payload->src[(inst->mlen - inst->header_size) /
|
2015-07-13 15:33:04 +03:00
|
|
|
|
(inst->exec_size / 8) +
|
2015-03-24 10:17:32 -07:00
|
|
|
|
inst->header_size - 1].is_zero()) {
|
2015-07-13 15:33:04 +03:00
|
|
|
|
inst->mlen -= inst->exec_size / 8;
|
2015-04-23 16:56:53 -07:00
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
|
invalidate_live_intervals();
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
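/* For instance (illustrative payload layout), a SIMD8 sample whose payload
 * was assembled as
 *
 *    load_payload(8) vgrf10, u, v, 0f
 *    send(8)         ... mlen 3 ...
 *
 * can have its message length reduced to 2: the trailing zero coordinate is
 * simply not sent, and the hardware substitutes zero for the missing
 * parameter.
 */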
|
|
|
|
|
|
|
2015-02-08 13:59:57 -08:00
|
|
|
|
/**
|
|
|
|
|
|
* Optimize sample messages which are followed by the final RT write.
|
|
|
|
|
|
*
|
|
|
|
|
|
 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
|
|
|
|
|
|
* results sent directly to the framebuffer, bypassing the EU. Recognize the
|
|
|
|
|
|
* final texturing results copied to the framebuffer write payload and modify
|
|
|
|
|
|
* them to write to the framebuffer directly.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::opt_sampler_eot()
|
|
|
|
|
|
{
|
|
|
|
|
|
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
|
|
|
|
|
|
|
2016-04-25 17:08:42 -07:00
|
|
|
|
if (stage != MESA_SHADER_FRAGMENT || dispatch_width > 16)
|
2015-04-28 14:20:06 +01:00
|
|
|
|
return false;
|
|
|
|
|
|
|
2017-06-27 15:16:35 -07:00
|
|
|
|
if (devinfo->gen != 9 && !devinfo->is_cherryview)
|
2015-02-08 13:59:57 -08:00
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
/* FINISHME: It should be possible to implement this optimization when there
|
|
|
|
|
|
* are multiple drawbuffers.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (key->nr_color_regions != 1)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
2016-05-20 00:38:17 -07:00
|
|
|
|
/* Requires emitting a bunch of saturating MOV instructions during logical
|
|
|
|
|
|
* send lowering to clamp the color payload, which the sampler unit isn't
|
|
|
|
|
|
* going to do for us.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (key->clamp_fragment_color)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
2015-02-08 13:59:57 -08:00
|
|
|
|
/* Look for a texturing instruction immediately before the final FB_WRITE. */
|
2015-07-27 18:15:44 +03:00
|
|
|
|
bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
|
|
|
|
|
|
fs_inst *fb_write = (fs_inst *)block->end();
|
2015-02-08 13:59:57 -08:00
|
|
|
|
assert(fb_write->eot);
|
2016-05-20 00:38:17 -07:00
|
|
|
|
assert(fb_write->opcode == FS_OPCODE_FB_WRITE_LOGICAL);
|
2015-02-08 13:59:57 -08:00
|
|
|
|
|
|
|
|
|
|
/* There wasn't one; nothing to do. */
|
2016-04-30 14:24:31 -07:00
|
|
|
|
if (unlikely(fb_write->prev->is_head_sentinel()))
|
2015-02-08 13:59:57 -08:00
|
|
|
|
return false;
|
|
|
|
|
|
|
2016-04-30 14:24:31 -07:00
|
|
|
|
fs_inst *tex_inst = (fs_inst *) fb_write->prev;
|
|
|
|
|
|
|
2015-10-20 11:56:15 +02:00
|
|
|
|
/* 3D Sampler » Messages » Message Format
|
|
|
|
|
|
*
|
|
|
|
|
|
* “Response Length of zero is allowed on all SIMD8* and SIMD16* sampler
|
|
|
|
|
|
* messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*”
|
2015-05-08 17:35:18 +01:00
|
|
|
|
*/
|
2016-05-20 00:38:17 -07:00
|
|
|
|
if (tex_inst->opcode != SHADER_OPCODE_TEX_LOGICAL &&
|
|
|
|
|
|
tex_inst->opcode != SHADER_OPCODE_TXD_LOGICAL &&
|
|
|
|
|
|
tex_inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
|
|
|
|
|
|
tex_inst->opcode != SHADER_OPCODE_TXL_LOGICAL &&
|
|
|
|
|
|
tex_inst->opcode != FS_OPCODE_TXB_LOGICAL &&
|
|
|
|
|
|
tex_inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL &&
|
|
|
|
|
|
tex_inst->opcode != SHADER_OPCODE_TXF_CMS_W_LOGICAL &&
|
|
|
|
|
|
tex_inst->opcode != SHADER_OPCODE_TXF_UMS_LOGICAL)
|
2015-05-08 17:35:18 +01:00
|
|
|
|
return false;
|
|
|
|
|
|
|
2016-05-20 00:38:17 -07:00
|
|
|
|
/* XXX - This shouldn't be necessary. */
|
2016-04-30 14:24:31 -07:00
|
|
|
|
if (tex_inst->prev->is_head_sentinel())
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
2016-05-20 00:38:17 -07:00
|
|
|
|
/* Check that the FB write sources are fully initialized by the single
|
|
|
|
|
|
* texturing instruction.
|
|
|
|
|
|
*/
|
|
|
|
|
|
for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) {
|
|
|
|
|
|
if (i == FB_WRITE_LOGICAL_SRC_COLOR0) {
|
|
|
|
|
|
if (!fb_write->src[i].equals(tex_inst->dst) ||
|
2016-09-07 17:00:07 -07:00
|
|
|
|
fb_write->size_read(i) != tex_inst->size_written)
|
2016-05-20 00:38:17 -07:00
|
|
|
|
return false;
|
|
|
|
|
|
} else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS) {
|
|
|
|
|
|
if (fb_write->src[i].file != BAD_FILE)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2015-02-08 13:59:57 -08:00
|
|
|
|
|
|
|
|
|
|
assert(!tex_inst->eot); /* We can't get here twice */
|
|
|
|
|
|
assert((tex_inst->offset & (0xff << 24)) == 0);
|
|
|
|
|
|
|
2015-07-27 18:15:44 +03:00
|
|
|
|
const fs_builder ibld(this, block, tex_inst);
|
|
|
|
|
|
|
2015-02-08 13:59:57 -08:00
|
|
|
|
tex_inst->offset |= fb_write->target << 24;
|
|
|
|
|
|
tex_inst->eot = true;
|
2015-07-27 18:15:44 +03:00
|
|
|
|
tex_inst->dst = ibld.null_reg_ud();
|
2016-09-07 13:38:20 -07:00
|
|
|
|
tex_inst->size_written = 0;
|
2015-02-08 13:59:57 -08:00
|
|
|
|
fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
|
|
|
|
|
|
|
2016-05-20 00:38:17 -07:00
|
|
|
|
   /* Marking EOT is sufficient; lower_logical_sends() will notice the EOT
|
|
|
|
|
|
* flag and submit a header together with the sampler message as required
|
|
|
|
|
|
* by the hardware.
|
2015-02-08 13:59:57 -08:00
|
|
|
|
*/
|
2016-03-11 15:22:56 -08:00
|
|
|
|
invalidate_live_intervals();
|
2015-02-08 13:59:57 -08:00
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-04-14 15:01:37 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::opt_register_renaming()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
int depth = 0;
|
|
|
|
|
|
|
2018-12-10 14:49:49 -08:00
|
|
|
|
unsigned remap[alloc.count];
|
|
|
|
|
|
memset(remap, ~0u, sizeof(unsigned) * alloc.count);
|
2014-04-14 15:01:37 -07:00
|
|
|
|
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2014-04-14 15:01:37 -07:00
|
|
|
|
if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
|
|
|
|
|
|
depth++;
|
|
|
|
|
|
} else if (inst->opcode == BRW_OPCODE_ENDIF ||
|
|
|
|
|
|
inst->opcode == BRW_OPCODE_WHILE) {
|
|
|
|
|
|
depth--;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Rewrite instruction sources. */
|
|
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (inst->src[i].file == VGRF &&
|
2018-12-10 14:49:49 -08:00
|
|
|
|
remap[inst->src[i].nr] != ~0u &&
|
2015-10-26 04:35:14 -07:00
|
|
|
|
remap[inst->src[i].nr] != inst->src[i].nr) {
|
|
|
|
|
|
inst->src[i].nr = remap[inst->src[i].nr];
|
2014-04-14 15:01:37 -07:00
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2018-12-10 14:49:49 -08:00
|
|
|
|
const unsigned dst = inst->dst.nr;
|
2014-04-14 15:01:37 -07:00
|
|
|
|
|
|
|
|
|
|
if (depth == 0 &&
|
2015-10-26 17:09:25 -07:00
|
|
|
|
inst->dst.file == VGRF &&
|
2016-09-07 13:38:20 -07:00
|
|
|
|
alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
|
2019-04-24 12:38:28 +02:00
|
|
|
|
!inst->is_partial_write()) {
|
2018-12-10 14:49:49 -08:00
|
|
|
|
if (remap[dst] == ~0u) {
|
2014-04-14 15:01:37 -07:00
|
|
|
|
remap[dst] = dst;
|
|
|
|
|
|
} else {
|
2016-09-07 16:59:35 -07:00
|
|
|
|
remap[dst] = alloc.allocate(regs_written(inst));
|
2015-10-26 04:35:14 -07:00
|
|
|
|
inst->dst.nr = remap[dst];
|
2014-04-14 15:01:37 -07:00
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
2015-10-26 17:09:25 -07:00
|
|
|
|
} else if (inst->dst.file == VGRF &&
|
2018-12-10 14:49:49 -08:00
|
|
|
|
remap[dst] != ~0u &&
|
2014-04-14 15:01:37 -07:00
|
|
|
|
remap[dst] != dst) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
inst->dst.nr = remap[dst];
|
2014-04-14 15:01:37 -07:00
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress) {
|
2014-09-01 10:54:00 -07:00
|
|
|
|
invalidate_live_intervals();
|
2014-04-14 15:01:37 -07:00
|
|
|
|
|
2015-04-06 17:44:40 -07:00
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
|
2018-12-10 14:49:49 -08:00
|
|
|
|
if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != ~0u) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
delta_xy[i].nr = remap[delta_xy[i].nr];
|
2014-04-14 15:01:37 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-07-05 22:10:41 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Remove redundant or useless discard jumps.
|
|
|
|
|
|
*
|
|
|
|
|
|
* For example, we can eliminate jumps in the following sequence:
|
|
|
|
|
|
*
|
|
|
|
|
|
* discard-jump (redundant with the next jump)
|
|
|
|
|
|
* discard-jump (useless; jumps to the next instruction)
|
|
|
|
|
|
* placeholder-halt
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::opt_redundant_discard_jumps()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
|
|
bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
|
|
|
|
|
|
|
|
|
|
|
|
fs_inst *placeholder_halt = NULL;
|
|
|
|
|
|
foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
|
|
|
|
|
|
if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
|
|
|
|
|
|
placeholder_halt = inst;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (!placeholder_halt)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
/* Delete any HALTs immediately before the placeholder halt. */
|
|
|
|
|
|
for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
|
|
|
|
|
|
!prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
|
|
|
|
|
|
prev = (fs_inst *) placeholder_halt->prev) {
|
|
|
|
|
|
prev->remove(last_bblock);
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
|
invalidate_live_intervals();
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-27 16:03:34 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Compute a bitmask with GRF granularity with a bit set for each GRF starting
|
2016-09-02 15:21:26 -07:00
|
|
|
|
* from \p r.offset which overlaps the region starting at \p s.offset and
|
|
|
|
|
|
* spanning \p ds bytes.
|
2016-05-27 16:03:34 -07:00
|
|
|
|
*/
|
|
|
|
|
|
static inline unsigned
|
2016-09-02 15:21:26 -07:00
|
|
|
|
mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
|
2016-05-27 16:03:34 -07:00
|
|
|
|
{
|
2016-09-02 15:21:26 -07:00
|
|
|
|
const int rel_offset = reg_offset(s) - reg_offset(r);
|
|
|
|
|
|
const int shift = rel_offset / REG_SIZE;
|
|
|
|
|
|
const unsigned n = DIV_ROUND_UP(rel_offset % REG_SIZE + ds, REG_SIZE);
|
2016-05-27 16:03:34 -07:00
|
|
|
|
assert(reg_space(r) == reg_space(s) &&
|
2016-09-02 15:21:26 -07:00
|
|
|
|
shift >= 0 && shift < int(8 * sizeof(unsigned)));
|
|
|
|
|
|
return ((1 << n) - 1) << shift;
|
2016-05-27 16:03:34 -07:00
|
|
|
|
}
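/* Worked example with REG_SIZE = 32 bytes: if r starts at byte offset 0 of
 * its VGRF and s starts at byte offset 40 with ds = 28, then rel_offset is
 * 40, shift is 1 and n is DIV_ROUND_UP(8 + 28, 32) = 2, so the result is
 * 0b110: GRFs 1 and 2 of the region rooted at r are covered.
 */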
|
|
|
|
|
|
|
2018-02-21 18:06:56 -08:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::opt_peephole_csel()
|
|
|
|
|
|
{
|
|
|
|
|
|
if (devinfo->gen < 8)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
|
|
foreach_block_reverse(block, cfg) {
|
|
|
|
|
|
int ip = block->end_ip + 1;
|
|
|
|
|
|
|
|
|
|
|
|
foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
|
|
|
|
|
|
ip--;
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->opcode != BRW_OPCODE_SEL ||
|
|
|
|
|
|
inst->predicate != BRW_PREDICATE_NORMAL ||
|
|
|
|
|
|
(inst->dst.type != BRW_REGISTER_TYPE_F &&
|
|
|
|
|
|
inst->dst.type != BRW_REGISTER_TYPE_D &&
|
|
|
|
|
|
inst->dst.type != BRW_REGISTER_TYPE_UD))
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
/* Because it is a 3-src instruction, CSEL cannot have an immediate
|
|
|
|
|
|
* value as a source, but we can sometimes handle zero.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if ((inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
|
|
|
|
|
|
inst->src[0].file != UNIFORM) ||
|
|
|
|
|
|
(inst->src[1].file != VGRF && inst->src[1].file != ATTR &&
|
|
|
|
|
|
inst->src[1].file != UNIFORM && !inst->src[1].is_zero()))
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
|
|
|
|
|
|
if (!scan_inst->flags_written())
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
if ((scan_inst->opcode != BRW_OPCODE_CMP &&
|
|
|
|
|
|
scan_inst->opcode != BRW_OPCODE_MOV) ||
|
|
|
|
|
|
scan_inst->predicate != BRW_PREDICATE_NONE ||
|
|
|
|
|
|
(scan_inst->src[0].file != VGRF &&
|
|
|
|
|
|
scan_inst->src[0].file != ATTR &&
|
|
|
|
|
|
scan_inst->src[0].file != UNIFORM) ||
|
|
|
|
|
|
scan_inst->src[0].type != BRW_REGISTER_TYPE_F)
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
if (scan_inst->opcode == BRW_OPCODE_CMP && !scan_inst->src[1].is_zero())
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
const brw::fs_builder ibld(this, block, inst);
|
|
|
|
|
|
|
|
|
|
|
|
const enum brw_conditional_mod cond =
|
|
|
|
|
|
inst->predicate_inverse
|
|
|
|
|
|
? brw_negate_cmod(scan_inst->conditional_mod)
|
|
|
|
|
|
: scan_inst->conditional_mod;
|
|
|
|
|
|
|
|
|
|
|
|
fs_inst *csel_inst = NULL;
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->src[1].file != IMM) {
|
|
|
|
|
|
csel_inst = ibld.CSEL(inst->dst,
|
|
|
|
|
|
inst->src[0],
|
|
|
|
|
|
inst->src[1],
|
|
|
|
|
|
scan_inst->src[0],
|
|
|
|
|
|
cond);
|
|
|
|
|
|
} else if (cond == BRW_CONDITIONAL_NZ) {
|
|
|
|
|
|
/* Consider the sequence
|
|
|
|
|
|
*
|
|
|
|
|
|
* cmp.nz.f0 null<1>F g3<8,8,1>F 0F
|
|
|
|
|
|
* (+f0) sel g124<1>UD g2<8,8,1>UD 0x00000000UD
|
|
|
|
|
|
*
|
|
|
|
|
|
             * The sel will pick the immediate value 0 if g3 is ±0.0.
|
|
|
|
|
|
* Therefore, this sequence is equivalent:
|
|
|
|
|
|
*
|
|
|
|
|
|
* cmp.nz.f0 null<1>F g3<8,8,1>F 0F
|
|
|
|
|
|
* (+f0) sel g124<1>F g2<8,8,1>F (abs)g3<8,8,1>F
|
|
|
|
|
|
*
|
|
|
|
|
|
             * The abs ensures that the result is 0UD when g3 is -0.0F.
|
|
|
|
|
|
* By normal cmp-sel merging, this is also equivalent:
|
|
|
|
|
|
*
|
|
|
|
|
|
* csel.nz g124<1>F g2<4,4,1>F (abs)g3<4,4,1>F g3<4,4,1>F
|
|
|
|
|
|
*/
|
|
|
|
|
|
csel_inst = ibld.CSEL(inst->dst,
|
|
|
|
|
|
inst->src[0],
|
|
|
|
|
|
scan_inst->src[0],
|
|
|
|
|
|
scan_inst->src[0],
|
|
|
|
|
|
cond);
|
|
|
|
|
|
|
|
|
|
|
|
csel_inst->src[1].abs = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (csel_inst != NULL) {
|
|
|
|
|
|
progress = true;
|
2019-03-11 19:00:21 -07:00
|
|
|
|
csel_inst->saturate = inst->saturate;
|
2018-02-21 18:06:56 -08:00
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2010-10-08 14:00:14 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::compute_to_mrf()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
2014-07-15 12:56:37 -07:00
|
|
|
|
int next_ip = 0;
|
2010-10-08 14:00:14 -07:00
|
|
|
|
|
2014-10-29 14:21:14 -07:00
|
|
|
|
/* No MRFs on Gen >= 7. */
|
2015-04-15 18:00:05 -07:00
|
|
|
|
if (devinfo->gen >= 7)
|
2014-10-29 14:21:14 -07:00
|
|
|
|
return false;
|
|
|
|
|
|
|
2011-01-12 10:10:01 -08:00
|
|
|
|
calculate_live_intervals();
|
|
|
|
|
|
|
2014-09-03 23:52:26 -07:00
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
2010-10-08 14:00:14 -07:00
|
|
|
|
int ip = next_ip;
|
|
|
|
|
|
next_ip++;
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->opcode != BRW_OPCODE_MOV ||
|
2019-04-24 12:38:28 +02:00
|
|
|
|
inst->is_partial_write() ||
|
2015-10-26 17:09:25 -07:00
|
|
|
|
inst->dst.file != MRF || inst->src[0].file != VGRF ||
|
2010-10-08 14:00:14 -07:00
|
|
|
|
inst->dst.type != inst->src[0].type ||
|
2013-12-08 04:57:08 +01:00
|
|
|
|
inst->src[0].abs || inst->src[0].negate ||
|
2014-01-15 22:21:30 +01:00
|
|
|
|
!inst->src[0].is_contiguous() ||
|
2016-09-01 15:11:21 -07:00
|
|
|
|
inst->src[0].offset % REG_SIZE != 0)
|
2010-10-08 14:00:14 -07:00
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
/* Can't compute-to-MRF this GRF if someone else was going to
|
|
|
|
|
|
* read it later.
|
|
|
|
|
|
*/
|
2015-10-26 04:35:14 -07:00
|
|
|
|
if (this->virtual_grf_end[inst->src[0].nr] > ip)
|
2010-10-08 14:00:14 -07:00
|
|
|
|
continue;
|
|
|
|
|
|
|
2016-05-27 16:03:34 -07:00
|
|
|
|
/* Found a move of a GRF to a MRF. Let's see if we can go rewrite the
|
|
|
|
|
|
* things that computed the value of all GRFs of the source region. The
|
|
|
|
|
|
* regs_left bitset keeps track of the registers we haven't yet found a
|
|
|
|
|
|
* generating instruction for.
|
2010-10-08 14:00:14 -07:00
|
|
|
|
*/
|
2016-09-07 16:59:35 -07:00
|
|
|
|
unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
|
2016-05-27 13:15:55 -07:00
|
|
|
|
|
2015-10-20 11:16:00 +02:00
|
|
|
|
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
|
2016-09-07 13:38:20 -07:00
|
|
|
|
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
|
2016-09-07 17:00:07 -07:00
|
|
|
|
inst->src[0], inst->size_read(0))) {
|
2010-10-08 14:00:14 -07:00
|
|
|
|
/* Found the last thing to write our reg we want to turn
|
|
|
|
|
|
* into a compute-to-MRF.
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
2012-06-04 08:59:00 -07:00
|
|
|
|
/* If this one instruction didn't populate all the
|
|
|
|
|
|
* channels, bail. We might be able to rewrite everything
|
2011-03-28 16:54:39 -07:00
|
|
|
|
* that writes that reg, but it would require smarter
|
2016-05-27 13:15:55 -07:00
|
|
|
|
* tracking.
|
2010-10-08 14:00:14 -07:00
|
|
|
|
*/
|
2019-04-24 12:38:28 +02:00
|
|
|
|
if (scan_inst->is_partial_write())
|
2010-10-08 14:00:14 -07:00
|
|
|
|
break;
|
|
|
|
|
|
|
2016-05-27 16:41:35 -07:00
|
|
|
|
/* Handling things not fully contained in the source of the copy
|
|
|
|
|
|
* would need us to understand coalescing out more than one MOV at
|
|
|
|
|
|
* a time.
|
2014-07-15 12:56:37 -07:00
|
|
|
|
*/
|
2016-09-01 20:06:40 -07:00
|
|
|
|
if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
|
|
|
|
|
|
inst->src[0], inst->size_read(0)))
|
2014-07-15 12:56:37 -07:00
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
/* SEND instructions can't have MRF as a destination. */
|
|
|
|
|
|
if (scan_inst->mlen)
|
2010-10-08 14:00:14 -07:00
|
|
|
|
break;
|
|
|
|
|
|
|
2015-04-15 18:00:05 -07:00
|
|
|
|
if (devinfo->gen == 6) {
|
2010-10-11 13:38:38 -07:00
|
|
|
|
/* gen6 math instructions must have the destination be
|
|
|
|
|
|
             * a GRF, so no compute-to-MRF for them.
|
|
|
|
|
|
*/
|
2011-01-18 22:48:11 -08:00
|
|
|
|
if (scan_inst->is_math()) {
|
2010-10-11 13:38:38 -07:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-27 16:03:34 -07:00
|
|
|
|
/* Clear the bits for any registers this instruction overwrites. */
|
|
|
|
|
|
regs_left &= ~mask_relative_to(
|
2016-09-02 15:21:26 -07:00
|
|
|
|
inst->src[0], scan_inst->dst, scan_inst->size_written);
|
2016-05-27 16:03:34 -07:00
|
|
|
|
if (!regs_left)
|
|
|
|
|
|
break;
|
2010-11-18 15:03:50 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2013-02-05 15:36:18 -08:00
|
|
|
|
/* We don't handle control flow here. Most computation of
|
2010-11-18 15:03:50 +08:00
|
|
|
|
       * values that end up in MRFs happens shortly before the MRF
|
|
|
|
|
|
* write anyway.
|
|
|
|
|
|
*/
|
2014-09-01 15:01:23 -07:00
|
|
|
|
if (block->start() == scan_inst)
|
2010-11-18 15:03:50 +08:00
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
/* You can't read from an MRF, so if someone else reads our
|
|
|
|
|
|
* MRF's source GRF that we wanted to rewrite, that stops us.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool interfered = false;
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0; i < scan_inst->sources; i++) {
|
2016-09-07 17:00:07 -07:00
|
|
|
|
if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
|
|
|
|
|
|
inst->src[0], inst->size_read(0))) {
|
2010-11-18 15:03:50 +08:00
|
|
|
|
interfered = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if (interfered)
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
2016-09-07 13:38:20 -07:00
|
|
|
|
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
|
|
|
|
|
|
inst->dst, inst->size_written)) {
|
2011-03-28 16:54:39 -07:00
|
|
|
|
/* If somebody else writes our MRF here, we can't
|
2010-11-18 15:03:50 +08:00
|
|
|
|
* compute-to-MRF before that.
|
|
|
|
|
|
*/
|
2016-05-27 12:50:28 -07:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
2010-11-18 15:03:50 +08:00
|
|
|
|
|
2016-05-27 12:50:28 -07:00
|
|
|
|
if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
|
|
|
|
|
|
regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
|
2016-09-07 13:38:20 -07:00
|
|
|
|
inst->dst, inst->size_written)) {
|
2010-11-18 15:03:50 +08:00
|
|
|
|
/* Found a SEND instruction, which means that there are
|
|
|
|
|
|
* live values in MRFs from base_mrf to base_mrf +
|
|
|
|
|
|
* scan_inst->mlen - 1. Don't go pushing our MRF write up
|
|
|
|
|
|
* above it.
|
|
|
|
|
|
*/
|
2016-05-27 12:50:28 -07:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
2010-10-08 14:00:14 -07:00
|
|
|
|
}
|
2016-05-27 13:15:55 -07:00
|
|
|
|
|
2016-05-27 16:03:34 -07:00
|
|
|
|
if (regs_left)
|
2016-05-27 13:15:55 -07:00
|
|
|
|
continue;
|
|
|
|
|
|
|
2016-05-27 16:03:34 -07:00
|
|
|
|
/* Found all generating instructions of our MRF's source value, so it
|
|
|
|
|
|
* should be safe to rewrite them to point to the MRF directly.
|
2016-05-27 13:15:55 -07:00
|
|
|
|
*/
|
2016-09-07 16:59:35 -07:00
|
|
|
|
regs_left = (1 << regs_read(inst, 0)) - 1;
|
2016-05-27 16:03:34 -07:00
|
|
|
|
|
2016-05-27 13:15:55 -07:00
|
|
|
|
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
|
2016-09-07 13:38:20 -07:00
|
|
|
|
if (regions_overlap(scan_inst->dst, scan_inst->size_written,
|
2016-09-07 17:00:07 -07:00
|
|
|
|
inst->src[0], inst->size_read(0))) {
|
2016-05-27 16:03:34 -07:00
|
|
|
|
/* Clear the bits for any registers this instruction overwrites. */
|
|
|
|
|
|
regs_left &= ~mask_relative_to(
|
2016-09-02 15:21:26 -07:00
|
|
|
|
inst->src[0], scan_inst->dst, scan_inst->size_written);
|
2016-05-27 16:03:34 -07:00
|
|
|
|
|
2016-09-03 13:14:28 -07:00
|
|
|
|
const unsigned rel_offset = reg_offset(scan_inst->dst) -
|
|
|
|
|
|
reg_offset(inst->src[0]);
|
2016-05-27 14:17:28 -07:00
|
|
|
|
|
|
|
|
|
|
if (inst->dst.nr & BRW_MRF_COMPR4) {
|
|
|
|
|
|
/* Apply the same address transformation done by the hardware
|
|
|
|
|
|
* for COMPR4 MRF writes.
|
|
|
|
|
|
*/
|
2016-09-03 13:14:28 -07:00
|
|
|
|
assert(rel_offset < 2 * REG_SIZE);
|
|
|
|
|
|
scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
|
2016-05-27 14:17:28 -07:00
|
|
|
|
|
|
|
|
|
|
/* Clear the COMPR4 bit if the generating instruction is not
|
|
|
|
|
|
* compressed.
|
|
|
|
|
|
*/
|
2016-09-07 13:38:20 -07:00
|
|
|
|
if (scan_inst->size_written < 2 * REG_SIZE)
|
2016-05-27 14:17:28 -07:00
|
|
|
|
scan_inst->dst.nr &= ~BRW_MRF_COMPR4;
|
|
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* Calculate the MRF number the result of this instruction is
|
|
|
|
|
|
* ultimately written to.
|
|
|
|
|
|
*/
|
2016-09-03 13:14:28 -07:00
|
|
|
|
scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
|
2016-05-27 14:17:28 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-27 13:15:55 -07:00
|
|
|
|
scan_inst->dst.file = MRF;
|
2016-09-03 13:14:28 -07:00
|
|
|
|
scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
|
2016-05-27 13:15:55 -07:00
|
|
|
|
scan_inst->saturate |= inst->saturate;
|
2016-05-27 16:03:34 -07:00
|
|
|
|
if (!regs_left)
|
|
|
|
|
|
break;
|
2016-05-27 13:15:55 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-27 16:03:34 -07:00
|
|
|
|
assert(!regs_left);
|
2016-05-27 13:15:55 -07:00
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
progress = true;
|
2010-10-08 14:00:14 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2012-06-05 13:13:33 -07:00
|
|
|
|
if (progress)
|
2014-09-01 10:54:00 -07:00
|
|
|
|
invalidate_live_intervals();
|
2012-06-05 13:13:33 -07:00
|
|
|
|
|
2010-10-08 14:00:14 -07:00
|
|
|
|
return progress;
|
|
|
|
|
|
}
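/* A sketch of the rewrite performed above (made-up numbers), for pre-Gen7
 * hardware where MRFs still exist:
 *
 *    add(8) vgrf4:F, vgrf2:F, vgrf3:F
 *    mov(8) m2:F, vgrf4:F
 *
 * becomes a single instruction computing straight into the message register,
 *
 *    add(8) m2:F, vgrf2:F, vgrf3:F
 *
 * provided vgrf4 is not read again later and nothing else writes m2 in
 * between.
 */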
|
|
|
|
|
|
|
2015-02-20 20:25:04 +02:00
|
|
|
|
/**
|
|
|
|
|
|
* Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
|
|
|
|
|
|
* flow. We could probably do better here with some form of divergence
|
|
|
|
|
|
* analysis.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::eliminate_find_live_channel()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
unsigned depth = 0;
|
|
|
|
|
|
|
2016-09-15 17:20:23 -07:00
|
|
|
|
if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data)) {
|
|
|
|
|
|
/* The optimization below assumes that channel zero is live on thread
|
|
|
|
|
|
* dispatch, which may not be the case if the fixed function dispatches
|
|
|
|
|
|
* threads sparsely.
|
|
|
|
|
|
*/
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-02-20 20:25:04 +02:00
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
|
|
|
|
|
switch (inst->opcode) {
|
|
|
|
|
|
case BRW_OPCODE_IF:
|
|
|
|
|
|
case BRW_OPCODE_DO:
|
|
|
|
|
|
depth++;
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
case BRW_OPCODE_ENDIF:
|
|
|
|
|
|
case BRW_OPCODE_WHILE:
|
|
|
|
|
|
depth--;
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
case FS_OPCODE_DISCARD_JUMP:
|
|
|
|
|
|
/* This can potentially make control flow non-uniform until the end
|
|
|
|
|
|
* of the program.
|
|
|
|
|
|
*/
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
|
|
|
|
|
|
if (depth == 0) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
2015-11-02 11:26:16 -08:00
|
|
|
|
inst->src[0] = brw_imm_ud(0u);
|
2015-02-20 20:25:04 +02:00
|
|
|
|
inst->sources = 1;
|
|
|
|
|
|
inst->force_writemask_all = true;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
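/* Outside of any control flow, channel 0 is known to be live for packed
 * dispatch, so (illustrative syntax)
 *
 *    find_live_channel(8) vgrf6:UD
 *
 * is turned into
 *
 *    mov(8) vgrf6:UD, 0u
 *
 * with force_writemask_all set, which later copy propagation and algebraic
 * passes can simplify further.
 */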
|
|
|
|
|
|
|
2014-07-07 15:27:17 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
|
|
|
|
|
|
* instructions to FS_OPCODE_REP_FB_WRITE.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
2014-09-26 14:47:03 -07:00
|
|
|
|
fs_visitor::emit_repclear_shader()
|
2014-07-07 15:27:17 -07:00
|
|
|
|
{
|
2014-08-19 13:57:11 -07:00
|
|
|
|
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
|
2016-04-29 18:40:35 -07:00
|
|
|
|
int base_mrf = 0;
|
2014-09-26 14:47:03 -07:00
|
|
|
|
int color_mrf = base_mrf + 2;
|
2016-04-04 14:38:42 -07:00
|
|
|
|
fs_inst *mov;
|
2014-07-07 15:27:17 -07:00
|
|
|
|
|
2016-04-04 14:38:42 -07:00
|
|
|
|
if (uniforms > 0) {
|
|
|
|
|
|
mov = bld.exec_all().group(4, 0)
|
|
|
|
|
|
.MOV(brw_message_reg(color_mrf),
|
|
|
|
|
|
fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
|
|
|
|
|
|
} else {
|
|
|
|
|
|
struct brw_reg reg =
|
|
|
|
|
|
brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
|
|
|
|
|
|
BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
|
|
|
|
|
|
BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
|
|
|
|
|
|
|
|
|
|
|
|
mov = bld.exec_all().group(4, 0)
|
|
|
|
|
|
.MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
|
|
|
|
|
|
}
|
2014-07-07 15:27:17 -07:00
|
|
|
|
|
2018-02-10 11:19:00 +00:00
|
|
|
|
fs_inst *write = NULL;
|
2014-09-26 14:47:03 -07:00
|
|
|
|
if (key->nr_color_regions == 1) {
|
2015-06-03 21:07:52 +03:00
|
|
|
|
write = bld.emit(FS_OPCODE_REP_FB_WRITE);
|
2014-09-26 14:47:03 -07:00
|
|
|
|
write->saturate = key->clamp_fragment_color;
|
|
|
|
|
|
write->base_mrf = color_mrf;
|
|
|
|
|
|
write->target = 0;
|
2015-03-24 10:17:32 -07:00
|
|
|
|
write->header_size = 0;
|
2014-09-26 14:47:03 -07:00
|
|
|
|
write->mlen = 1;
|
|
|
|
|
|
} else {
|
2014-10-23 15:45:15 -07:00
|
|
|
|
assume(key->nr_color_regions > 0);
|
2018-05-17 08:46:03 -07:00
|
|
|
|
|
|
|
|
|
|
struct brw_reg header =
|
|
|
|
|
|
retype(brw_message_reg(base_mrf), BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
bld.exec_all().group(16, 0)
|
|
|
|
|
|
.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
|
|
|
|
|
|
|
2014-09-26 14:47:03 -07:00
|
|
|
|
for (int i = 0; i < key->nr_color_regions; ++i) {
|
2018-05-17 08:46:03 -07:00
|
|
|
|
if (i > 0) {
|
|
|
|
|
|
bld.exec_all().group(1, 0)
|
|
|
|
|
|
.MOV(component(header, 2), brw_imm_ud(i));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-06-03 21:07:52 +03:00
|
|
|
|
write = bld.emit(FS_OPCODE_REP_FB_WRITE);
|
2014-09-26 14:47:03 -07:00
|
|
|
|
write->saturate = key->clamp_fragment_color;
|
|
|
|
|
|
write->base_mrf = base_mrf;
|
|
|
|
|
|
write->target = i;
|
2015-03-24 10:17:32 -07:00
|
|
|
|
write->header_size = 2;
|
2014-09-26 14:47:03 -07:00
|
|
|
|
write->mlen = 3;
|
2014-07-07 15:27:17 -07:00
|
|
|
|
}
|
2014-09-26 14:47:03 -07:00
|
|
|
|
}
|
|
|
|
|
|
write->eot = true;
|
2017-01-13 14:01:45 -08:00
|
|
|
|
write->last_rt = true;
|
2014-07-07 15:27:17 -07:00
|
|
|
|
|
2014-09-26 14:47:03 -07:00
|
|
|
|
calculate_cfg();
|
2014-07-07 15:27:17 -07:00
|
|
|
|
|
2014-09-26 14:47:03 -07:00
|
|
|
|
assign_constant_locations();
|
|
|
|
|
|
assign_curb_setup();
|
2014-07-07 15:27:17 -07:00
|
|
|
|
|
2014-09-26 14:47:03 -07:00
|
|
|
|
/* Now that we have the uniform assigned, go ahead and force it to a vec4. */
|
2016-04-04 14:38:42 -07:00
|
|
|
|
if (uniforms > 0) {
|
|
|
|
|
|
assert(mov->src[0].file == FIXED_GRF);
|
|
|
|
|
|
mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
|
|
|
|
|
|
}
|
2018-11-09 14:13:37 -08:00
|
|
|
|
|
|
|
|
|
|
lower_scoreboard();
|
2014-07-07 15:27:17 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2010-11-19 15:57:05 +08:00
|
|
|
|
/**
|
2012-01-27 11:06:49 -08:00
|
|
|
|
* Walks through basic blocks, looking for repeated MRF writes and
|
2010-11-19 15:57:05 +08:00
|
|
|
|
* removing the later ones.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::remove_duplicate_mrf_writes()
|
|
|
|
|
|
{
|
2015-09-22 12:53:08 +02:00
|
|
|
|
fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
|
2010-11-19 15:57:05 +08:00
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
2011-03-23 14:00:01 -07:00
|
|
|
|
/* Need to update the MRF tracking for compressed instructions. */
|
2016-04-25 17:09:39 -07:00
|
|
|
|
if (dispatch_width >= 16)
|
2011-03-23 14:00:01 -07:00
|
|
|
|
return false;
|
|
|
|
|
|
|
2010-11-19 15:57:05 +08:00
|
|
|
|
memset(last_mrf_move, 0, sizeof(last_mrf_move));
|
|
|
|
|
|
|
2014-07-12 21:18:39 -07:00
|
|
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
|
2013-02-05 15:36:18 -08:00
|
|
|
|
if (inst->is_control_flow()) {
|
2010-11-19 15:57:05 +08:00
|
|
|
|
memset(last_mrf_move, 0, sizeof(last_mrf_move));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->opcode == BRW_OPCODE_MOV &&
|
|
|
|
|
|
inst->dst.file == MRF) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
|
2019-01-25 13:30:36 -06:00
|
|
|
|
if (prev_inst && prev_inst->opcode == BRW_OPCODE_MOV &&
|
|
|
|
|
|
inst->dst.equals(prev_inst->dst) &&
|
|
|
|
|
|
inst->src[0].equals(prev_inst->src[0]) &&
|
|
|
|
|
|
inst->saturate == prev_inst->saturate &&
|
|
|
|
|
|
inst->predicate == prev_inst->predicate &&
|
|
|
|
|
|
inst->conditional_mod == prev_inst->conditional_mod &&
|
|
|
|
|
|
inst->exec_size == prev_inst->exec_size) {
|
2014-07-12 21:18:39 -07:00
|
|
|
|
inst->remove(block);
|
2010-11-19 15:57:05 +08:00
|
|
|
|
progress = true;
|
|
|
|
|
|
continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Clear out the last-write records for MRFs that were overwritten. */
|
|
|
|
|
|
if (inst->dst.file == MRF) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
last_mrf_move[inst->dst.nr] = NULL;
|
2010-11-19 15:57:05 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
i965/fs: Convert gen7 to using GRFs for texture messages.
Looking at Lightsmark's shaders, the way we used MRFs (or in gen7's
case, GRFs) was bad in a couple of ways. One was that it prevented
compute-to-MRF for the common case of a texcoord that gets used
exactly once, but where the texcoord setup all gets emitted before the
texture calls (such as when it's a bare fragment shader input, which
gets interpolated before processing main()). Another was that it
introduced a bunch of dependencies that constrained scheduling, and
forced waits for texture operations to be done before they are
required. For example, we can now move the compute-to-MRF
interpolation for the second texture send down after the first send.
The downside is that this generally prevents
remove_duplicate_mrf_writes() from doing anything, whereas previously
it avoided work for the case of sampling from the same texcoord twice.
However, I suspect that most of the win that originally justified that
code was in avoiding the WAR stall on the first send, which this patch
also avoids, rather than the small cost of the extra instruction. We
see instruction count regressions in shaders in unigine, yofrankie,
savage2, hon, and gstreamer.
Improves GLB2.7 performance by 0.633628% +/- 0.491809% (n=121/125, avg of
~66fps, outliers below 61 dropped).
Improves openarena performance by 1.01092% +/- 0.66897% (n=425).
No significant difference on Lightsmark (n=44).
v2: Squash in the fix for register unspilling for send-from-GRF, fixing a
segfault in lightsmark.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Acked-by: Matt Turner <mattst88@gmail.com>
2013-10-09 17:17:59 -07:00
|
|
|
|
if (inst->mlen > 0 && inst->base_mrf != -1) {
|
2011-01-18 13:28:32 -08:00
|
|
|
|
/* Found a SEND instruction, which will include two or fewer
|
2010-11-19 15:57:05 +08:00
|
|
|
|
* implied MRF writes. We could do better here.
|
|
|
|
|
|
*/
|
|
|
|
|
|
for (int i = 0; i < implied_mrf_writes(inst); i++) {
|
|
|
|
|
|
last_mrf_move[inst->base_mrf + i] = NULL;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Clear out any MRF move records whose sources got overwritten. */
|
2016-05-25 13:17:41 -07:00
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
|
|
|
|
|
|
if (last_mrf_move[i] &&
|
2016-09-07 13:38:20 -07:00
|
|
|
|
regions_overlap(inst->dst, inst->size_written,
|
2016-05-25 13:17:41 -07:00
|
|
|
|
last_mrf_move[i]->src[0],
|
2016-09-07 17:00:07 -07:00
|
|
|
|
last_mrf_move[i]->size_read(0))) {
|
2016-05-25 13:17:41 -07:00
|
|
|
|
last_mrf_move[i] = NULL;
|
|
|
|
|
|
}
|
2010-11-19 15:57:05 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->opcode == BRW_OPCODE_MOV &&
|
|
|
|
|
|
inst->dst.file == MRF &&
|
2016-05-25 13:17:41 -07:00
|
|
|
|
inst->src[0].file != ARF &&
|
2019-04-24 12:38:28 +02:00
|
|
|
|
!inst->is_partial_write()) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
last_mrf_move[inst->dst.nr] = inst;
|
2010-11-19 15:57:05 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2012-06-05 13:13:33 -07:00
|
|
|
|
if (progress)
|
2014-09-01 10:54:00 -07:00
|
|
|
|
invalidate_live_intervals();
|
2012-06-05 13:13:33 -07:00
|
|
|
|
|
2010-11-19 15:57:05 +08:00
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2017-07-01 08:14:56 +02:00
|
|
|
|
/**
|
|
|
|
|
|
* Rounding modes for conversion instructions are included for each
|
|
|
|
|
|
* conversion, but in the hardware it is a piece of persistent state. So once it is set,
|
|
|
|
|
|
* we don't need to set it again for subsequent conversions.
|
|
|
|
|
|
*
|
|
|
|
|
|
* This is useful for vector/matrix conversions, as setting the
|
|
|
|
|
|
* mode once is enough for the full vector/matrix.
|
|
|
|
|
|
*/
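/* A rough sketch of the redundancy being removed, assuming two consecutive
* conversions that request the same rounding mode:
*
*    rnd_mode  RTNE
*    mov(8)    ... F -> HF ...
*    rnd_mode  RTNE                  <- mode already in effect, removed
*    mov(8)    ... F -> HF ...
*
* Only a SHADER_OPCODE_RND_MODE whose immediate matches the mode currently
* in effect is deleted; the tracked mode is reset at the top of every basic
* block to the mode implied by float_controls_execution_mode.
*/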
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::remove_extra_rounding_modes()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
2018-11-19 12:38:10 +01:00
|
|
|
|
unsigned execution_mode = this->nir->info.float_controls_execution_mode;
|
|
|
|
|
|
|
|
|
|
|
|
brw_rnd_mode base_mode = BRW_RND_MODE_UNSPECIFIED;
|
|
|
|
|
|
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP16 |
|
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP32 |
|
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTE_FP64) &
|
|
|
|
|
|
execution_mode)
|
|
|
|
|
|
base_mode = BRW_RND_MODE_RTNE;
|
|
|
|
|
|
if ((FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16 |
|
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32 |
|
|
|
|
|
|
FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) &
|
|
|
|
|
|
execution_mode)
|
|
|
|
|
|
base_mode = BRW_RND_MODE_RTZ;
|
2017-07-01 08:14:56 +02:00
|
|
|
|
|
|
|
|
|
|
foreach_block (block, cfg) {
|
2018-11-19 12:38:10 +01:00
|
|
|
|
brw_rnd_mode prev_mode = base_mode;
|
2017-07-01 08:14:56 +02:00
|
|
|
|
|
|
|
|
|
|
foreach_inst_in_block_safe (fs_inst, inst, block) {
|
|
|
|
|
|
if (inst->opcode == SHADER_OPCODE_RND_MODE) {
|
|
|
|
|
|
assert(inst->src[0].file == BRW_IMMEDIATE_VALUE);
|
|
|
|
|
|
const brw_rnd_mode mode = (brw_rnd_mode) inst->src[0].d;
|
|
|
|
|
|
if (mode == prev_mode) {
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
prev_mode = mode;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
|
invalidate_live_intervals();
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2013-02-05 15:46:22 -08:00
|
|
|
|
static void
|
2015-02-27 18:06:25 -08:00
|
|
|
|
clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
|
2013-02-05 15:46:22 -08:00
|
|
|
|
{
|
|
|
|
|
|
/* Clear the flag for registers that actually got read (as expected). */
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2013-02-05 15:46:22 -08:00
|
|
|
|
int grf;
|
2015-10-26 17:52:57 -07:00
|
|
|
|
if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
|
2015-10-24 15:29:03 -07:00
|
|
|
|
grf = inst->src[i].nr;
|
2013-02-05 15:46:22 -08:00
|
|
|
|
} else {
|
|
|
|
|
|
continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (grf >= first_grf &&
|
|
|
|
|
|
grf < first_grf + grf_len) {
|
|
|
|
|
|
deps[grf - first_grf] = false;
|
2014-08-16 11:34:56 -07:00
|
|
|
|
if (inst->exec_size == 16)
|
2013-02-05 15:46:22 -08:00
|
|
|
|
deps[grf - first_grf + 1] = false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Implements this workaround for the original 965:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
|
|
|
|
|
|
* check for post destination dependencies on this instruction, software
|
|
|
|
|
|
* must ensure that there is no destination hazard for the case of ‘write
|
|
|
|
|
|
* followed by a posted write’ shown in the following example.
|
|
|
|
|
|
*
|
|
|
|
|
|
* 1. mov r3 0
|
|
|
|
|
|
* 2. send r3.xy <rest of send instruction>
|
|
|
|
|
|
* 3. mov r2 r3
|
|
|
|
|
|
*
|
|
|
|
|
|
* Due to no post-destination dependency check on the ‘send’, the above
|
|
|
|
|
|
* code sequence could have two instructions (1 and 2) in flight at the
|
|
|
|
|
|
* same time that both consider ‘r3’ as the target of their final writes.
|
|
|
|
|
|
*/
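/* In rough terms, the pass below walks backwards from the send and, for each
* destination register of the send that may still have an earlier write in
* flight, inserts a dependency-resolving MOV (DEP_RESOLVE_MOV) just before
* the send. Sketch with hypothetical register numbers:
*
*    mov(8)  g3<1>F ...              earlier write to g3
*    <resolve MOV touching g3>       <- inserted by this workaround
*    send(8) ... g3 ...              send whose implied write targets g3
*/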
|
|
|
|
|
|
void
|
2014-08-24 19:07:01 -07:00
|
|
|
|
fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
|
|
|
|
|
|
fs_inst *inst)
|
2013-02-05 15:46:22 -08:00
|
|
|
|
{
|
2016-09-07 16:59:35 -07:00
|
|
|
|
int write_len = regs_written(inst);
|
2015-10-26 04:35:14 -07:00
|
|
|
|
int first_write_grf = inst->dst.nr;
|
i965: Turn BRW_MAX_MRF into a macro that accepts a hardware generation
There are some bug reports about shaders failing to compile in gen6
because MRF 14 is used when we need to spill. For example:
https://bugs.freedesktop.org/show_bug.cgi?id=86469
https://bugs.freedesktop.org/show_bug.cgi?id=90631
Discussion in bugzilla pointed to the fact that gen6 might actually have
24 MRF registers available instead of 16, so we could use other MRF
registers and avoid these conflicts (we still need to investigate why
some shaders need up to MRF 14 anyway, since this is not expected).
Notice that the hardware docs are not clear about this fact:
SNB PRM Vol4 Part2's "Table 5-4. MRF Registers Available in Device
Hardware" says "Number per Thread" - "24 registers"
However, SNB PRM Vol4 Part1, 1.6.1 Message Register File (MRF) says:
"Normal threads should construct their messages in m1..m15. (...)
Regardless of actual hardware implementation, the thread should
not assume that MRF addresses above m15 wrap to legal MRF registers."
Therefore experimentation was necessary to evaluate if we had these extra
MRF registers available or not. This was tested in gen6 using MRF
registers 21..23 for spilling and doing a full piglit run (all.py) forcing
spilling of everything on the FS backend. It was also tested by doing
spilling of everything on both the FS and the VS backends with a piglit run
of shader.py. In both cases no regressions were observed. In fact, many of
these tests were helped in the cases where we forced spilling, since that
triggered the same underlying problem described in the bug reports. Here are
some results using INTEL_DEBUG=spill_fs,spill_vec4 for a shader.py run on
gen6 hardware:
Using MRFs 13..15 for spilling:
crash: 2, fail: 113, pass: 6621, skip: 5461
Using MRFs 21..23 for spilling:
crash: 2, fail: 12, pass: 6722, skip: 5461
This patch sets the ground for later patches to implement spilling
using MRF registers 21..23 in gen6.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-09-15 16:00:26 +02:00
|
|
|
|
bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
|
2013-02-05 15:46:22 -08:00
|
|
|
|
assert(write_len < (int)sizeof(needs_dep) - 1);
|
|
|
|
|
|
|
|
|
|
|
|
memset(needs_dep, false, sizeof(needs_dep));
|
|
|
|
|
|
memset(needs_dep, true, write_len);
|
|
|
|
|
|
|
2015-02-27 18:06:25 -08:00
|
|
|
|
clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
|
2013-02-05 15:46:22 -08:00
|
|
|
|
|
|
|
|
|
|
/* Walk backwards looking for writes to registers we're writing which
|
|
|
|
|
|
* aren't read since being written. If we hit the start of the program,
|
|
|
|
|
|
* we assume that there are no outstanding dependencies on entry to the
|
|
|
|
|
|
* program.
|
|
|
|
|
|
*/
|
2015-10-20 11:16:00 +02:00
|
|
|
|
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
|
2013-02-05 15:46:22 -08:00
|
|
|
|
/* If we hit control flow, assume that there *are* outstanding
|
|
|
|
|
|
* dependencies, and force their cleanup before our instruction.
|
|
|
|
|
|
*/
|
2016-05-25 14:21:49 -07:00
|
|
|
|
if (block->start() == scan_inst && block->num != 0) {
|
2013-02-05 15:46:22 -08:00
|
|
|
|
for (int i = 0; i < write_len; i++) {
|
2015-06-03 22:22:10 +03:00
|
|
|
|
if (needs_dep[i])
|
2015-07-27 18:28:39 +03:00
|
|
|
|
DEP_RESOLVE_MOV(fs_builder(this, block, inst),
|
|
|
|
|
|
first_write_grf + i);
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
2013-03-19 17:36:10 -07:00
|
|
|
|
return;
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* We insert our reads as late as possible on the assumption that any
|
|
|
|
|
|
* instruction but a MOV that might have left us an outstanding
|
|
|
|
|
|
* dependency has more latency than a MOV.
|
|
|
|
|
|
*/
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (scan_inst->dst.file == VGRF) {
|
2016-09-07 16:59:35 -07:00
|
|
|
|
for (unsigned i = 0; i < regs_written(scan_inst); i++) {
|
2015-10-26 04:35:14 -07:00
|
|
|
|
int reg = scan_inst->dst.nr + i;
|
2013-03-06 17:50:50 -08:00
|
|
|
|
|
|
|
|
|
|
if (reg >= first_write_grf &&
|
|
|
|
|
|
reg < first_write_grf + write_len &&
|
|
|
|
|
|
needs_dep[reg - first_write_grf]) {
|
2015-07-27 18:28:39 +03:00
|
|
|
|
DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
|
2013-03-06 17:50:50 -08:00
|
|
|
|
needs_dep[reg - first_write_grf] = false;
|
2014-08-16 11:34:56 -07:00
|
|
|
|
if (scan_inst->exec_size == 16)
|
2013-03-06 17:50:50 -08:00
|
|
|
|
needs_dep[reg - first_write_grf + 1] = false;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Clear the flag for registers that actually got read (as expected). */
|
2015-02-27 18:06:25 -08:00
|
|
|
|
clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
|
2013-02-05 15:46:22 -08:00
|
|
|
|
|
|
|
|
|
|
/* Continue the loop only if we haven't resolved all the dependencies */
|
|
|
|
|
|
int i;
|
|
|
|
|
|
for (i = 0; i < write_len; i++) {
|
|
|
|
|
|
if (needs_dep[i])
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
if (i == write_len)
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Implements this workaround for the original 965:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "[DevBW, DevCL] Errata: A destination register from a send can not be
|
|
|
|
|
|
* used as a destination register until after it has been sourced by an
|
|
|
|
|
|
* instruction with a different destination register.
|
|
|
|
|
|
*/
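/* Symmetrically to the pre-send case above, the pass below walks forwards
* from the send and, before any later instruction overwrites one of the
* send's destination registers that has not been read in the meantime,
* inserts a resolving MOV (DEP_RESOLVE_MOV) so that the register is sourced
* first, as the errata requires.
*/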
|
|
|
|
|
|
void
|
2014-08-24 19:07:01 -07:00
|
|
|
|
fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
|
2013-02-05 15:46:22 -08:00
|
|
|
|
{
|
2016-09-07 16:59:35 -07:00
|
|
|
|
int write_len = regs_written(inst);
|
2018-12-10 14:49:49 -08:00
|
|
|
|
unsigned first_write_grf = inst->dst.nr;
|
i965: Turn BRW_MAX_MRF into a macro that accepts a hardware generation
2015-09-15 16:00:26 +02:00
|
|
|
|
bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
|
2013-02-05 15:46:22 -08:00
|
|
|
|
assert(write_len < (int)sizeof(needs_dep) - 1);
|
|
|
|
|
|
|
|
|
|
|
|
memset(needs_dep, false, sizeof(needs_dep));
|
|
|
|
|
|
memset(needs_dep, true, write_len);
|
|
|
|
|
|
/* Walk forwards looking for writes to registers we're writing which aren't
|
|
|
|
|
|
* read before being written.
|
|
|
|
|
|
*/
|
2015-10-20 11:16:00 +02:00
|
|
|
|
foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
|
2013-02-05 15:46:22 -08:00
|
|
|
|
/* If we hit control flow, force resolve all remaining dependencies. */
|
2016-05-25 14:21:49 -07:00
|
|
|
|
if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
|
2013-02-05 15:46:22 -08:00
|
|
|
|
for (int i = 0; i < write_len; i++) {
|
|
|
|
|
|
if (needs_dep[i])
|
2015-07-27 18:28:39 +03:00
|
|
|
|
DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
|
|
|
|
|
|
first_write_grf + i);
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
2013-03-19 17:36:10 -07:00
|
|
|
|
return;
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Clear the flag for registers that actually got read (as expected). */
|
2015-02-27 18:06:25 -08:00
|
|
|
|
clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
|
2013-02-05 15:46:22 -08:00
|
|
|
|
|
|
|
|
|
|
/* We insert our reads as late as possible since they're reading the
|
|
|
|
|
|
* result of a SEND, which has massive latency.
|
|
|
|
|
|
*/
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (scan_inst->dst.file == VGRF &&
|
2015-10-26 04:35:14 -07:00
|
|
|
|
scan_inst->dst.nr >= first_write_grf &&
|
|
|
|
|
|
scan_inst->dst.nr < first_write_grf + write_len &&
|
|
|
|
|
|
needs_dep[scan_inst->dst.nr - first_write_grf]) {
|
2015-07-27 18:28:39 +03:00
|
|
|
|
DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
|
2015-10-26 04:35:14 -07:00
|
|
|
|
scan_inst->dst.nr);
|
|
|
|
|
|
needs_dep[scan_inst->dst.nr - first_write_grf] = false;
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Continue the loop only if we haven't resolved all the dependencies */
|
|
|
|
|
|
int i;
|
|
|
|
|
|
for (i = 0; i < write_len; i++) {
|
|
|
|
|
|
if (needs_dep[i])
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
if (i == write_len)
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::insert_gen4_send_dependency_workarounds()
|
|
|
|
|
|
{
|
2015-04-15 18:00:05 -07:00
|
|
|
|
if (devinfo->gen != 4 || devinfo->is_g4x)
|
2013-02-05 15:46:22 -08:00
|
|
|
|
return;
|
|
|
|
|
|
|
2014-06-09 02:59:22 -07:00
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
2014-08-24 19:07:01 -07:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
if (inst->mlen != 0 && inst->dst.file == VGRF) {
|
2014-08-24 19:07:01 -07:00
|
|
|
|
insert_gen4_pre_send_dependency_workarounds(block, inst);
|
|
|
|
|
|
insert_gen4_post_send_dependency_workarounds(block, inst);
|
2014-06-09 02:59:22 -07:00
|
|
|
|
progress = true;
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2014-06-09 02:59:22 -07:00
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2014-09-01 10:54:00 -07:00
|
|
|
|
invalidate_live_intervals();
|
2013-02-05 15:46:22 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2013-02-15 19:26:48 -08:00
|
|
|
|
/**
|
|
|
|
|
|
* Turns the generic expression-style uniform pull constant load instruction
|
|
|
|
|
|
* into a hardware-specific series of instructions for loading a pull
|
|
|
|
|
|
* constant.
|
|
|
|
|
|
*
|
|
|
|
|
|
* The expression style allows the CSE pass before this to optimize out
|
|
|
|
|
|
* repeated loads from the same offset, and gives the pre-register-allocation
|
|
|
|
|
|
* scheduling full flexibility, while the conversion to native instructions
|
|
|
|
|
|
* allows the post-register-allocation scheduler the best information
|
|
|
|
|
|
* possible.
|
2013-03-06 14:47:22 -08:00
|
|
|
|
*
|
|
|
|
|
|
* Note that execution masking for setting up pull constant loads is special:
|
|
|
|
|
|
* the channels that need to be written are unrelated to the current execution
|
|
|
|
|
|
* mask, since a later instruction will use one of the result channels as a
|
|
|
|
|
|
* source operand for all 8 or 16 of its channels.
|
2013-02-15 19:26:48 -08:00
|
|
|
|
*/
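/* A rough sketch of the gen7+ path below, with a hypothetical GRF number for
* the header register:
*
*    mov(8)  g10<1>UD    g0<8,8,1>UD        force_writemask_all, copy r0
*    mov(1)  g10.2<1>UD  <offset / 16>      constant offset in 16-byte units
*    ...pull_constant_load_gen7...          header = g10, mlen = 1
*
* i.e. r0 is copied into a one-register header and the byte offset, divided
* by 16, is stored in DWord 2 of that header, matching the
* inst->src[1].ud / 16 below.
*/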
|
|
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::lower_uniform_pull_constant_loads()
|
|
|
|
|
|
{
|
2014-07-12 21:18:39 -07:00
|
|
|
|
foreach_block_and_inst (block, fs_inst, inst, cfg) {
|
2013-02-15 19:26:48 -08:00
|
|
|
|
if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
2015-04-15 18:00:05 -07:00
|
|
|
|
if (devinfo->gen >= 7) {
|
2016-10-26 14:25:06 -07:00
|
|
|
|
const fs_builder ubld = fs_builder(this, block, inst).exec_all();
|
|
|
|
|
|
const fs_reg payload = ubld.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD);
|
2013-02-15 19:26:48 -08:00
|
|
|
|
|
2016-10-26 14:25:06 -07:00
|
|
|
|
ubld.group(8, 0).MOV(payload,
|
|
|
|
|
|
retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
|
|
|
|
|
|
ubld.group(1, 0).MOV(component(payload, 2),
|
|
|
|
|
|
brw_imm_ud(inst->src[1].ud / 16));
|
2013-02-15 19:26:48 -08:00
|
|
|
|
|
|
|
|
|
|
inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
|
|
|
|
|
|
inst->src[1] = payload;
|
2016-10-26 14:25:06 -07:00
|
|
|
|
inst->header_size = 1;
|
|
|
|
|
|
inst->mlen = 1;
|
2013-03-06 15:58:46 -08:00
|
|
|
|
|
2014-09-01 10:54:00 -07:00
|
|
|
|
invalidate_live_intervals();
|
2013-02-15 19:26:48 -08:00
|
|
|
|
} else {
|
|
|
|
|
|
/* Before register allocation, we didn't tell the scheduler about the
|
|
|
|
|
|
* MRF we use. We know it's safe to use this MRF because nothing
|
|
|
|
|
|
* else does except for register spill/unspill, which generates and
|
|
|
|
|
|
* uses its MRF within a single IR instruction.
|
|
|
|
|
|
*/
|
2015-09-22 13:01:18 +02:00
|
|
|
|
inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
|
2013-02-15 19:26:48 -08:00
|
|
|
|
inst->mlen = 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-04-18 11:56:46 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::lower_load_payload()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
The newly reworked instruction is far more straightforward than the
original. Before, the LOAD_PAYLOAD instruction was lowered by a
complicated and broken-by-design pile of heuristics to try and guess
force_writemask_all, exec_size, and a number of other factors on the
sources.
Instead, we use the header_size on the instruction to denote which sources
are "header sources". Header sources are required to be a single physical
hardware register that is copied verbatim. The registers that follow are
considered the actual payload registers and have a width that correspond's
to the LOAD_PAYLOAD's exec_size and are treated as being per-channel. This
gives us a fairly straightforward lowering:
1) All header sources are copied directly using force_writemask_all and,
since they are guaranteed to be a single register, there are no
force_sechalf issues.
2) All non-header sources are copied using the exact same force_sechalf
and force_writemask_all modifiers as the LOAD_PAYLOAD operation itself.
3) In order to accommodate older gens that need interleaved colors,
lower_load_payload detects when the destination is a COMPR4 register
and automatically interleaves the non-header sources. The
lower_load_payload pass does the right thing here regardless of whether
or not the hardware actually supports COMPR4.
This patch commit itself is made up of a bunch of smaller changes squashed
together. Individual change descriptions follow:
i965/fs: Rework fs_visitor::LOAD_PAYLOAD
We rework LOAD_PAYLOAD to verify that all of the sources that count as
headers are, indeed, exactly one register and that all of the non-header
sources match the destination width. We then take the exec_size for
LOAD_PAYLOAD directly from the destination width.
i965/fs: Make destinations of load_payload have the appropriate width
i965/fs: Rework fs_visitor::lower_load_payload
v2: Don't allow the saturate flag on LOAD_PAYLOAD instructions
i965/fs_cse: Support the new-style LOAD_PAYLOAD
i965/fs_inst::is_copy_payload: Support the new-style LOAD_PAYLOAD
i965/fs: Simplify setup_color_payload
Previously, setup_color_payload was a big helper function that did a
lot of gen-specific special casing for setting up the color sources of
the LOAD_PAYLOAD instruction. Now that lower_load_payload is much more
sane, most of that complexity isn't needed anymore. Instead, we can do
a simple fixup pass for color clamps and then just stash sources
directly in the LOAD_PAYLOAD. We can trust lower_load_payload to do the
right thing with respect to COMPR4.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
2015-03-24 17:00:04 -07:00
|
|
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
|
|
|
|
|
|
if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
|
|
|
|
|
|
continue;
|
2014-08-18 14:27:55 -07:00
|
|
|
|
|
2015-10-26 17:09:25 -07:00
|
|
|
|
assert(inst->dst.file == MRF || inst->dst.file == VGRF);
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
2015-03-24 17:00:04 -07:00
|
|
|
|
assert(inst->saturate == false);
|
|
|
|
|
|
fs_reg dst = inst->dst;
|
|
|
|
|
|
|
|
|
|
|
|
/* Get rid of COMPR4. We'll add it back in if we need it */
|
|
|
|
|
|
if (dst.file == MRF)
|
2015-10-26 04:35:14 -07:00
|
|
|
|
dst.nr = dst.nr & ~BRW_MRF_COMPR4;
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
2015-03-24 17:00:04 -07:00
|
|
|
|
|
2015-07-27 18:34:43 +03:00
|
|
|
|
const fs_builder ibld(this, block, inst);
|
|
|
|
|
|
const fs_builder hbld = ibld.exec_all().group(8, 0);
|
2015-06-18 12:07:27 -07:00
|
|
|
|
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
2015-03-24 17:00:04 -07:00
|
|
|
|
for (uint8_t i = 0; i < inst->header_size; i++) {
|
|
|
|
|
|
if (inst->src[i].file != BAD_FILE) {
|
|
|
|
|
|
fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
|
2015-06-18 12:07:27 -07:00
|
|
|
|
hbld.MOV(mov_dst, mov_src);
|
2014-08-18 14:27:55 -07:00
|
|
|
|
}
|
2015-06-18 12:07:27 -07:00
|
|
|
|
dst = offset(dst, hbld, 1);
|
2014-08-18 14:27:55 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2015-10-26 04:35:14 -07:00
|
|
|
|
if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
2015-03-24 17:00:04 -07:00
|
|
|
|
inst->exec_size > 8) {
|
|
|
|
|
|
/* In this case, the payload portion of the LOAD_PAYLOAD isn't
|
|
|
|
|
|
* a straightforward copy. Instead, the result of the
|
|
|
|
|
|
* LOAD_PAYLOAD is treated as interleaved and the first four
|
|
|
|
|
|
* non-header sources are unpacked as:
|
|
|
|
|
|
*
|
|
|
|
|
|
* m + 0: r0
|
|
|
|
|
|
* m + 1: g0
|
|
|
|
|
|
* m + 2: b0
|
|
|
|
|
|
* m + 3: a0
|
|
|
|
|
|
* m + 4: r1
|
|
|
|
|
|
* m + 5: g1
|
|
|
|
|
|
* m + 6: b1
|
|
|
|
|
|
* m + 7: a1
|
|
|
|
|
|
*
|
|
|
|
|
|
* This is used for gen <= 5 fb writes.
|
|
|
|
|
|
*/
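/* As a reading aid: BRW_MRF_COMPR4 marks a single SIMD16 MOV whose MRF
* destination the hardware splits into the interleaved pairs (m, m+4),
* (m+1, m+5), ..., which is exactly the layout listed above. When the
* platform lacks COMPR4 support, the else branch below emulates it with two
* SIMD8 halves written to m+i and m+i+4 explicitly.
*/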
|
|
|
|
|
|
assert(inst->exec_size == 16);
|
|
|
|
|
|
assert(inst->header_size + 4 <= inst->sources);
|
|
|
|
|
|
for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
|
|
|
|
|
|
if (inst->src[i].file != BAD_FILE) {
|
|
|
|
|
|
if (devinfo->has_compr4) {
|
|
|
|
|
|
fs_reg compr4_dst = retype(dst, inst->src[i].type);
|
2015-10-26 04:35:14 -07:00
|
|
|
|
compr4_dst.nr |= BRW_MRF_COMPR4;
|
2015-06-03 20:36:47 +03:00
|
|
|
|
ibld.MOV(compr4_dst, inst->src[i]);
|
2015-02-05 12:20:03 +02:00
|
|
|
|
} else {
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
2015-03-24 17:00:04 -07:00
|
|
|
|
/* Platform doesn't have COMPR4. We have to fake it */
|
|
|
|
|
|
fs_reg mov_dst = retype(dst, inst->src[i].type);
|
2015-06-03 20:36:47 +03:00
|
|
|
|
ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
|
2015-10-26 04:35:14 -07:00
|
|
|
|
mov_dst.nr += 4;
|
2015-06-18 12:07:27 -07:00
|
|
|
|
ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
|
2015-02-04 19:49:32 +02:00
|
|
|
|
}
|
2014-08-18 14:27:55 -07:00
|
|
|
|
}
|
2014-04-18 11:56:46 -07:00
|
|
|
|
|
2015-10-26 04:35:14 -07:00
|
|
|
|
dst.nr++;
|
2014-04-18 11:56:46 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
2015-03-24 17:00:04 -07:00
|
|
|
|
/* The loop above only ever incremented us through the first set
|
|
|
|
|
|
* of 4 registers. However, thanks to the magic of COMPR4, we
|
|
|
|
|
|
* actually wrote to the first 8 registers, so we need to take
|
|
|
|
|
|
* that into account now.
|
|
|
|
|
|
*/
|
2015-10-26 04:35:14 -07:00
|
|
|
|
dst.nr += 4;
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
2015-03-24 17:00:04 -07:00
|
|
|
|
|
|
|
|
|
|
/* The COMPR4 code took care of the first 4 sources. We'll let
|
|
|
|
|
|
* the regular path handle any remaining sources. Yes, we are
|
|
|
|
|
|
* modifying the instruction but we're about to delete it so
|
|
|
|
|
|
* this really doesn't hurt anything.
|
|
|
|
|
|
*/
|
|
|
|
|
|
inst->header_size += 4;
|
2014-04-18 11:56:46 -07:00
|
|
|
|
}
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
2015-03-24 17:00:04 -07:00
|
|
|
|
|
|
|
|
|
|
for (uint8_t i = inst->header_size; i < inst->sources; i++) {
|
2018-11-14 22:38:23 -06:00
|
|
|
|
if (inst->src[i].file != BAD_FILE) {
|
|
|
|
|
|
dst.type = inst->src[i].type;
|
|
|
|
|
|
ibld.MOV(dst, inst->src[i]);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
dst.type = BRW_REGISTER_TYPE_UD;
|
|
|
|
|
|
}
|
2015-06-18 12:07:27 -07:00
|
|
|
|
dst = offset(dst, ibld, 1);
|
i965/fs: Rework the fs_visitor LOAD_PAYLOAD instruction
2015-03-24 17:00:04 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
progress = true;
|
2014-04-18 11:56:46 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
2014-09-01 10:54:00 -07:00
|
|
|
|
invalidate_live_intervals();
|
2014-04-18 11:56:46 -07:00
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-07-11 16:56:05 -07:00
|
|
|
|
void
|
2019-07-10 16:48:01 -07:00
|
|
|
|
fs_visitor::lower_mul_dword_inst(fs_inst *inst, bblock_t *block)
|
2019-07-11 16:56:05 -07:00
|
|
|
|
{
|
2019-07-10 16:48:01 -07:00
|
|
|
|
const fs_builder ibld(this, block, inst);
|
|
|
|
|
|
|
2019-07-11 16:56:05 -07:00
|
|
|
|
if (inst->src[1].file == IMM && inst->src[1].ud < (1 << 16)) {
|
|
|
|
|
|
/* The MUL instruction isn't commutative. On Gen <= 6, only the low
|
|
|
|
|
|
* 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
|
|
|
|
|
|
* src1 are used.
|
|
|
|
|
|
*
|
|
|
|
|
|
* If multiplying by an immediate value that fits in 16-bits, do a
|
|
|
|
|
|
* single MUL instruction with that value in the proper location.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (devinfo->gen < 7) {
|
|
|
|
|
|
fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type);
|
|
|
|
|
|
ibld.MOV(imm, inst->src[1]);
|
|
|
|
|
|
ibld.MUL(inst->dst, imm, inst->src[0]);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
const bool ud = (inst->src[1].type == BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
ibld.MUL(inst->dst, inst->src[0],
|
|
|
|
|
|
ud ? brw_imm_uw(inst->src[1].ud)
|
|
|
|
|
|
: brw_imm_w(inst->src[1].d));
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
|
|
|
|
|
|
* do 32-bit integer multiplication in one instruction, but instead
|
|
|
|
|
|
* must do a sequence (which actually calculates a 64-bit result):
|
|
|
|
|
|
*
|
|
|
|
|
|
* mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D
|
|
|
|
|
|
* mach(8) null g3<8,8,1>D g4<8,8,1>D
|
|
|
|
|
|
* mov(8) g2<1>D acc0<8,8,1>D
|
|
|
|
|
|
*
|
|
|
|
|
|
* But on Gen > 6, the ability to use second accumulator register
|
|
|
|
|
|
* (acc1) for non-float data types was removed, preventing a simple
|
|
|
|
|
|
* implementation in SIMD16. A 16-channel result can be calculated by
|
|
|
|
|
|
* executing the three instructions twice in SIMD8, once with quarter
|
|
|
|
|
|
* control of 1Q for the first eight channels and again with 2Q for
|
|
|
|
|
|
* the second eight channels.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Which accumulator register is implicitly accessed (by AccWrEnable
|
|
|
|
|
|
* for instance) is determined by the quarter control. Unfortunately
|
|
|
|
|
|
* Ivybridge (and presumably Baytrail) has a hardware bug in which an
|
|
|
|
|
|
* implicit accumulator access by an instruction with 2Q will access
|
|
|
|
|
|
* acc1 regardless of whether the data type is usable in acc1.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Specifically, the 2Q mach(8) writes acc1 which does not exist for
|
|
|
|
|
|
* integer data types.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Since we only want the low 32-bits of the result, we can do two
|
|
|
|
|
|
* 32-bit x 16-bit multiplies (like the mul and mach are doing), and
|
|
|
|
|
|
* adjust the high result and add them (like the mach is doing):
|
|
|
|
|
|
*
|
|
|
|
|
|
* mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW
|
|
|
|
|
|
* mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW
|
|
|
|
|
|
* shl(8) g9<1>D g8<8,8,1>D 16D
|
|
|
|
|
|
* add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D
|
|
|
|
|
|
*
|
|
|
|
|
|
* We avoid the shl instruction by realizing that we only want to add
|
|
|
|
|
|
* the low 16-bits of the "high" result to the high 16-bits of the
|
|
|
|
|
|
* "low" result and using proper regioning on the add:
|
|
|
|
|
|
*
|
|
|
|
|
|
* mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW
|
|
|
|
|
|
* mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW
|
|
|
|
|
|
* add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW
|
|
|
|
|
|
*
|
|
|
|
|
|
* Since it does not use the (single) accumulator register, we can
|
|
|
|
|
|
* schedule multi-component multiplications much better.
|
|
|
|
|
|
*/
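/* Spelled out, the identity used here (all arithmetic mod 2^32) is:
*
*    b     = (b_hi << 16) | b_lo
*    a * b = a * b_lo + ((a * b_hi) << 16)
*
* so only the low 16 bits of a * b_hi can still influence the result, and
* they only land in the high 16 bits of a * b_lo -- which is what the
* strided UW add below implements.
*/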
      bool needs_mov = false;
      fs_reg orig_dst = inst->dst;

      /* Get a new VGRF for the "low" 32x16-bit multiplication result if
       * reusing the original destination is impossible due to hardware
       * restrictions, source/destination overlap, or it being the null
       * register.
       */
      fs_reg low = inst->dst;
      if (orig_dst.is_null() || orig_dst.file == MRF ||
          regions_overlap(inst->dst, inst->size_written,
                          inst->src[0], inst->size_read(0)) ||
          regions_overlap(inst->dst, inst->size_written,
                          inst->src[1], inst->size_read(1)) ||
          inst->dst.stride >= 4) {
         needs_mov = true;
         low = fs_reg(VGRF, alloc.allocate(regs_written(inst)),
                      inst->dst.type);
      }

      /* Get a new VGRF but keep the same stride as inst->dst */
      fs_reg high(VGRF, alloc.allocate(regs_written(inst)), inst->dst.type);
      high.stride = inst->dst.stride;
      high.offset = inst->dst.offset % REG_SIZE;

      if (devinfo->gen >= 7) {
         if (inst->src[1].abs)
            lower_src_modifiers(this, block, inst, 1);

         if (inst->src[1].file == IMM) {
            ibld.MUL(low, inst->src[0],
                     brw_imm_uw(inst->src[1].ud & 0xffff));
            ibld.MUL(high, inst->src[0],
                     brw_imm_uw(inst->src[1].ud >> 16));
         } else {
            ibld.MUL(low, inst->src[0],
                     subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
            ibld.MUL(high, inst->src[0],
                     subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
         }
      } else {
         if (inst->src[0].abs)
            lower_src_modifiers(this, block, inst, 0);

         ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
                  inst->src[1]);
         ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
                  inst->src[1]);
      }

      ibld.ADD(subscript(low, BRW_REGISTER_TYPE_UW, 1),
               subscript(low, BRW_REGISTER_TYPE_UW, 1),
               subscript(high, BRW_REGISTER_TYPE_UW, 0));

      if (needs_mov || inst->conditional_mod)
         set_condmod(inst->conditional_mod, ibld.MOV(orig_dst, low));
   }
}

void
fs_visitor::lower_mul_qword_inst(fs_inst *inst, bblock_t *block)
{
   const fs_builder ibld(this, block, inst);

   /* Considering two 64-bit integers ab and cd where each letter        ab
    * corresponds to 32 bits, we get a 128-bit result WXYZ. We         * cd
    * only need to provide the YZ part of the result.               -------
    *                                                                    BD
    *  Only BD needs to be 64 bits. For AD and BC we only care       +  AD
    *  about the lower 32 bits (since they are part of the upper     +  BC
    *  32 bits of our result). AC is not needed since it starts      + AC
    *  on the 65th bit of the result.                               -------
    *                                                                  WXYZ
    */
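   /* Illustrative scalar sketch (added for clarity, not part of the original
    * comment): with 32-bit halves
    *
    *    uint32_t a = src0 >> 32, b = (uint32_t)src0;
    *    uint32_t c = src1 >> 32, d = (uint32_t)src1;
    *
    * the low 64 bits of the product are
    *
    *    uint64_t bd = (uint64_t)b * d;
    *    uint64_t lo = bd + ((uint64_t)(uint32_t)(a * d + b * c) << 32);
    *
    * which is exactly the BD / AD / BC split computed below.
    */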
   unsigned int q_regs = regs_written(inst);
   unsigned int d_regs = (q_regs + 1) / 2;

   fs_reg bd(VGRF, alloc.allocate(q_regs), BRW_REGISTER_TYPE_UQ);
   fs_reg ad(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
   fs_reg bc(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);

   /* Here we need the full 64 bit result for 32b * 32b. */
   if (devinfo->has_integer_dword_mul) {
      ibld.MUL(bd, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
               subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
   } else {
      fs_reg bd_high(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
      fs_reg bd_low(VGRF, alloc.allocate(d_regs), BRW_REGISTER_TYPE_UD);
      fs_reg acc = retype(brw_acc_reg(inst->exec_size), BRW_REGISTER_TYPE_UD);

      fs_inst *mul = ibld.MUL(acc,
                            subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
                            subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
      mul->writes_accumulator = true;

      ibld.MACH(bd_high, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
                subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
      ibld.MOV(bd_low, acc);

      ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 0), bd_low);
      ibld.MOV(subscript(bd, BRW_REGISTER_TYPE_UD, 1), bd_high);
   }

   ibld.MUL(ad, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 1),
            subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 0));
   ibld.MUL(bc, subscript(inst->src[0], BRW_REGISTER_TYPE_UD, 0),
            subscript(inst->src[1], BRW_REGISTER_TYPE_UD, 1));

   ibld.ADD(ad, ad, bc);
   ibld.ADD(subscript(bd, BRW_REGISTER_TYPE_UD, 1),
            subscript(bd, BRW_REGISTER_TYPE_UD, 1), ad);

   ibld.MOV(inst->dst, bd);
}

void
fs_visitor::lower_mulh_inst(fs_inst *inst, bblock_t *block)
{
   const fs_builder ibld(this, block, inst);

   /* According to the BDW+ BSpec page for the "Multiply Accumulate
    * High" instruction:
    *
    *   "An added preliminary mov is required for source modification on
    *    src1:
    *       mov (8) r3.0<1>:d -r3<8;8,1>:d
    *       mul (8) acc0:d r2.0<8;8,1>:d r3.0<16;8,2>:uw
    *       mach (8) r5.0<1>:d r2.0<8;8,1>:d r3.0<8;8,1>:d"
    */
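   /* For reference (added note, not taken from the BSpec text above):
    * SHADER_OPCODE_MULH produces the high 32 bits of a 32x32-bit multiply,
    * i.e. for unsigned sources the scalar equivalent of
    *
    *    uint32_t mulh(uint32_t a, uint32_t b)
    *    {
    *       return (uint32_t)(((uint64_t)a * b) >> 32);
    *    }
    *
    * which is what the MUL into the accumulator followed by MACH implements
    * below (the signed case is analogous).
    */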
   if (devinfo->gen >= 8 && (inst->src[1].negate || inst->src[1].abs))
      lower_src_modifiers(this, block, inst, 1);

   /* Should have been lowered to 8-wide. */
   assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst));
   const fs_reg acc = retype(brw_acc_reg(inst->exec_size), inst->dst.type);
   fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
   fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);

   if (devinfo->gen >= 8) {
      /* Until Gen8, integer multiplies read 32 bits from one source and
       * 16 bits from the other, relying on the MACH instruction to
       * generate the high bits of the result.
       *
       * On Gen8, the multiply instruction does a full 32x32-bit
       * multiply, but in order to do a 64-bit multiply we can simulate
       * the previous behavior and then use a MACH instruction.
       */
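      /* Concretely (added illustration): retyping a <8;8,1>:d src1 to :uw
       * and doubling its stride yields a <16;8,2>:uw region, so each
       * channel reads only the low 16 bits of the original dword. That
       * matches the pre-Gen8 MUL behavior that MACH expects, as in the
       * BSpec sequence quoted above.
       */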
      assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
             mul->src[1].type == BRW_REGISTER_TYPE_UD);
      mul->src[1].type = BRW_REGISTER_TYPE_UW;
      mul->src[1].stride *= 2;

      if (mul->src[1].file == IMM) {
         mul->src[1] = brw_imm_uw(mul->src[1].ud);
      }
   } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
              inst->group > 0) {
      /* Among other things the quarter control bits influence which
       * accumulator register is used by the hardware for instructions
       * that access the accumulator implicitly (e.g. MACH).  A
       * second-half instruction would normally map to acc1, which
       * doesn't exist on Gen7 and up (the hardware does emulate it for
       * floating-point instructions *only* by taking advantage of the
       * extra precision of acc0 not normally used for floating point
       * arithmetic).
       *
       * HSW and up are careful enough not to try to access an
       * accumulator register that doesn't exist, but on earlier Gen7
       * hardware we need to make sure that the quarter control bits are
       * zero to avoid non-deterministic behaviour and emit an extra MOV
       * to get the result masked correctly according to the current
       * channel enables.
       */
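      /* Roughly, the fixup below rewrites the second-half MACH as (sketch,
       * not literal assembly):
       *
       *    mach(8) tmp   src0 src1   { 1Q, WE_all }
       *    mov(8)  dst   tmp         { 2Q }
       *
       * so the implicit accumulator access happens with quarter control
       * zero, and the original channel masking is reapplied by the MOV.
       */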
      mach->group = 0;
      mach->force_writemask_all = true;
      mach->dst = ibld.vgrf(inst->dst.type);
      ibld.MOV(inst->dst, mach->dst);
   }
}

bool
fs_visitor::lower_integer_multiplication()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      if (inst->opcode == BRW_OPCODE_MUL) {
         if ((inst->dst.type == BRW_REGISTER_TYPE_Q ||
              inst->dst.type == BRW_REGISTER_TYPE_UQ) &&
             (inst->src[0].type == BRW_REGISTER_TYPE_Q ||
              inst->src[0].type == BRW_REGISTER_TYPE_UQ) &&
             (inst->src[1].type == BRW_REGISTER_TYPE_Q ||
              inst->src[1].type == BRW_REGISTER_TYPE_UQ)) {
            lower_mul_qword_inst(inst, block);
            inst->remove(block);
            progress = true;
         } else if (!inst->dst.is_accumulator() &&
                    (inst->dst.type == BRW_REGISTER_TYPE_D ||
                     inst->dst.type == BRW_REGISTER_TYPE_UD) &&
                    !devinfo->has_integer_dword_mul) {
            lower_mul_dword_inst(inst, block);
            inst->remove(block);
            progress = true;
         }
      } else if (inst->opcode == SHADER_OPCODE_MULH) {
         lower_mulh_inst(inst, block);
         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

bool
fs_visitor::lower_minmax()
{
   assert(devinfo->gen < 6);

   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const fs_builder ibld(this, block, inst);

      if (inst->opcode == BRW_OPCODE_SEL &&
          inst->predicate == BRW_PREDICATE_NONE) {
         /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
          * the original SEL.L/GE instruction
          */
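         /* The rewrite below turns, for example (sketch only):
          *
          *    sel.l(8)      dst   src0  src1
          *
          * into
          *
          *    cmp.l.f0(8)   null  src0  src1
          *    (+f0) sel(8)  dst   src0  src1
          *
          * making the comparison explicit and predicating the SEL, since
          * SEL with a conditional modifier is not available on these parts.
          */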
         ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
                  inst->conditional_mod);
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->conditional_mod = BRW_CONDITIONAL_NONE;

         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
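
/* Fill dst[] with the per-component color sources for an FB write payload,
 * first clamping the color to [0, 1] when the key requires fragment color
 * clamping.  (Descriptive note added for clarity.)
 */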
static void
setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
                    fs_reg *dst, fs_reg color, unsigned components)
{
   if (key->clamp_fragment_color) {
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
      assert(color.type == BRW_REGISTER_TYPE_F);

      for (unsigned i = 0; i < components; i++)
         set_saturate(true,
                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));

      color = tmp;
   }

   for (unsigned i = 0; i < components; i++)
      dst[i] = offset(color, bld, i);
}

uint32_t
brw_fb_write_msg_control(const fs_inst *inst,
                         const struct brw_wm_prog_data *prog_data)
{
   uint32_t mctl;

   if (inst->opcode == FS_OPCODE_REP_FB_WRITE) {
      assert(inst->group == 0 && inst->exec_size == 16);
      mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
   } else if (prog_data->dual_src_blend) {
      assert(inst->exec_size == 8);

      if (inst->group % 16 == 0)
         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
      else if (inst->group % 16 == 8)
         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
      else
         unreachable("Invalid dual-source FB write instruction group");
   } else {
      assert(inst->group == 0 || (inst->group == 16 && inst->exec_size == 16));

      if (inst->exec_size == 16)
         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
      else if (inst->exec_size == 8)
         mctl = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
      else
         unreachable("Invalid FB write execution size");
   }

   return mctl;
}

static void
|
|
|
|
|
|
lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
|
2016-09-08 23:48:51 -07:00
|
|
|
|
const struct brw_wm_prog_data *prog_data,
|
2015-07-27 16:14:36 +03:00
|
|
|
|
const brw_wm_prog_key *key,
|
|
|
|
|
|
const fs_visitor::thread_payload &payload)
|
|
|
|
|
|
{
|
2015-10-20 14:29:37 -07:00
|
|
|
|
assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
|
2016-08-22 15:01:08 -07:00
|
|
|
|
const gen_device_info *devinfo = bld.shader->devinfo;
|
2015-10-20 14:29:37 -07:00
|
|
|
|
const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
|
|
|
|
|
|
const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
|
|
|
|
|
|
const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
|
|
|
|
|
|
const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
|
|
|
|
|
|
const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
|
2015-10-20 14:29:39 -07:00
|
|
|
|
const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
|
2015-10-20 14:29:37 -07:00
|
|
|
|
fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
|
|
|
|
|
|
const unsigned components =
|
2015-10-24 14:55:57 -07:00
|
|
|
|
inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
|
2015-07-27 16:14:36 +03:00
|
|
|
|
|
2015-07-13 17:59:34 +03:00
|
|
|
|
/* We can potentially have a message length of up to 15, so we have to set
|
|
|
|
|
|
* base_mrf to either 0 or 1 in order to fit in m0..m15.
|
|
|
|
|
|
*/
|
|
|
|
|
|
fs_reg sources[15];
|
|
|
|
|
|
int header_size = 2, payload_header_size;
|
|
|
|
|
|
unsigned length = 0;
|
|
|
|
|
|
|
2018-05-17 08:46:03 -07:00
|
|
|
|
if (devinfo->gen < 6) {
|
2017-01-13 14:25:37 -08:00
|
|
|
|
/* TODO: Support SIMD32 on gen4-5 */
|
|
|
|
|
|
assert(bld.group() < 16);
|
|
|
|
|
|
|
2018-05-17 08:46:03 -07:00
|
|
|
|
/* For gen4-5, we always have a header consisting of g0 and g1. We have
|
|
|
|
|
|
* an implied MOV from g0,g1 to the start of the message. The MOV from
|
|
|
|
|
|
* g0 is handled by the hardware and the MOV from g1 is provided by the
|
|
|
|
|
|
* generator. This is required because, on gen4-5, the generator may
|
|
|
|
|
|
* generate two write messages with different message lengths in order
|
|
|
|
|
|
* to handle AA data properly.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Also, since the pixel mask goes in the g0 portion of the message and
|
|
|
|
|
|
* since render target writes are the last thing in the shader, we write
|
|
|
|
|
|
* the pixel mask directly into g0 and it will get copied as part of the
|
|
|
|
|
|
* implied write.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (prog_data->uses_kill) {
|
|
|
|
|
|
bld.exec_all().group(1, 0)
|
|
|
|
|
|
.MOV(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW),
|
|
|
|
|
|
brw_flag_reg(0, 1));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
assert(length == 0);
|
|
|
|
|
|
length = 2;
|
|
|
|
|
|
} else if ((devinfo->gen <= 7 && !devinfo->is_haswell &&
|
|
|
|
|
|
prog_data->uses_kill) ||
|
2019-08-23 18:23:32 -07:00
|
|
|
|
(devinfo->gen < 11 &&
|
|
|
|
|
|
(color1.file != BAD_FILE || key->nr_color_regions > 1))) {
|
2018-05-17 08:46:03 -07:00
|
|
|
|
/* From the Sandy Bridge PRM, volume 4, page 198:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "Dispatched Pixel Enables. One bit per pixel indicating
|
|
|
|
|
|
* which pixels were originally enabled when the thread was
|
|
|
|
|
|
* dispatched. This field is only required for the end-of-
|
|
|
|
|
|
* thread message and on all dual-source messages."
|
|
|
|
|
|
*/
|
|
|
|
|
|
const fs_builder ubld = bld.exec_all().group(8, 0);
|
|
|
|
|
|
|
|
|
|
|
|
fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
|
2017-01-13 14:25:37 -08:00
|
|
|
|
if (bld.group() < 16) {
|
|
|
|
|
|
/* The header starts off as g0 and g1 for the first half */
|
|
|
|
|
|
ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
|
|
|
|
|
|
BRW_REGISTER_TYPE_UD));
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* The header starts off as g0 and g2 for the second half */
|
|
|
|
|
|
assert(bld.group() < 32);
|
|
|
|
|
|
const fs_reg header_sources[2] = {
|
|
|
|
|
|
retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
|
|
|
|
|
|
retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD),
|
|
|
|
|
|
};
|
|
|
|
|
|
ubld.LOAD_PAYLOAD(header, header_sources, 2, 0);
|
|
|
|
|
|
}
|
2018-05-17 08:46:03 -07:00
|
|
|
|
|
|
|
|
|
|
uint32_t g00_bits = 0;
|
|
|
|
|
|
|
|
|
|
|
|
/* Set "Source0 Alpha Present to RenderTarget" bit in message
|
|
|
|
|
|
* header.
|
|
|
|
|
|
*/
|
|
|
|
|
if (inst->target > 0 && prog_data->replicate_alpha)
|
2018-05-17 08:46:03 -07:00
|
|
|
|
g00_bits |= 1 << 11;
|
|
|
|
|
|
|
|
|
|
|
|
/* Set computes stencil to render target */
|
|
|
|
|
|
if (prog_data->computed_stencil)
|
|
|
|
|
|
g00_bits |= 1 << 14;
|
|
|
|
|
|
|
|
|
|
|
|
if (g00_bits) {
|
|
|
|
|
|
/* OR extra bits into g0.0 */
|
|
|
|
|
|
ubld.group(1, 0).OR(component(header, 0),
|
|
|
|
|
|
retype(brw_vec1_grf(0, 0),
|
|
|
|
|
|
BRW_REGISTER_TYPE_UD),
|
|
|
|
|
|
brw_imm_ud(g00_bits));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Set the render target index for choosing BLEND_STATE. */
|
|
|
|
|
|
if (inst->target > 0) {
|
|
|
|
|
|
ubld.group(1, 0).MOV(component(header, 2), brw_imm_ud(inst->target));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (prog_data->uses_kill) {
|
2017-01-13 14:25:37 -08:00
|
|
|
|
assert(bld.group() < 16);
|
2018-05-17 08:46:03 -07:00
|
|
|
|
ubld.group(1, 0).MOV(retype(component(header, 15),
|
|
|
|
|
|
BRW_REGISTER_TYPE_UW),
|
|
|
|
|
|
brw_flag_reg(0, 1));
|
|
|
|
|
|
}
|
2015-07-13 17:59:34 +03:00
|
|
|
|
|
2018-05-17 08:46:03 -07:00
|
|
|
|
assert(length == 0);
|
|
|
|
|
|
sources[0] = header;
|
|
|
|
|
|
sources[1] = horiz_offset(header, 8);
|
|
|
|
|
|
length = 2;
|
2015-07-13 17:59:34 +03:00
|
|
|
|
}
|
2018-05-17 08:46:03 -07:00
|
|
|
|
assert(length == 0 || length == 2);
|
|
|
|
|
|
header_size = length;
|
2015-07-13 17:59:34 +03:00
|
|
|
|
|
2017-01-13 15:36:51 -08:00
|
|
|
|
if (payload.aa_dest_stencil_reg[0]) {
|
2017-01-13 14:25:37 -08:00
|
|
|
|
assert(inst->group < 16);
|
2015-10-26 17:09:25 -07:00
|
|
|
|
sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
|
2015-07-13 17:59:34 +03:00
|
|
|
|
bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
|
|
|
|
|
|
.MOV(sources[length],
|
2017-01-13 15:36:51 -08:00
|
|
|
|
fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg[0], 0)));
|
2015-07-13 17:59:34 +03:00
|
|
|
|
length++;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-08-23 18:23:32 -07:00
|
|
|
|
bool src0_alpha_present = false;
|
|
|
|
|
|
|
2019-02-27 17:10:42 +02:00
|
|
|
|
if (src0_alpha.file != BAD_FILE) {
|
|
|
|
|
|
for (unsigned i = 0; i < bld.dispatch_width() / 8; i++) {
|
|
|
|
|
|
const fs_builder &ubld = bld.exec_all().group(8, i)
|
|
|
|
|
|
.annotate("FB write src0 alpha");
|
|
|
|
|
|
const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_F);
|
|
|
|
|
|
ubld.MOV(tmp, horiz_offset(src0_alpha, i * 8));
|
|
|
|
|
|
setup_color_payload(ubld, key, &sources[length], tmp, 1);
|
|
|
|
|
|
length++;
|
|
|
|
|
|
}
|
2019-08-23 18:23:32 -07:00
|
|
|
|
src0_alpha_present = true;
|
2019-02-27 17:10:42 +02:00
|
|
|
|
} else if (prog_data->replicate_alpha && inst->target != 0) {
|
|
|
|
|
|
/* Handle the case when fragment shader doesn't write to draw buffer
|
|
|
|
|
|
* zero. No need to call setup_color_payload() for src0_alpha because
|
|
|
|
|
|
* alpha value will be undefined.
|
|
|
|
|
|
*/
|
|
|
|
|
|
length += bld.dispatch_width() / 8;
|
2019-08-23 18:23:32 -07:00
|
|
|
|
src0_alpha_present = true;
|
2019-02-27 17:10:42 +02:00
|
|
|
|
}
|
|
|
|
|
|
|
2016-04-29 19:47:44 -07:00
|
|
|
|
if (sample_mask.file != BAD_FILE) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
|
2015-07-13 17:59:34 +03:00
|
|
|
|
BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
|
|
|
|
|
|
/* Hand over gl_SampleMask. Only the lower 16 bits of each channel are
|
|
|
|
|
|
* relevant. Since it's unsigned single words one vgrf is always
|
|
|
|
|
|
* 16-wide, but only the lower or higher 8 channels will be used by the
|
|
|
|
|
|
* hardware when doing a SIMD8 write depending on whether we have
|
|
|
|
|
|
* selected the subspans for the first or second half respectively.
|
|
|
|
|
|
*/
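      /* For example (illustration only): the UD mask is retyped below to UW
       * with doubled stride, so only the low 16 bits of each channel are
       * read, and the destination is offset by inst->group % 16 so the
       * values land in the half of the payload register that the hardware
       * reads for this SIMD8 write.
       */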
|
|
|
|
|
|
assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
|
|
|
|
|
|
sample_mask.type = BRW_REGISTER_TYPE_UW;
|
|
|
|
|
|
sample_mask.stride *= 2;
|
|
|
|
|
|
|
|
|
|
|
|
bld.exec_all().annotate("FB write oMask")
|
2016-05-20 16:14:13 -07:00
|
|
|
|
.MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
|
2017-01-13 14:25:37 -08:00
|
|
|
|
inst->group % 16),
|
2015-07-13 17:59:34 +03:00
|
|
|
|
sample_mask);
|
|
|
|
|
|
length++;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
payload_header_size = length;
|
|
|
|
|
|
|
|
|
|
|
|
setup_color_payload(bld, key, &sources[length], color0, components);
|
|
|
|
|
|
length += 4;
|
|
|
|
|
|
|
|
|
|
|
|
if (color1.file != BAD_FILE) {
|
|
|
|
|
|
setup_color_payload(bld, key, &sources[length], color1, components);
|
|
|
|
|
|
length += 4;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (src_depth.file != BAD_FILE) {
|
|
|
|
|
|
sources[length] = src_depth;
|
|
|
|
|
|
length++;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (dst_depth.file != BAD_FILE) {
|
|
|
|
|
|
sources[length] = dst_depth;
|
|
|
|
|
|
length++;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-10-20 14:29:39 -07:00
|
|
|
|
if (src_stencil.file != BAD_FILE) {
|
|
|
|
|
|
assert(devinfo->gen >= 9);
|
2017-01-13 14:25:37 -08:00
|
|
|
|
assert(bld.dispatch_width() == 8);
|
2015-10-20 14:29:39 -07:00
|
|
|
|
|
2015-11-16 17:23:01 -08:00
|
|
|
|
/* XXX: src_stencil is only available on gen9+. dst_depth is never
|
|
|
|
|
|
* available on gen9+. As such it's impossible to have both enabled at the
|
|
|
|
|
|
* same time and therefore length cannot overrun the array.
|
|
|
|
|
|
*/
|
|
|
|
|
|
assert(length < 15);
|
|
|
|
|
|
|
2015-10-20 14:29:39 -07:00
|
|
|
|
sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
bld.exec_all().annotate("FB write OS")
|
2016-05-20 00:13:19 -07:00
|
|
|
|
.MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
|
|
|
|
|
|
subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
|
2015-10-20 14:29:39 -07:00
|
|
|
|
length++;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-07-13 17:59:34 +03:00
|
|
|
|
fs_inst *load;
|
|
|
|
|
|
if (devinfo->gen >= 7) {
|
|
|
|
|
|
/* Send from the GRF */
|
2015-10-26 17:09:25 -07:00
|
|
|
|
fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
|
2015-07-13 17:59:34 +03:00
|
|
|
|
load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
|
2016-09-07 16:59:35 -07:00
|
|
|
|
payload.nr = bld.shader->alloc.allocate(regs_written(load));
|
2015-07-13 17:59:34 +03:00
|
|
|
|
load->dst = payload;
|
|
|
|
|
|
|
2019-08-26 00:05:21 -07:00
|
|
|
|
uint32_t msg_ctl = brw_fb_write_msg_control(inst, prog_data);
|
|
|
|
|
|
uint32_t ex_desc = 0;
|
|
|
|
|
|
|
|
|
|
|
|
inst->desc =
|
2019-09-01 21:57:05 -05:00
|
|
|
|
(inst->group / 16) << 11 | /* rt slot group */
|
2019-08-26 00:05:21 -07:00
|
|
|
|
brw_dp_write_desc(devinfo, inst->target, msg_ctl,
|
|
|
|
|
|
GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE,
|
|
|
|
|
|
inst->last_rt, false);
|
|
|
|
|
|
|
2019-08-23 18:23:32 -07:00
|
|
|
|
if (devinfo->gen >= 11) {
|
|
|
|
|
|
/* Set the "Render Target Index" and "Src0 Alpha Present" fields
|
|
|
|
|
|
* in the extended message descriptor, in lieu of using a header.
|
|
|
|
|
|
*/
|
|
|
|
|
|
ex_desc = inst->target << 12 | src0_alpha_present << 15;
|
2019-08-27 11:22:33 -07:00
|
|
|
|
|
|
|
|
|
|
if (key->nr_color_regions == 0)
|
|
|
|
|
|
ex_desc |= 1 << 20; /* Null Render Target */
|
2019-08-23 18:23:32 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2019-08-26 00:05:21 -07:00
|
|
|
|
inst->opcode = SHADER_OPCODE_SEND;
|
|
|
|
|
|
inst->resize_sources(3);
|
|
|
|
|
|
inst->sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
|
|
|
|
|
|
inst->src[0] = brw_imm_ud(inst->desc);
|
|
|
|
|
|
inst->src[1] = brw_imm_ud(ex_desc);
|
|
|
|
|
|
inst->src[2] = payload;
|
|
|
|
|
|
inst->mlen = regs_written(load);
|
|
|
|
|
|
inst->ex_mlen = 0;
|
|
|
|
|
|
inst->header_size = header_size;
|
|
|
|
|
|
inst->check_tdr = true;
|
|
|
|
|
|
inst->send_has_side_effects = true;
|
2015-07-13 17:59:34 +03:00
|
|
|
|
} else {
|
|
|
|
|
|
/* Send from the MRF */
|
|
|
|
|
|
load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
|
|
|
|
|
|
sources, length, payload_header_size);
|
|
|
|
|
|
|
|
|
|
|
|
/* On pre-SNB, we have to interlace the color values. LOAD_PAYLOAD
|
|
|
|
|
|
* will do this for us if we just give it a COMPR4 destination.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (devinfo->gen < 6 && bld.dispatch_width() == 16)
|
2015-10-26 04:35:14 -07:00
|
|
|
|
load->dst.nr |= BRW_MRF_COMPR4;
|
2015-07-13 17:59:34 +03:00
|
|
|
|
|
2018-05-17 15:40:48 -07:00
|
|
|
|
if (devinfo->gen < 6) {
|
|
|
|
|
|
/* Set up src[0] for the implied MOV from grf0-1 */
|
|
|
|
|
|
inst->resize_sources(1);
|
|
|
|
|
|
inst->src[0] = brw_vec8_grf(0, 0);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
inst->resize_sources(0);
|
|
|
|
|
|
}
|
2015-07-13 17:59:34 +03:00
|
|
|
|
inst->base_mrf = 1;
|
2019-08-26 00:05:21 -07:00
|
|
|
|
inst->opcode = FS_OPCODE_FB_WRITE;
|
|
|
|
|
|
inst->mlen = regs_written(load);
|
|
|
|
|
|
inst->header_size = header_size;
|
2015-07-13 17:59:34 +03:00
|
|
|
|
}
|
2015-07-27 16:14:36 +03:00
|
|
|
|
}
|
|
|
|
|
|
|
2016-07-21 16:55:45 -07:00
|
|
|
|
static void
|
|
|
|
|
|
lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
|
|
|
|
|
|
{
|
2017-01-09 16:43:24 -08:00
|
|
|
|
const fs_builder &ubld = bld.exec_all().group(8, 0);
|
2016-07-21 16:55:45 -07:00
|
|
|
|
const unsigned length = 2;
|
2017-01-09 16:43:24 -08:00
|
|
|
|
const fs_reg header = ubld.vgrf(BRW_REGISTER_TYPE_UD, length);
|
2016-07-21 16:55:45 -07:00
|
|
|
|
|
2017-01-09 16:43:24 -08:00
|
|
|
|
if (bld.group() < 16) {
|
|
|
|
|
|
ubld.group(16, 0).MOV(header, retype(brw_vec8_grf(0, 0),
|
|
|
|
|
|
BRW_REGISTER_TYPE_UD));
|
|
|
|
|
|
} else {
|
|
|
|
|
|
assert(bld.group() < 32);
|
|
|
|
|
|
const fs_reg header_sources[] = {
|
|
|
|
|
|
retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD),
|
|
|
|
|
|
retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_UD)
|
|
|
|
|
|
};
|
|
|
|
|
|
ubld.LOAD_PAYLOAD(header, header_sources, ARRAY_SIZE(header_sources), 0);
|
|
|
|
|
|
}
|
2016-07-21 16:55:45 -07:00
|
|
|
|
|
|
|
|
|
|
inst->resize_sources(1);
|
|
|
|
|
|
inst->src[0] = header;
|
|
|
|
|
|
inst->opcode = FS_OPCODE_FB_READ;
|
|
|
|
|
|
inst->mlen = length;
|
|
|
|
|
|
inst->header_size = length;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-07-18 17:09:37 +03:00
|
|
|
|
static void
|
|
|
|
|
|
lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
|
|
|
|
|
|
const fs_reg &coordinate,
|
|
|
|
|
|
const fs_reg &shadow_c,
|
|
|
|
|
|
const fs_reg &lod, const fs_reg &lod2,
|
2016-02-05 18:24:02 -08:00
|
|
|
|
const fs_reg &surface,
|
2015-07-18 17:09:37 +03:00
|
|
|
|
const fs_reg &sampler,
|
|
|
|
|
|
unsigned coord_components,
|
|
|
|
|
|
unsigned grad_components)
|
|
|
|
|
|
{
|
|
|
|
|
|
const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
|
|
|
|
|
|
op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
|
|
|
|
|
|
fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
|
|
|
|
|
|
fs_reg msg_end = msg_begin;
|
|
|
|
|
|
|
|
|
|
|
|
/* g0 header. */
|
|
|
|
|
|
msg_end = offset(msg_end, bld.group(8, 0), 1);
|
|
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < coord_components; i++)
|
|
|
|
|
|
bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
|
|
|
|
|
|
offset(coordinate, bld, i));
|
|
|
|
|
|
|
|
|
|
|
|
msg_end = offset(msg_end, bld, coord_components);
|
|
|
|
|
|
|
|
|
|
|
|
/* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
|
|
|
|
|
|
* require all three components to be present and zero if they are unused.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (coord_components > 0 &&
|
|
|
|
|
|
(has_lod || shadow_c.file != BAD_FILE ||
|
|
|
|
|
|
(op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
|
|
|
|
|
|
for (unsigned i = coord_components; i < 3; i++)
|
2015-11-02 11:26:16 -08:00
|
|
|
|
bld.MOV(offset(msg_end, bld, i), brw_imm_f(0.0f));
|
2015-07-18 17:09:37 +03:00
|
|
|
|
|
|
|
|
|
|
msg_end = offset(msg_end, bld, 3 - coord_components);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (op == SHADER_OPCODE_TXD) {
|
|
|
|
|
|
/* TXD unsupported in SIMD16 mode. */
|
|
|
|
|
|
assert(bld.dispatch_width() == 8);
|
|
|
|
|
|
|
|
|
|
|
|
/* the slots for u and v are always present, but r is optional */
|
|
|
|
|
|
if (coord_components < 2)
|
|
|
|
|
|
msg_end = offset(msg_end, bld, 2 - coord_components);
|
|
|
|
|
|
|
|
|
|
|
|
/* P = u, v, r
|
|
|
|
|
|
* dPdx = dudx, dvdx, drdx
|
|
|
|
|
|
* dPdy = dudy, dvdy, drdy
|
|
|
|
|
|
*
|
|
|
|
|
|
* 1-arg: Does not exist.
|
|
|
|
|
|
*
|
|
|
|
|
|
* 2-arg: dudx dvdx dudy dvdy
|
|
|
|
|
|
* dPdx.x dPdx.y dPdy.x dPdy.y
|
|
|
|
|
|
* m4 m5 m6 m7
|
|
|
|
|
|
*
|
|
|
|
|
|
* 3-arg: dudx dvdx drdx dudy dvdy drdy
|
|
|
|
|
|
* dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
|
|
|
|
|
|
* m5 m6 m7 m8 m9 m10
|
|
|
|
|
|
*/
|
|
|
|
|
|
for (unsigned i = 0; i < grad_components; i++)
|
|
|
|
|
|
bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
|
|
|
|
|
|
|
|
|
|
|
|
msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
|
|
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < grad_components; i++)
|
|
|
|
|
|
bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
|
|
|
|
|
|
|
|
|
|
|
|
msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (has_lod) {
|
2016-12-12 08:32:38 -05:00
|
|
|
|
/* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
|
|
|
|
|
|
* shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
|
2015-07-18 17:09:37 +03:00
|
|
|
|
*/
|
|
|
|
|
|
assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
|
|
|
|
|
|
bld.dispatch_width() == 16);
|
|
|
|
|
|
|
|
|
|
|
|
const brw_reg_type type =
|
|
|
|
|
|
(op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
|
|
|
|
|
|
BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
|
|
|
|
|
|
bld.MOV(retype(msg_end, type), lod);
|
|
|
|
|
|
msg_end = offset(msg_end, bld, 1);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (shadow_c.file != BAD_FILE) {
|
|
|
|
|
|
if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
|
|
|
|
|
|
/* There's no plain shadow compare message, so we use shadow
|
|
|
|
|
|
* compare with a bias of 0.0.
|
|
|
|
|
|
*/
|
2015-11-02 11:26:16 -08:00
|
|
|
|
bld.MOV(msg_end, brw_imm_f(0.0f));
|
2015-07-18 17:09:37 +03:00
|
|
|
|
msg_end = offset(msg_end, bld, 1);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
bld.MOV(msg_end, shadow_c);
|
|
|
|
|
|
msg_end = offset(msg_end, bld, 1);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
inst->opcode = op;
|
|
|
|
|
|
inst->src[0] = reg_undef;
|
2016-02-05 18:24:02 -08:00
|
|
|
|
inst->src[1] = surface;
|
|
|
|
|
|
inst->src[2] = sampler;
|
|
|
|
|
|
inst->resize_sources(3);
|
2015-10-26 04:35:14 -07:00
|
|
|
|
inst->base_mrf = msg_begin.nr;
|
|
|
|
|
|
inst->mlen = msg_end.nr - msg_begin.nr;
|
2015-07-18 17:09:37 +03:00
|
|
|
|
inst->header_size = 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-07-18 16:52:06 +03:00
|
|
|
|
static void
|
|
|
|
|
|
lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
|
2016-04-30 21:54:47 -07:00
|
|
|
|
const fs_reg &coordinate,
|
2015-07-18 16:52:06 +03:00
|
|
|
|
const fs_reg &shadow_c,
|
2016-04-30 21:54:47 -07:00
|
|
|
|
const fs_reg &lod, const fs_reg &lod2,
|
2015-07-18 16:52:06 +03:00
|
|
|
|
const fs_reg &sample_index,
|
2016-02-05 18:24:02 -08:00
|
|
|
|
const fs_reg &surface,
|
2015-07-18 16:52:06 +03:00
|
|
|
|
const fs_reg &sampler,
|
|
|
|
|
|
unsigned coord_components,
|
|
|
|
|
|
unsigned grad_components)
|
|
|
|
|
|
{
|
|
|
|
|
|
fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
|
|
|
|
|
|
fs_reg msg_coords = message;
|
|
|
|
|
|
unsigned header_size = 0;
|
|
|
|
|
|
|
2016-11-28 18:13:02 -08:00
|
|
|
|
if (inst->offset != 0) {
|
2015-07-18 16:52:06 +03:00
|
|
|
|
/* The offsets set up by the visitor are in the m1 header, so we can't
|
|
|
|
|
|
* go headerless.
|
|
|
|
|
|
*/
|
|
|
|
|
|
header_size = 1;
|
2015-10-26 04:35:14 -07:00
|
|
|
|
message.nr--;
|
2015-07-18 16:52:06 +03:00
|
|
|
|
}
|
|
|
|
|
|
|
2016-04-30 21:54:47 -07:00
|
|
|
|
for (unsigned i = 0; i < coord_components; i++)
|
|
|
|
|
|
bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
|
|
|
|
|
|
offset(coordinate, bld, i));
|
|
|
|
|
|
|
2015-07-18 16:52:06 +03:00
|
|
|
|
fs_reg msg_end = offset(msg_coords, bld, coord_components);
|
|
|
|
|
|
fs_reg msg_lod = offset(msg_coords, bld, 4);
|
|
|
|
|
|
|
|
|
|
|
|
if (shadow_c.file != BAD_FILE) {
|
|
|
|
|
|
fs_reg msg_shadow = msg_lod;
|
|
|
|
|
|
bld.MOV(msg_shadow, shadow_c);
|
|
|
|
|
|
msg_lod = offset(msg_shadow, bld, 1);
|
|
|
|
|
|
msg_end = msg_lod;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
switch (op) {
|
|
|
|
|
|
case SHADER_OPCODE_TXL:
|
|
|
|
|
|
case FS_OPCODE_TXB:
|
|
|
|
|
|
bld.MOV(msg_lod, lod);
|
|
|
|
|
|
msg_end = offset(msg_lod, bld, 1);
|
|
|
|
|
|
break;
|
|
|
|
|
|
case SHADER_OPCODE_TXD:
|
|
|
|
|
|
/**
|
|
|
|
|
|
* P = u, v, r
|
|
|
|
|
|
* dPdx = dudx, dvdx, drdx
|
|
|
|
|
|
* dPdy = dudy, dvdy, drdy
|
|
|
|
|
|
*
|
|
|
|
|
|
* Load up these values:
|
|
|
|
|
|
* - dudx dudy dvdx dvdy drdx drdy
|
|
|
|
|
|
* - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
|
|
|
|
|
|
*/
|
|
|
|
|
|
msg_end = msg_lod;
|
|
|
|
|
|
for (unsigned i = 0; i < grad_components; i++) {
|
2016-04-30 21:54:47 -07:00
|
|
|
|
bld.MOV(msg_end, offset(lod, bld, i));
|
2015-07-18 16:52:06 +03:00
|
|
|
|
msg_end = offset(msg_end, bld, 1);
|
|
|
|
|
|
|
2016-04-30 21:54:47 -07:00
|
|
|
|
bld.MOV(msg_end, offset(lod2, bld, i));
|
2015-07-18 16:52:06 +03:00
|
|
|
|
msg_end = offset(msg_end, bld, 1);
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
case SHADER_OPCODE_TXS:
|
|
|
|
|
|
msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
bld.MOV(msg_lod, lod);
|
|
|
|
|
|
msg_end = offset(msg_lod, bld, 1);
|
|
|
|
|
|
break;
|
|
|
|
|
|
case SHADER_OPCODE_TXF:
|
|
|
|
|
|
msg_lod = offset(msg_coords, bld, 3);
|
|
|
|
|
|
bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
|
|
|
|
|
|
msg_end = offset(msg_lod, bld, 1);
|
|
|
|
|
|
break;
|
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS:
|
|
|
|
|
|
msg_lod = offset(msg_coords, bld, 3);
|
|
|
|
|
|
/* lod */
|
2015-11-02 11:26:16 -08:00
|
|
|
|
bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
|
2015-07-18 16:52:06 +03:00
|
|
|
|
/* sample index */
|
|
|
|
|
|
bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
|
|
|
|
|
|
msg_end = offset(msg_lod, bld, 2);
|
|
|
|
|
|
break;
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
inst->opcode = op;
|
|
|
|
|
|
inst->src[0] = reg_undef;
|
2016-02-05 18:24:02 -08:00
|
|
|
|
inst->src[1] = surface;
|
|
|
|
|
|
inst->src[2] = sampler;
|
|
|
|
|
|
inst->resize_sources(3);
|
2015-10-26 04:35:14 -07:00
|
|
|
|
inst->base_mrf = message.nr;
|
|
|
|
|
|
inst->mlen = msg_end.nr - message.nr;
|
2015-07-18 16:52:06 +03:00
|
|
|
|
inst->header_size = header_size;
|
|
|
|
|
|
|
|
|
|
|
|
/* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
|
|
|
|
|
|
assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-07-13 18:08:51 +03:00
|
|
|
|
static bool
|
2016-08-22 15:01:08 -07:00
|
|
|
|
is_high_sampler(const struct gen_device_info *devinfo, const fs_reg &sampler)
|
2015-07-13 18:08:51 +03:00
|
|
|
|
{
|
|
|
|
|
|
if (devinfo->gen < 8 && !devinfo->is_haswell)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
2015-10-24 14:55:57 -07:00
|
|
|
|
return sampler.file != IMM || sampler.ud >= 16;
|
2015-07-13 18:08:51 +03:00
|
|
|
|
}
|
|
|
|
|
|
|
2018-10-30 15:47:39 -05:00
|
|
|
|
static unsigned
|
|
|
|
|
|
sampler_msg_type(const gen_device_info *devinfo,
|
|
|
|
|
|
opcode opcode, bool shadow_compare)
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(devinfo->gen >= 5);
|
|
|
|
|
|
switch (opcode) {
|
|
|
|
|
|
case SHADER_OPCODE_TEX:
|
|
|
|
|
|
return shadow_compare ? GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE :
|
|
|
|
|
|
GEN5_SAMPLER_MESSAGE_SAMPLE;
|
|
|
|
|
|
case FS_OPCODE_TXB:
|
|
|
|
|
|
return shadow_compare ? GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE :
|
|
|
|
|
|
GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
|
|
|
|
|
|
case SHADER_OPCODE_TXL:
|
|
|
|
|
|
return shadow_compare ? GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE :
|
|
|
|
|
|
GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
|
|
|
|
|
|
case SHADER_OPCODE_TXL_LZ:
|
|
|
|
|
|
return shadow_compare ? GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ :
|
|
|
|
|
|
GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
|
|
|
|
|
|
case SHADER_OPCODE_TXS:
|
2019-02-21 09:59:35 -06:00
|
|
|
|
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
|
2018-10-30 15:47:39 -05:00
|
|
|
|
return GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
|
|
|
|
|
|
case SHADER_OPCODE_TXD:
|
|
|
|
|
|
assert(!shadow_compare || devinfo->gen >= 8 || devinfo->is_haswell);
|
|
|
|
|
|
return shadow_compare ? HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE :
|
|
|
|
|
|
GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
|
|
|
|
|
|
case SHADER_OPCODE_TXF:
|
|
|
|
|
|
return GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
|
|
|
|
|
|
case SHADER_OPCODE_TXF_LZ:
|
|
|
|
|
|
assert(devinfo->gen >= 9);
|
|
|
|
|
|
return GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
|
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W:
|
|
|
|
|
|
assert(devinfo->gen >= 9);
|
|
|
|
|
|
return GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
|
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS:
|
|
|
|
|
|
return devinfo->gen >= 7 ? GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS :
|
|
|
|
|
|
GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
|
|
|
|
|
|
case SHADER_OPCODE_TXF_UMS:
|
|
|
|
|
|
assert(devinfo->gen >= 7);
|
|
|
|
|
|
return GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
|
|
|
|
|
|
case SHADER_OPCODE_TXF_MCS:
|
|
|
|
|
|
assert(devinfo->gen >= 7);
|
|
|
|
|
|
return GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
|
|
|
|
|
|
case SHADER_OPCODE_LOD:
|
|
|
|
|
|
return GEN5_SAMPLER_MESSAGE_LOD;
|
|
|
|
|
|
case SHADER_OPCODE_TG4:
|
|
|
|
|
|
assert(devinfo->gen >= 7);
|
|
|
|
|
|
return shadow_compare ? GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C :
|
|
|
|
|
|
GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
|
|
|
|
|
|
break;
|
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET:
|
|
|
|
|
|
assert(devinfo->gen >= 7);
|
|
|
|
|
|
return shadow_compare ? GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C :
|
|
|
|
|
|
GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
|
|
|
|
|
|
case SHADER_OPCODE_SAMPLEINFO:
|
|
|
|
|
|
return GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
|
|
|
|
|
|
default:
|
|
|
|
|
|
unreachable("not reached");
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-07-13 18:08:51 +03:00
|
|
|
|
static void
|
|
|
|
|
|
lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
|
2016-04-30 21:54:47 -07:00
|
|
|
|
const fs_reg &coordinate,
|
2015-07-13 18:08:51 +03:00
|
|
|
|
const fs_reg &shadow_c,
|
2016-04-30 21:54:47 -07:00
|
|
|
|
fs_reg lod, const fs_reg &lod2,
|
2018-10-11 15:57:50 -05:00
|
|
|
|
const fs_reg &min_lod,
|
2015-07-13 18:08:51 +03:00
|
|
|
|
const fs_reg &sample_index,
|
2016-02-05 18:24:02 -08:00
|
|
|
|
const fs_reg &mcs,
|
|
|
|
|
|
const fs_reg &surface,
|
|
|
|
|
|
const fs_reg &sampler,
|
2019-02-06 15:42:17 -06:00
|
|
|
|
const fs_reg &surface_handle,
|
|
|
|
|
|
const fs_reg &sampler_handle,
|
2016-11-28 18:13:02 -08:00
|
|
|
|
const fs_reg &tg4_offset,
|
2015-07-13 18:08:51 +03:00
|
|
|
|
unsigned coord_components,
|
|
|
|
|
|
unsigned grad_components)
|
|
|
|
|
|
{
|
2016-08-22 15:01:08 -07:00
|
|
|
|
const gen_device_info *devinfo = bld.shader->devinfo;
|
2018-10-30 15:47:39 -05:00
|
|
|
|
const brw_stage_prog_data *prog_data = bld.shader->stage_prog_data;
|
2016-09-07 16:59:35 -07:00
|
|
|
|
unsigned reg_width = bld.dispatch_width() / 8;
|
2015-07-13 18:08:51 +03:00
|
|
|
|
unsigned header_size = 0, length = 0;
|
|
|
|
|
|
fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
|
|
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
|
|
|
|
|
|
sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
|
|
|
|
|
|
|
2019-02-06 15:42:17 -06:00
|
|
|
|
/* We must have exactly one of surface/sampler and surface/sampler_handle */
|
|
|
|
|
|
assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));
|
|
|
|
|
|
assert((sampler.file == BAD_FILE) != (sampler_handle.file == BAD_FILE));
|
|
|
|
|
|
|
2015-07-13 18:08:51 +03:00
|
|
|
|
if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
|
2016-11-28 18:13:02 -08:00
|
|
|
|
inst->offset != 0 || inst->eot ||
|
2016-05-20 00:37:37 -07:00
|
|
|
|
op == SHADER_OPCODE_SAMPLEINFO ||
|
2019-02-06 15:42:17 -06:00
|
|
|
|
sampler_handle.file != BAD_FILE ||
|
2015-07-13 18:08:51 +03:00
|
|
|
|
is_high_sampler(devinfo, sampler)) {
|
|
|
|
|
|
/* For general texture offsets (no txf workaround), we need a header to
|
2018-02-28 19:57:44 -08:00
|
|
|
|
* put them in.
|
2015-07-13 18:08:51 +03:00
|
|
|
|
*
|
|
|
|
|
|
* TG4 needs to place its channel select in the header, for interaction
|
|
|
|
|
|
* with ARB_texture_swizzle. The sampler index is only 4-bits, so for
|
|
|
|
|
|
* larger sampler numbers we need to offset the Sampler State Pointer in
|
|
|
|
|
|
* the header.
|
|
|
|
|
|
*/
|
2018-02-28 19:57:44 -08:00
|
|
|
|
fs_reg header = retype(sources[0], BRW_REGISTER_TYPE_UD);
|
2015-07-13 18:08:51 +03:00
|
|
|
|
header_size = 1;
|
|
|
|
|
|
length++;
|
|
|
|
|
|
|
|
|
|
|
/* If we're requesting fewer than four channels worth of response,
|
|
|
|
|
|
* and we have an explicit header, we need to set up the sampler
|
|
|
|
|
|
* writemask. It's reversed from normal: 1 means "don't write".
|
|
|
|
|
|
*/
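      /* Worked example (illustrative): if the send only writes two channels
       * worth of registers, regs_written(inst) / reg_width == 2 and the
       * computation below gives mask = ~0x3 & 0xf = 0xc, i.e. channels 0
       * and 1 are returned while channels 2 and 3 are suppressed.  The mask
       * is placed in bits 12..15 of inst->offset, which is copied into the
       * message header further down.
       */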
|
2016-09-07 16:59:35 -07:00
|
|
|
|
if (!inst->eot && regs_written(inst) != 4 * reg_width) {
|
|
|
|
|
|
assert(regs_written(inst) % reg_width == 0);
|
|
|
|
|
|
unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
|
i965/fs: Reduce the response length of sampler messages on Skylake.
Often, we don't need a full 4 channels worth of data from the sampler.
For example, depth comparisons and red textures only return one value.
To handle this, the sampler message header contains a mask which can
be used to disable channels, and reduce the message length (in SIMD16
mode on all hardware, and SIMD8 mode on Broadwell and later).
We've never used it before, since it required setting up a message
header. This meant trading a smaller response length for a larger
message length and additional MOVs to set it up.
However, Skylake introduces a terrific new feature: for headerless
messages, you can simply reduce the response length, and it makes
the implicit header contain an appropriate mask. So to read only
RG, you would simply set the message length to 2 or 4 (SIMD8/16).
This means we can finally take advantage of this at no cost.
total instructions in shared programs: 9091831 -> 9073067 (-0.21%)
instructions in affected programs: 191370 -> 172606 (-9.81%)
helped: 2609
HURT: 0
total cycles in shared programs: 70868114 -> 68454752 (-3.41%)
cycles in affected programs: 35841154 -> 33427792 (-6.73%)
helped: 16357
HURT: 8188
total spills in shared programs: 3492 -> 1707 (-51.12%)
spills in affected programs: 2749 -> 964 (-64.93%)
helped: 74
HURT: 0
total fills in shared programs: 4266 -> 2647 (-37.95%)
fills in affected programs: 3029 -> 1410 (-53.45%)
helped: 74
HURT: 0
LOST: 1
GAINED: 143
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2016-04-23 01:54:33 -07:00
|
|
|
|
inst->offset |= mask << 12;
|
|
|
|
|
|
}
|
2018-02-28 19:57:44 -08:00
|
|
|
|
|
|
|
|
|
|
/* Build the actual header */
|
|
|
|
|
|
const fs_builder ubld = bld.exec_all().group(8, 0);
|
|
|
|
|
|
const fs_builder ubld1 = ubld.group(1, 0);
|
|
|
|
|
|
ubld.MOV(header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
|
|
|
|
|
|
if (inst->offset) {
|
|
|
|
|
|
ubld1.MOV(component(header, 2), brw_imm_ud(inst->offset));
|
|
|
|
|
|
} else if (bld.shader->stage != MESA_SHADER_VERTEX &&
|
|
|
|
|
|
bld.shader->stage != MESA_SHADER_FRAGMENT) {
|
|
|
|
|
|
/* The vertex and fragment stages have g0.2 set to 0, so
|
|
|
|
|
|
* header0.2 is 0 when g0 is copied. Other stages may not, so we
|
|
|
|
|
|
* must set it to 0 to avoid setting undesirable bits in the
|
|
|
|
|
|
* message.
|
|
|
|
|
|
*/
|
|
|
|
|
|
ubld1.MOV(component(header, 2), brw_imm_ud(0));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-02-06 15:42:17 -06:00
|
|
|
|
if (sampler_handle.file != BAD_FILE) {
|
|
|
|
|
|
/* Bindless sampler handles aren't relative to the sampler state
|
|
|
|
|
|
* pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
|
|
|
|
|
|
* Instead, it's an absolute pointer relative to dynamic state base
|
|
|
|
|
|
* address.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Sampler states are 16 bytes each and the pointer we give here has
|
|
|
|
|
|
* to be 32-byte aligned. In order to avoid more indirect messages
|
|
|
|
|
|
* than required, we assume that all bindless sampler states are
|
|
|
|
|
|
* 32-byte aligned. This sacrifices a bit of general state base
|
|
|
|
|
|
* address space but means we can do something more efficient in the
|
|
|
|
|
|
* shader.
|
|
|
|
|
|
*/
|
|
|
|
|
|
ubld1.MOV(component(header, 3), sampler_handle);
|
|
|
|
|
|
} else if (is_high_sampler(devinfo, sampler)) {
|
2018-02-28 19:57:44 -08:00
|
|
|
|
if (sampler.file == BRW_IMMEDIATE_VALUE) {
|
|
|
|
|
|
assert(sampler.ud >= 16);
|
|
|
|
|
|
const int sampler_state_size = 16; /* 16 bytes */
|
|
|
|
|
|
|
|
|
|
|
|
ubld1.ADD(component(header, 3),
|
|
|
|
|
|
retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
|
|
|
|
|
|
brw_imm_ud(16 * (sampler.ud / 16) * sampler_state_size));
|
|
|
|
|
|
} else {
|
|
|
|
|
|
fs_reg tmp = ubld1.vgrf(BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
ubld1.AND(tmp, sampler, brw_imm_ud(0x0f0));
|
|
|
|
|
|
ubld1.SHL(tmp, tmp, brw_imm_ud(4));
|
|
|
|
|
|
ubld1.ADD(component(header, 3),
|
|
|
|
|
|
retype(brw_vec1_grf(0, 3), BRW_REGISTER_TYPE_UD),
|
|
|
|
|
|
tmp);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2015-07-13 18:08:51 +03:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (shadow_c.file != BAD_FILE) {
|
|
|
|
|
|
bld.MOV(sources[length], shadow_c);
|
|
|
|
|
|
length++;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
bool coordinate_done = false;
|
|
|
|
|
|
|
|
|
|
|
|
/* Set up the LOD info */
|
|
|
|
|
|
switch (op) {
|
|
|
|
|
|
case FS_OPCODE_TXB:
|
|
|
|
|
|
case SHADER_OPCODE_TXL:
|
2016-05-04 15:37:02 -07:00
|
|
|
|
if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
|
|
|
|
|
|
op = SHADER_OPCODE_TXL_LZ;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2015-07-13 18:08:51 +03:00
|
|
|
|
bld.MOV(sources[length], lod);
|
|
|
|
|
|
length++;
|
|
|
|
|
|
break;
|
|
|
|
|
|
case SHADER_OPCODE_TXD:
|
|
|
|
|
|
/* TXD should have been lowered in SIMD16 mode. */
|
|
|
|
|
|
assert(bld.dispatch_width() == 8);
|
|
|
|
|
|
|
|
|
|
|
|
/* Load dPdx and the coordinate together:
|
|
|
|
|
|
* [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
|
|
|
|
|
|
*/
|
|
|
|
|
|
for (unsigned i = 0; i < coord_components; i++) {
|
2016-04-30 21:54:47 -07:00
|
|
|
|
bld.MOV(sources[length++], offset(coordinate, bld, i));
|
2015-07-13 18:08:51 +03:00
|
|
|
|
|
|
|
|
|
|
/* For cube map array, the coordinate is (u,v,r,ai) but there are
|
|
|
|
|
|
* only derivatives for (u, v, r).
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (i < grad_components) {
|
2016-04-30 21:54:47 -07:00
|
|
|
|
bld.MOV(sources[length++], offset(lod, bld, i));
|
|
|
|
|
|
bld.MOV(sources[length++], offset(lod2, bld, i));
|
2015-07-13 18:08:51 +03:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
coordinate_done = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
case SHADER_OPCODE_TXS:
|
|
|
|
|
|
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
|
|
|
|
|
|
length++;
|
|
|
|
|
|
break;
|
2019-02-21 09:59:35 -06:00
|
|
|
|
case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
|
2018-10-31 09:52:33 -05:00
|
|
|
|
/* We need an LOD; just use 0 */
|
|
|
|
|
|
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
|
|
|
|
|
|
length++;
|
|
|
|
|
|
break;
|
2015-07-13 18:08:51 +03:00
|
|
|
|
case SHADER_OPCODE_TXF:
|
|
|
|
|
|
/* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
|
|
|
|
|
|
* On Gen9 they are u, v, lod, r
|
|
|
|
|
|
*/
|
2016-04-30 21:54:47 -07:00
|
|
|
|
bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);
|
2015-07-13 18:08:51 +03:00
|
|
|
|
|
|
|
|
|
|
if (devinfo->gen >= 9) {
|
|
|
|
|
|
if (coord_components >= 2) {
|
2016-04-30 21:54:47 -07:00
|
|
|
|
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
|
|
|
|
|
|
offset(coordinate, bld, 1));
|
2016-06-06 19:15:39 -07:00
|
|
|
|
} else {
|
|
|
|
|
|
sources[length] = brw_imm_d(0);
|
2015-07-13 18:08:51 +03:00
|
|
|
|
}
|
|
|
|
|
|
length++;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-04 15:37:02 -07:00
|
|
|
|
if (devinfo->gen >= 9 && lod.is_zero()) {
|
|
|
|
|
|
op = SHADER_OPCODE_TXF_LZ;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
|
|
|
|
|
|
length++;
|
|
|
|
|
|
}
|
2015-07-13 18:08:51 +03:00
|
|
|
|
|
2016-04-30 21:54:47 -07:00
|
|
|
|
for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++)
|
|
|
|
|
|
bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
|
|
|
|
|
|
offset(coordinate, bld, i));
|
2015-07-13 18:08:51 +03:00
|
|
|
|
|
|
|
|
|
|
coordinate_done = true;
|
|
|
|
|
|
break;
|
2016-05-04 15:46:45 -07:00
|
|
|
|
|
2015-07-13 18:08:51 +03:00
|
|
|
|
case SHADER_OPCODE_TXF_CMS:
|
2015-09-08 15:52:09 +01:00
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W:
|
2015-07-17 18:50:27 +03:00
|
|
|
|
case SHADER_OPCODE_TXF_UMS:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_MCS:
|
2015-09-08 15:52:09 +01:00
|
|
|
|
if (op == SHADER_OPCODE_TXF_UMS ||
|
|
|
|
|
|
op == SHADER_OPCODE_TXF_CMS ||
|
|
|
|
|
|
op == SHADER_OPCODE_TXF_CMS_W) {
|
2015-07-17 18:50:27 +03:00
|
|
|
|
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
|
|
|
|
|
|
length++;
|
|
|
|
|
|
}
|
2015-07-13 18:08:51 +03:00
|
|
|
|
|
2015-09-08 15:52:09 +01:00
|
|
|
|
if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
|
2015-07-17 18:50:27 +03:00
|
|
|
|
/* Data from the multisample control surface. */
|
|
|
|
|
|
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
|
|
|
|
|
|
length++;
|
2015-09-08 15:52:09 +01:00
|
|
|
|
|
|
|
|
|
|
/* On Gen9+ we'll use ld2dms_w instead which has two registers for
|
|
|
|
|
|
* the MCS data.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (op == SHADER_OPCODE_TXF_CMS_W) {
|
|
|
|
|
|
bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
|
|
|
|
|
|
mcs.file == IMM ?
|
|
|
|
|
|
mcs :
|
|
|
|
|
|
offset(mcs, bld, 1));
|
|
|
|
|
|
length++;
|
|
|
|
|
|
}
|
2015-07-17 18:50:27 +03:00
|
|
|
|
}
|
2015-07-13 18:08:51 +03:00
|
|
|
|
|
|
|
|
|
|
      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
                 offset(coordinate, bld, i));

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) /* u, v */
         bld.MOV(sources[length++], offset(coordinate, bld, i));

      for (unsigned i = 0; i < 2; i++) /* offu, offv */
         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
                 offset(tg4_offset, bld, i));

      if (coord_components == 3) /* r if present */
         bld.MOV(sources[length++], offset(coordinate, bld, 2));

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++)
         bld.MOV(sources[length++], offset(coordinate, bld, i));
   }

   if (min_lod.file != BAD_FILE) {
      /* Account for all of the missing coordinate sources */
      length += 4 - coord_components;
      if (op == SHADER_OPCODE_TXD)
         length += (3 - grad_components) * 2;

      bld.MOV(sources[length++], min_lod);
   }

   unsigned mlen;
   if (reg_width == 2)
      mlen = length * reg_width - header_size;
   else
      mlen = length * reg_width;

   const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
                                     BRW_REGISTER_TYPE_F);
   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);

   /* Generate the SEND. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->header_size = header_size;

   const unsigned msg_type =
      sampler_msg_type(devinfo, op, inst->shadow_compare);
   const unsigned simd_mode =
      inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                             BRW_SAMPLER_SIMD_MODE_SIMD16;

   uint32_t base_binding_table_index;
   switch (op) {
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
      base_binding_table_index = prog_data->binding_table.gather_texture_start;
      break;
   case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
      base_binding_table_index = prog_data->binding_table.image_start;
      break;
   default:
      base_binding_table_index = prog_data->binding_table.texture_start;
      break;
   }

   inst->sfid = BRW_SFID_SAMPLER;
   if (surface.file == IMM &&
       (sampler.file == IMM || sampler_handle.file != BAD_FILE)) {
      inst->desc = brw_sampler_desc(devinfo,
                                    surface.ud + base_binding_table_index,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gen7+ */);
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      assert(devinfo->gen >= 9);
      inst->desc = brw_sampler_desc(devinfo,
                                    GEN9_BTI_BINDLESS,
                                    sampler.file == IMM ? sampler.ud % 16 : 0,
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gen7+ */);

      /* For bindless samplers, the entire address is included in the message
       * header so we can leave the portion in the message descriptor 0.
       */
      if (sampler_handle.file != BAD_FILE || sampler.file == IMM) {
         inst->src[0] = brw_imm_ud(0);
      } else {
         const fs_builder ubld = bld.group(1, 0).exec_all();
         fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.SHL(desc, sampler, brw_imm_ud(8));
         inst->src[0] = desc;
      }

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
   } else {
      /* Immediate portion of the descriptor */
      inst->desc = brw_sampler_desc(devinfo,
                                    0, /* surface */
                                    0, /* sampler */
                                    msg_type,
                                    simd_mode,
                                    0 /* return_format unused on gen7+ */);
      const fs_builder ubld = bld.group(1, 0).exec_all();
      fs_reg desc = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      if (surface.equals(sampler)) {
         /* This case is common in GL */
         ubld.MUL(desc, surface, brw_imm_ud(0x101));
      } else {
         if (sampler_handle.file != BAD_FILE) {
            ubld.MOV(desc, surface);
         } else if (sampler.file == IMM) {
            ubld.OR(desc, surface, brw_imm_ud(sampler.ud << 8));
         } else {
            ubld.SHL(desc, sampler, brw_imm_ud(8));
            ubld.OR(desc, desc, surface);
         }
      }
      if (base_binding_table_index)
         ubld.ADD(desc, desc, brw_imm_ud(base_binding_table_index));
      ubld.AND(desc, desc, brw_imm_ud(0xfff));

      inst->src[0] = component(desc, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }

   inst->src[2] = src_payload;
   inst->resize_sources(3);

   if (inst->eot) {
      /* EOT sampler messages don't make sense to split because it would
       * involve ending half of the thread early.
       */
      assert(inst->group == 0);
      /* We need to use SENDC for EOT sampler messages */
      inst->check_tdr = true;
      inst->send_has_side_effects = true;
   }

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}
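
/* Lower one of the SHADER_OPCODE_*_LOGICAL sampler opcodes, which carry the
 * texturing arguments as separate sources (coordinate, shadow comparator,
 * LOD/LOD2, min LOD, sample index, MCS, surface/sampler indices and bindless
 * handles, gather offset, plus the coordinate and gradient component counts),
 * by dispatching to the generation-specific implementation.
 */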
static void
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
{
   const gen_device_info *devinfo = bld.shader->devinfo;
   const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
   const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
   const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
   const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
   const fs_reg &min_lod = inst->src[TEX_LOGICAL_SRC_MIN_LOD];
   const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
   const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
   const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
   const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
   const fs_reg &surface_handle = inst->src[TEX_LOGICAL_SRC_SURFACE_HANDLE];
   const fs_reg &sampler_handle = inst->src[TEX_LOGICAL_SRC_SAMPLER_HANDLE];
   const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;

   if (devinfo->gen >= 7) {
      lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, min_lod,
                                      sample_index,
                                      mcs, surface, sampler,
                                      surface_handle, sampler_handle,
                                      tg4_offset,
                                      coord_components, grad_components);
   } else if (devinfo->gen >= 5) {
      lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, sample_index,
                                      surface, sampler,
                                      coord_components, grad_components);
   } else {
      lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2,
                                      surface, sampler,
                                      coord_components, grad_components);
   }
}

/**
 * Initialize the header present in some typed and untyped surface
 * messages.
 */
static fs_reg
emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
{
   fs_builder ubld = bld.exec_all().group(8, 0);
   const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   ubld.MOV(dst, brw_imm_d(0));
   ubld.group(1, 0).MOV(component(dst, 7), sample_mask);
   return dst;
}
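
/* Lower a logical surface access (typed/untyped surface read, write or
 * atomic, and byte scattered read/write) into a SHADER_OPCODE_SEND: build
 * the payload (a header plus address and data components on Gen8 and
 * earlier, split SENDS payloads on Gen9+), select the SFID and message
 * descriptor for the opcode, and predicate on the sample mask when no
 * header is available to carry it.
 */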
static void
lower_surface_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const gen_device_info *devinfo = bld.shader->devinfo;

   /* Get the logical send arguments. */
   const fs_reg &addr = inst->src[SURFACE_LOGICAL_SRC_ADDRESS];
   const fs_reg &src = inst->src[SURFACE_LOGICAL_SRC_DATA];
   const fs_reg &surface = inst->src[SURFACE_LOGICAL_SRC_SURFACE];
   const fs_reg &surface_handle = inst->src[SURFACE_LOGICAL_SRC_SURFACE_HANDLE];
   const UNUSED fs_reg &dims = inst->src[SURFACE_LOGICAL_SRC_IMM_DIMS];
   const fs_reg &arg = inst->src[SURFACE_LOGICAL_SRC_IMM_ARG];
   assert(arg.file == IMM);

   /* We must have exactly one of surface and surface_handle */
   assert((surface.file == BAD_FILE) != (surface_handle.file == BAD_FILE));

   /* Calculate the total number of components of the payload. */
   const unsigned addr_sz = inst->components_read(SURFACE_LOGICAL_SRC_ADDRESS);
   const unsigned src_sz = inst->components_read(SURFACE_LOGICAL_SRC_DATA);

   const bool is_typed_access =
      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL ||
      inst->opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL ||
      inst->opcode == SHADER_OPCODE_TYPED_ATOMIC_LOGICAL;

   /* From the BDW PRM Volume 7, page 147:
    *
    *  "For the Data Cache Data Port*, the header must be present for the
    *   following message types: [...] Typed read/write/atomics"
    *
    * Earlier generations have a similar wording.  Because of this restriction
    * we don't attempt to implement sample masks via predication for such
    * messages prior to Gen9, since we have to provide a header anyway.  On
    * Gen11+ the header has been removed so we can only use predication.
    */
   const unsigned header_sz = devinfo->gen < 9 && is_typed_access ? 1 : 0;

   const bool has_side_effects = inst->has_side_effects();
   fs_reg sample_mask = has_side_effects ? bld.sample_mask_reg() :
                                           fs_reg(brw_imm_d(0xffff));

   fs_reg payload, payload2;
   unsigned mlen, ex_mlen = 0;
   if (devinfo->gen >= 9) {
      /* We have split sends on gen9 and above */
      assert(header_sz == 0);
      payload = bld.move_to_vgrf(addr, addr_sz);
      payload2 = bld.move_to_vgrf(src, src_sz);
      mlen = addr_sz * (inst->exec_size / 8);
      ex_mlen = src_sz * (inst->exec_size / 8);
   } else {
      /* Allocate space for the payload. */
      const unsigned sz = header_sz + addr_sz + src_sz;
      payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
      fs_reg *const components = new fs_reg[sz];
      unsigned n = 0;

      /* Construct the payload. */
      if (header_sz)
         components[n++] = emit_surface_header(bld, sample_mask);

      for (unsigned i = 0; i < addr_sz; i++)
         components[n++] = offset(addr, bld, i);

      for (unsigned i = 0; i < src_sz; i++)
         components[n++] = offset(src, bld, i);

      bld.LOAD_PAYLOAD(payload, components, sz, header_sz);
      mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;

      delete[] components;
   }

   /* Predicate the instruction on the sample mask if no header is
    * provided.
    */
   if (!header_sz && sample_mask.file != BAD_FILE &&
       sample_mask.file != IMM) {
      const fs_builder ubld = bld.group(1, 0).exec_all();
      if (inst->predicate) {
         assert(inst->predicate == BRW_PREDICATE_NORMAL);
         assert(!inst->predicate_inverse);
         assert(inst->flag_subreg < 2);
         /* Combine the sample mask with the existing predicate by using a
          * vertical predication mode.
          */
         inst->predicate = BRW_PREDICATE_ALIGN1_ALLV;
         ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg + 2),
                         sample_mask.type),
                  sample_mask);
      } else {
         inst->flag_subreg = 2;
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst->predicate_inverse = false;
         ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type),
                  sample_mask);
      }
   }

   uint32_t sfid;
   switch (inst->opcode) {
   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      /* Byte scattered opcodes go through the normal data cache */
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      /* Untyped Surface messages go through the data cache but the SFID value
       * changed on Haswell.
       */
      sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
              HSW_SFID_DATAPORT_DATA_CACHE_1 :
              GEN7_SFID_DATAPORT_DATA_CACHE);
      break;

   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      /* Typed surface messages go through the render cache on IVB and the
       * data cache on HSW+.
       */
      sfid = (devinfo->gen >= 8 || devinfo->is_haswell ?
              HSW_SFID_DATAPORT_DATA_CACHE_1 :
              GEN6_SFID_DATAPORT_RENDER_CACHE);
      break;

   default:
      unreachable("Unsupported surface opcode");
   }

   uint32_t desc;
   switch (inst->opcode) {
   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            false   /* write */);
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      desc = brw_dp_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                            arg.ud, /* num_channels */
                                            true    /* write */);
      break;

   case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           false   /* write */);
      break;

   case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = brw_dp_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                           arg.ud, /* bit_size */
                                           true    /* write */);
      break;

   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      desc = brw_dp_untyped_atomic_desc(devinfo, inst->exec_size,
                                        arg.ud, /* atomic_op */
                                        !inst->dst.is_null());
      break;

   case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      desc = brw_dp_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                              arg.ud, /* atomic_op */
                                              !inst->dst.is_null());
      break;

   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          false   /* write */);
      break;

   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      desc = brw_dp_typed_surface_rw_desc(devinfo, inst->exec_size, inst->group,
                                          arg.ud, /* num_channels */
                                          true    /* write */);
      break;

   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
      desc = brw_dp_typed_atomic_desc(devinfo, inst->exec_size, inst->group,
                                      arg.ud, /* atomic_op */
                                      !inst->dst.is_null());
      break;

   default:
      unreachable("Unknown surface logical instruction");
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = header_sz;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = sfid;
   inst->desc = desc;
   if (surface.file == IMM) {
      inst->desc |= surface.ud & 0xff;
      inst->src[0] = brw_imm_ud(0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   } else if (surface_handle.file != BAD_FILE) {
      /* Bindless surface */
      assert(devinfo->gen >= 9);
      inst->desc |= GEN9_BTI_BINDLESS;
      inst->src[0] = brw_imm_ud(0);

      /* We assume that the driver provided the handle in the top 20 bits so
       * we can use the surface handle directly as the extended descriptor.
       */
      inst->src[1] = retype(surface_handle, BRW_REGISTER_TYPE_UD);
   } else {
      const fs_builder ubld = bld.exec_all().group(1, 0);
      fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
      ubld.AND(tmp, surface, brw_imm_ud(0xff));
      inst->src[0] = component(tmp, 0);
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
   }

   /* Finally, the payload */
   inst->src[2] = payload;
   inst->src[3] = payload2;

   inst->resize_sources(4);
}
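
/* Lower an A64 (64-bit stateless address) untyped, byte-scattered or atomic
 * logical opcode into a SHADER_OPCODE_SEND through data cache 1.  In fragment
 * shaders, messages with side effects are predicated on the sample mask so
 * that helper invocations don't perform the access.
 */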
static void
lower_a64_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const gen_device_info *devinfo = bld.shader->devinfo;

   const fs_reg &addr = inst->src[0];
   const fs_reg &src = inst->src[1];
   const unsigned src_comps = inst->components_read(1);
   assert(inst->src[2].file == IMM);
   const unsigned arg = inst->src[2].ud;
   const bool has_side_effects = inst->has_side_effects();

   /* If the surface message has side effects and we're a fragment shader, we
    * have to predicate with the sample mask to avoid helper invocations.
    */
   if (has_side_effects && bld.shader->stage == MESA_SHADER_FRAGMENT) {
      inst->flag_subreg = 2;
      inst->predicate = BRW_PREDICATE_NORMAL;
      inst->predicate_inverse = false;

      fs_reg sample_mask = bld.sample_mask_reg();
      const fs_builder ubld = bld.group(1, 0).exec_all();
      ubld.MOV(retype(brw_flag_subreg(inst->flag_subreg), sample_mask.type),
               sample_mask);
   }

   fs_reg payload, payload2;
   unsigned mlen, ex_mlen = 0;
   if (devinfo->gen >= 9) {
      /* On Skylake and above, we have SENDS */
      mlen = 2 * (inst->exec_size / 8);
      ex_mlen = src_comps * type_sz(src.type) * inst->exec_size / REG_SIZE;
      payload = retype(bld.move_to_vgrf(addr, 1), BRW_REGISTER_TYPE_UD);
      payload2 = retype(bld.move_to_vgrf(src, src_comps),
                        BRW_REGISTER_TYPE_UD);
   } else {
      /* Add two because the address is 64-bit */
      const unsigned dwords = 2 + src_comps;
      mlen = dwords * (inst->exec_size / 8);

      fs_reg sources[5];

      sources[0] = addr;

      for (unsigned i = 0; i < src_comps; i++)
         sources[1 + i] = offset(src, bld, i);

      payload = bld.vgrf(BRW_REGISTER_TYPE_UD, dwords);
      bld.LOAD_PAYLOAD(payload, sources, 1 + src_comps, 0);
   }

   uint32_t desc;
   switch (inst->opcode) {
   case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                false  /* write */);
      break;

   case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      desc = brw_dp_a64_untyped_surface_rw_desc(devinfo, inst->exec_size,
                                                arg,   /* num_channels */
                                                true   /* write */);
      break;

   case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               false  /* write */);
      break;

   case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      desc = brw_dp_a64_byte_scattered_rw_desc(devinfo, inst->exec_size,
                                               arg,   /* bit_size */
                                               true   /* write */);
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 32,
                                            arg,   /* atomic_op */
                                            !inst->dst.is_null());
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
      desc = brw_dp_a64_untyped_atomic_desc(devinfo, inst->exec_size, 64,
                                            arg,   /* atomic_op */
                                            !inst->dst.is_null());
      break;

   case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      desc = brw_dp_a64_untyped_atomic_float_desc(devinfo, inst->exec_size,
                                                  arg,   /* atomic_op */
                                                  !inst->dst.is_null());
      break;

   default:
      unreachable("Unknown A64 logical instruction");
   }

   /* Update the original instruction. */
   inst->opcode = SHADER_OPCODE_SEND;
   inst->mlen = mlen;
   inst->ex_mlen = ex_mlen;
   inst->header_size = 0;
   inst->send_has_side_effects = has_side_effects;
   inst->send_is_volatile = !has_side_effects;

   /* Set up SFID and descriptors */
   inst->sfid = HSW_SFID_DATAPORT_DATA_CACHE_1;
   inst->desc = desc;
   inst->resize_sources(4);
   inst->src[0] = brw_imm_ud(0); /* desc */
   inst->src[1] = brw_imm_ud(0); /* ex_desc */
   inst->src[2] = payload;
   inst->src[3] = payload2;
}
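
/* Lower a varying-index pull constant load.  On Gen7+ this becomes a sampler
 * LD SEND message; on earlier hardware it becomes the
 * FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4 virtual opcode with an MRF
 * payload.
 */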
static void
lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
{
   const gen_device_info *devinfo = bld.shader->devinfo;

   if (devinfo->gen >= 7) {
      fs_reg index = inst->src[0];
      /* We are switching the instruction from an ALU-like instruction to a
       * send-from-grf instruction.  Since sends can't handle strides or
       * source modifiers, we have to make a copy of the offset source.
       */
      fs_reg offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
      bld.MOV(offset, inst->src[1]);

      const unsigned simd_mode =
         inst->exec_size <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 :
                                BRW_SAMPLER_SIMD_MODE_SIMD16;

      inst->opcode = SHADER_OPCODE_SEND;
      inst->mlen = inst->exec_size / 8;
      inst->resize_sources(3);

      inst->sfid = BRW_SFID_SAMPLER;
      inst->desc = brw_sampler_desc(devinfo, 0, 0,
                                    GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                                    simd_mode, 0);
      if (index.file == IMM) {
         inst->desc |= index.ud & 0xff;
         inst->src[0] = brw_imm_ud(0);
      } else {
         const fs_builder ubld = bld.exec_all().group(1, 0);
         fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD);
         ubld.AND(tmp, index, brw_imm_ud(0xff));
         inst->src[0] = component(tmp, 0);
      }
      inst->src[1] = brw_imm_ud(0); /* ex_desc */
      inst->src[2] = offset; /* payload */
   } else {
      const fs_reg payload(MRF, FIRST_PULL_LOAD_MRF(devinfo->gen),
                           BRW_REGISTER_TYPE_UD);

      bld.MOV(byte_offset(payload, REG_SIZE), inst->src[1]);

      inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4;
      inst->resize_sources(1);
      inst->base_mrf = payload.nr;
      inst->header_size = 1;
      inst->mlen = 1 + inst->exec_size / 8;
   }
}
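
/* Lower a Gen6-style logical math instruction into the Gen4/5 send-like form
 * with an MRF payload.  For the two-source INT DIV case the operands are
 * swapped to match the message layout quoted from the Ironlake PRM below.
 */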
static void
lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
{
   assert(bld.shader->devinfo->gen < 6);

   inst->base_mrf = 2;
   inst->mlen = inst->sources * inst->exec_size / 8;

   if (inst->sources > 1) {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
      const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
      const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];

      inst->resize_sources(1);
      inst->src[0] = src0;

      assert(inst->exec_size == 8);
      bld.MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type), src1);
   }
}
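
/**
 * Lower all logical send-like opcodes (*_LOGICAL) in the program into the
 * SEND instructions or legacy virtual opcodes understood by the rest of the
 * backend.  Returns true if any instruction was changed.
 */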
bool
fs_visitor::lower_logical_sends()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const fs_builder ibld(this, block, inst);

      switch (inst->opcode) {
      case FS_OPCODE_FB_WRITE_LOGICAL:
         assert(stage == MESA_SHADER_FRAGMENT);
         lower_fb_write_logical_send(ibld, inst,
                                     brw_wm_prog_data(prog_data),
                                     (const brw_wm_prog_key *)key,
                                     payload);
         break;

      case FS_OPCODE_FB_READ_LOGICAL:
         lower_fb_read_logical_send(ibld, inst);
         break;
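
      /* All of the sampler-like logical opcodes that follow share the same
       * lowering; the opcode passed to lower_sampler_logical_send() selects
       * the hardware sampler message.
       */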
      case SHADER_OPCODE_TEX_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
         break;

      case SHADER_OPCODE_TXD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
         break;

      case SHADER_OPCODE_TXF_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
         break;

      case SHADER_OPCODE_TXL_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
         break;

      case SHADER_OPCODE_TXS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
         break;

      case SHADER_OPCODE_IMAGE_SIZE_LOGICAL:
         lower_sampler_logical_send(ibld, inst,
                                    SHADER_OPCODE_IMAGE_SIZE_LOGICAL);
         break;

      case FS_OPCODE_TXB_LOGICAL:
         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
         break;

      case SHADER_OPCODE_TXF_CMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
         break;

      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
         break;

      case SHADER_OPCODE_TXF_UMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
         break;

      case SHADER_OPCODE_TXF_MCS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
         break;

      case SHADER_OPCODE_LOD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
         break;

      case SHADER_OPCODE_TG4_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
         break;

      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
         break;

      case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
         lower_surface_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
      case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
      case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL:
         lower_a64_logical_send(ibld, inst);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
         lower_varying_pull_constant_logical_send(ibld, inst);
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
      case SHADER_OPCODE_POW:
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
         /* The math opcodes are overloaded for the send-like and
          * expression-like instructions which seems kind of icky.  Gen6+ has
          * a native (but rather quirky) MATH instruction so we don't need to
          * do anything here.  On Gen4-5 we'll have to lower the Gen6-like
          * logical instructions (which we can easily recognize because they
          * have mlen = 0) into send-like virtual instructions.
          */
         if (devinfo->gen < 6 && inst->mlen == 0) {
            lower_math_logical_send(ibld, inst);
            break;

         } else {
            continue;
         }

      default:
         continue;
      }

      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
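
/* The two helpers below are used by get_fpu_lowered_simd_width() to detect
 * mixed-mode float (F/HF) instructions, which are subject to extra SIMD
 * width restrictions on SKL+ (see the SKL PRM quotes there).
 */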
static bool
is_mixed_float_with_fp32_dst(const fs_inst *inst)
{
   /* This opcode sometimes uses :W type on the source even if the operand is
    * a :HF, because in gen7 there is no support for :HF, and thus it uses :W.
    */
   if (inst->opcode == BRW_OPCODE_F16TO32)
      return true;

   if (inst->dst.type != BRW_REGISTER_TYPE_F)
      return false;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].type == BRW_REGISTER_TYPE_HF)
         return true;
   }

   return false;
}

static bool
is_mixed_float_with_packed_fp16_dst(const fs_inst *inst)
{
   /* This opcode sometimes uses :W type on the destination even if the
    * destination is a :HF, because in gen7 there is no support for :HF, and
    * thus it uses :W.
    */
   if (inst->opcode == BRW_OPCODE_F32TO16 &&
       inst->dst.stride == 1)
      return true;

   if (inst->dst.type != BRW_REGISTER_TYPE_HF ||
       inst->dst.stride != 1)
      return false;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].type == BRW_REGISTER_TYPE_F)
         return true;
   }

   return false;
}

/**
 * Get the closest allowed SIMD width for instruction \p inst accounting for
 * some common regioning and execution control restrictions that apply to FPU
 * instructions.  These restrictions don't necessarily have any relevance to
 * instructions not executed by the FPU pipeline like extended math, control
 * flow or send message instructions.
 *
 * For virtual opcodes it's really up to the instruction -- In some cases
 * (e.g. where a virtual instruction unrolls into a simple sequence of FPU
 * instructions) it may simplify virtual instruction lowering if we can
 * enforce FPU-like regioning restrictions already on the virtual instruction,
 * in other cases (e.g. virtual send-like instructions) this may be
 * excessively restrictive.
 */
static unsigned
get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
                           const fs_inst *inst)
{
   /* Maximum execution size representable in the instruction controls. */
   unsigned max_width = MIN2(32, inst->exec_size);

   /* According to the PRMs:
    *  "A. In Direct Addressing mode, a source cannot span more than 2
    *      adjacent GRF registers.
    *   B. A destination cannot span more than 2 adjacent GRF registers."
    *
    * Look for the source or destination with the largest register region
    * which is the one that is going to limit the overall execution size of
    * the instruction due to this rule.
    */
   unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);

   for (unsigned i = 0; i < inst->sources; i++)
      reg_count = MAX2(reg_count, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));

   /* Calculate the maximum execution size of the instruction based on the
    * factor by which it goes over the hardware limit of 2 GRFs.
    */
   if (reg_count > 2)
      max_width = MIN2(max_width, inst->exec_size / DIV_ROUND_UP(reg_count, 2));

   /* According to the IVB PRMs:
    *  "When destination spans two registers, the source MUST span two
    *   registers. The exception to the above rule:
    *
    *    - When source is scalar, the source registers are not incremented.
    *    - When source is packed integer Word and destination is packed
    *      integer DWord, the source register is not incremented but the
    *      source sub register is incremented."
    *
    * The hardware specs from Gen4 to Gen7.5 mention similar regioning
    * restrictions.  The code below intentionally doesn't check whether the
    * destination type is integer because empirically the hardware doesn't
    * seem to care what the actual type is as long as it's dword-aligned.
    */
   if (devinfo->gen < 8) {
      for (unsigned i = 0; i < inst->sources; i++) {
         /* IVB implements DF scalars as <0;2,1> regions. */
         const bool is_scalar_exception = is_uniform(inst->src[i]) &&
            (devinfo->is_haswell || type_sz(inst->src[i].type) != 8);
         const bool is_packed_word_exception =
            type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
            type_sz(inst->src[i].type) == 2 && inst->src[i].stride == 1;

         /* We check size_read(i) against size_written instead of REG_SIZE
          * because we want to properly handle SIMD32.  In SIMD32, you can end
          * up with writes to 4 registers and a source that reads 2 registers
          * and we may still need to lower all the way to SIMD8 in that case.
          */
         if (inst->size_written > REG_SIZE &&
             inst->size_read(i) != 0 &&
             inst->size_read(i) < inst->size_written &&
             !is_scalar_exception && !is_packed_word_exception) {
            const unsigned reg_count = DIV_ROUND_UP(inst->size_written, REG_SIZE);
            max_width = MIN2(max_width, inst->exec_size / reg_count);
         }
      }
   }

   if (devinfo->gen < 6) {
      /* From the G45 PRM, Volume 4 Page 361:
       *
       *  "Operand Alignment Rule: With the exceptions listed below, a
       *   source/destination operand in general should be aligned to even
       *   256-bit physical register with a region size equal to two 256-bit
       *   physical registers."
       *
       * Normally we enforce this by allocating virtual registers to the
       * even-aligned class.  But we need to handle payload registers.
       */
      for (unsigned i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == FIXED_GRF && (inst->src[i].nr & 1) &&
             inst->size_read(i) > REG_SIZE) {
            max_width = MIN2(max_width, 8);
         }
      }
   }

   /* From the IVB PRMs:
    *  "When an instruction is SIMD32, the low 16 bits of the execution mask
    *   are applied for both halves of the SIMD32 instruction. If different
    *   execution mask channels are required, split the instruction into two
    *   SIMD16 instructions."
    *
    * There is similar text in the HSW PRMs.  Gen4-6 don't even implement
    * 32-wide control flow support in hardware and will behave similarly.
    */
   if (devinfo->gen < 8 && !inst->force_writemask_all)
      max_width = MIN2(max_width, 16);

   /* From the IVB PRMs (applies to HSW too):
    *  "Instructions with condition modifiers must not use SIMD32."
    *
    * From the BDW PRMs (applies to later hardware too):
    *  "Ternary instruction with condition modifiers must not use SIMD32."
    */
   if (inst->conditional_mod && (devinfo->gen < 8 || inst->is_3src(devinfo)))
      max_width = MIN2(max_width, 16);

   /* From the IVB PRMs (applies to other devices that don't have the
    * gen_device_info::supports_simd16_3src flag set):
    *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
    *   SIMD8 is not allowed for DF operations."
    */
   if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src)
      max_width = MIN2(max_width, inst->exec_size / reg_count);

   /* Pre-Gen8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
    * the 8-bit quarter of the execution mask signals specified in the
    * instruction control fields) for the second compressed half of any
    * single-precision instruction (for double-precision instructions
    * it's hardwired to use NibCtrl+1, at least on HSW), which means that
    * the EU will apply the wrong execution controls for the second
    * sequential GRF write if the number of channels per GRF is not exactly
    * eight in single-precision mode (or four in double-float mode).
    *
    * In this situation we calculate the maximum size of the split
    * instructions so they only ever write to a single register.
    */
   if (devinfo->gen < 8 && inst->size_written > REG_SIZE &&
       !inst->force_writemask_all) {
      const unsigned channels_per_grf = inst->exec_size /
         DIV_ROUND_UP(inst->size_written, REG_SIZE);
      const unsigned exec_type_size = get_exec_type_size(inst);
      assert(exec_type_size);

      /* The hardware shifts exactly 8 channels per compressed half of the
       * instruction in single-precision mode and exactly 4 in double-precision.
       */
      if (channels_per_grf != (exec_type_size == 8 ? 4 : 8))
         max_width = MIN2(max_width, channels_per_grf);

      /* Lower all non-force_writemask_all DF instructions to SIMD4 on IVB/BYT
       * because HW applies the same channel enable signals to both halves of
       * the compressed instruction which will be just wrong under
       * non-uniform control flow.
       */
      if (devinfo->gen == 7 && !devinfo->is_haswell &&
          (exec_type_size == 8 || type_sz(inst->dst.type) == 8))
         max_width = MIN2(max_width, 4);
   }

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "No SIMD16 in mixed mode when destination is f32. Instruction
    *     execution size must be no more than 8."
    *
    * FIXME: the simulator doesn't seem to complain if we don't do this and
    * empirical testing with existing CTS tests show that they pass just fine
    * without implementing this, however, since our interpretation of the PRM
    * is that conversion MOVs between HF and F are still mixed-float
    * instructions (and therefore subject to this restriction) we decided to
    * split them to be safe. Might be useful to do additional investigation to
    * lift the restriction if we can ensure that it is safe though, since these
    * conversions are common when half-float types are involved since many
    * instructions do not support HF types and conversions from/to F are
    * required.
    */
   if (is_mixed_float_with_fp32_dst(inst))
      max_width = MIN2(max_width, 8);

   /* From the SKL PRM, Special Restrictions for Handling Mixed Mode
    * Float Operations:
    *
    *    "No SIMD16 in mixed mode when destination is packed f16 for both
    *     Align1 and Align16."
    */
   if (is_mixed_float_with_packed_fp16_dst(inst))
      max_width = MIN2(max_width, 8);

   /* Only power-of-two execution sizes are representable in the instruction
    * control fields.
    */
   return 1 << _mesa_logbase2(max_width);
}

/**
 * Get the maximum allowed SIMD width for instruction \p inst accounting for
 * various payload size restrictions that apply to sampler message
 * instructions.
 *
 * This is only intended to provide a maximum theoretical bound for the
 * execution size of the message based on the number of argument components
 * alone, which in most cases will determine whether the SIMD8 or SIMD16
 * variant of the message can be used, though some messages may have
 * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
 * the message length to determine the exact SIMD width and argument count,
 * which makes a number of sampler message combinations impossible to
 * represent).
 */
static unsigned
get_sampler_lowered_simd_width(const struct gen_device_info *devinfo,
                               const fs_inst *inst)
{
   /* If we have a min_lod parameter on anything other than a simple sample
    * message, it will push it over 5 arguments and we have to fall back to
    * SIMD8.
    */
   if (inst->opcode != SHADER_OPCODE_TEX &&
       inst->components_read(TEX_LOGICAL_SRC_MIN_LOD))
      return 8;

   /* Calculate the number of coordinate components that have to be present
    * assuming that additional arguments follow the texel coordinates in the
    * message payload.  On IVB+ there is no need for padding, on ILK-SNB we
    * need to pad to four or three components depending on the message,
    * pre-ILK we need to pad to at most three components.
    */
   const unsigned req_coord_components =
      (devinfo->gen >= 7 ||
       !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
      (devinfo->gen >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
                            inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
      3;

   /* On Gen9+ the LOD argument is for free if we're able to use the LZ
    * variant of the TXL or TXF message.
    */
   const bool implicit_lod = devinfo->gen >= 9 &&
                             (inst->opcode == SHADER_OPCODE_TXL ||
                              inst->opcode == SHADER_OPCODE_TXF) &&
                             inst->src[TEX_LOGICAL_SRC_LOD].is_zero();

   /* Calculate the total number of argument components that need to be passed
    * to the sampler unit.
    */
   const unsigned num_payload_components =
      MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
           req_coord_components) +
      inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
      (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
      inst->components_read(TEX_LOGICAL_SRC_LOD2) +
      inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
      (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
       inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
      inst->components_read(TEX_LOGICAL_SRC_MCS);

   /* SIMD16 messages with more than five arguments exceed the maximum message
    * size supported by the sampler, regardless of whether a header is
    * provided or not.
    */
   return MIN2(inst->exec_size,
               num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
}
2015-07-13 21:15:31 +03:00
|
|
|
|
/**
|
|
|
|
|
|
* Get the closest native SIMD width supported by the hardware for instruction
|
|
|
|
|
|
* \p inst. The instruction will be left untouched by
|
|
|
|
|
|
* fs_visitor::lower_simd_width() if the returned value is equal to the
|
|
|
|
|
|
* original execution size.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static unsigned
|
2016-08-22 15:01:08 -07:00
|
|
|
|
get_lowered_simd_width(const struct gen_device_info *devinfo,
|
2015-07-13 21:15:31 +03:00
|
|
|
|
const fs_inst *inst)
|
|
|
|
|
|
{
|
|
|
|
|
|
switch (inst->opcode) {
|
2015-08-04 19:07:19 +03:00
|
|
|
|
case BRW_OPCODE_MOV:
|
|
|
|
|
|
case BRW_OPCODE_SEL:
|
|
|
|
|
|
case BRW_OPCODE_NOT:
|
|
|
|
|
|
case BRW_OPCODE_AND:
|
|
|
|
|
|
case BRW_OPCODE_OR:
|
|
|
|
|
|
case BRW_OPCODE_XOR:
|
|
|
|
|
|
case BRW_OPCODE_SHR:
|
|
|
|
|
|
case BRW_OPCODE_SHL:
|
|
|
|
|
|
case BRW_OPCODE_ASR:
|
|
|
|
|
|
case BRW_OPCODE_CMPN:
|
|
|
|
|
|
case BRW_OPCODE_CSEL:
|
|
|
|
|
|
case BRW_OPCODE_F32TO16:
|
|
|
|
|
|
case BRW_OPCODE_F16TO32:
|
|
|
|
|
|
case BRW_OPCODE_BFREV:
|
|
|
|
|
|
case BRW_OPCODE_BFE:
|
|
|
|
|
|
case BRW_OPCODE_ADD:
|
|
|
|
|
|
case BRW_OPCODE_MUL:
|
|
|
|
|
|
case BRW_OPCODE_AVG:
|
|
|
|
|
|
case BRW_OPCODE_FRC:
|
|
|
|
|
|
case BRW_OPCODE_RNDU:
|
|
|
|
|
|
case BRW_OPCODE_RNDD:
|
|
|
|
|
|
case BRW_OPCODE_RNDE:
|
|
|
|
|
|
case BRW_OPCODE_RNDZ:
|
|
|
|
|
|
case BRW_OPCODE_LZD:
|
|
|
|
|
|
case BRW_OPCODE_FBH:
|
|
|
|
|
|
case BRW_OPCODE_FBL:
|
|
|
|
|
|
case BRW_OPCODE_CBIT:
|
|
|
|
|
|
case BRW_OPCODE_SAD2:
|
|
|
|
|
|
case BRW_OPCODE_MAD:
|
|
|
|
|
|
case BRW_OPCODE_LRP:
|
2016-05-20 13:15:49 -07:00
|
|
|
|
case FS_OPCODE_PACK:
|
2017-08-31 21:45:30 -07:00
|
|
|
|
case SHADER_OPCODE_SEL_EXEC:
|
|
|
|
|
|
case SHADER_OPCODE_CLUSTER_BROADCAST:
|
2016-05-20 13:15:49 -07:00
|
|
|
|
return get_fpu_lowered_simd_width(devinfo, inst);
|
2016-05-18 01:26:03 -07:00
|
|
|
|
|
2016-05-17 15:58:04 -07:00
|
|
|
|
case BRW_OPCODE_CMP: {
|
|
|
|
|
|
/* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
|
|
|
|
|
|
* when the destination is a GRF the dependency-clear bit on the flag
|
|
|
|
|
|
* register is cleared early.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Suggested workarounds are to disable coissuing CMP instructions
|
|
|
|
|
|
* or to split CMP(16) instructions into two CMP(8) instructions.
|
|
|
|
|
|
*
|
|
|
|
|
|
* We choose to split into CMP(8) instructions since disabling
|
|
|
|
|
|
* coissuing would affect CMP instructions not otherwise affected by
|
|
|
|
|
|
* the errata.
|
|
|
|
|
|
*/
|
|
|
|
|
|
const unsigned max_width = (devinfo->gen == 7 && !devinfo->is_haswell &&
|
|
|
|
|
|
!inst->dst.is_null() ? 8 : ~0);
|
|
|
|
|
|
return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst));
|
|
|
|
|
|
}
|
2016-05-17 16:00:19 -07:00
|
|
|
|
case BRW_OPCODE_BFI1:
|
|
|
|
|
|
case BRW_OPCODE_BFI2:
|
|
|
|
|
|
/* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
|
|
|
|
|
|
* should
|
|
|
|
|
|
* "Force BFI instructions to be executed always in SIMD8."
|
|
|
|
|
|
*/
|
|
|
|
|
|
return MIN2(devinfo->is_haswell ? 8 : ~0u,
|
|
|
|
|
|
get_fpu_lowered_simd_width(devinfo, inst));
|
2016-05-17 15:58:04 -07:00
|
|
|
|
|
2016-05-17 16:01:29 -07:00
|
|
|
|
case BRW_OPCODE_IF:
|
|
|
|
|
|
assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
|
|
|
|
|
|
return inst->exec_size;
|
|
|
|
|
|
|
2016-05-20 13:14:20 -07:00
|
|
|
|
case SHADER_OPCODE_RCP:
|
|
|
|
|
|
case SHADER_OPCODE_RSQ:
|
|
|
|
|
|
case SHADER_OPCODE_SQRT:
|
|
|
|
|
|
case SHADER_OPCODE_EXP2:
|
|
|
|
|
|
case SHADER_OPCODE_LOG2:
|
|
|
|
|
|
case SHADER_OPCODE_SIN:
|
2018-04-26 10:26:22 +02:00
|
|
|
|
case SHADER_OPCODE_COS: {
|
2016-05-20 13:14:20 -07:00
|
|
|
|
/* Unary extended math instructions are limited to SIMD8 on Gen4 and
|
2018-04-26 10:26:22 +02:00
|
|
|
|
* Gen6. Extended Math Function is limited to SIMD8 with half-float.
|
2016-05-20 13:14:20 -07:00
|
|
|
|
*/
|
2018-04-26 10:26:22 +02:00
|
|
|
|
if (devinfo->gen == 6 || (devinfo->gen == 4 && !devinfo->is_g4x))
|
|
|
|
|
|
return MIN2(8, inst->exec_size);
|
|
|
|
|
|
if (inst->dst.type == BRW_REGISTER_TYPE_HF)
|
|
|
|
|
|
return MIN2(8, inst->exec_size);
|
|
|
|
|
|
return MIN2(16, inst->exec_size);
|
|
|
|
|
|
}
|
2016-05-20 13:14:20 -07:00
|
|
|
|
|
2018-04-26 10:26:22 +02:00
|
|
|
|
case SHADER_OPCODE_POW: {
|
|
|
|
|
|
/* SIMD16 is only allowed on Gen7+. Extended Math Function is limited
|
|
|
|
|
|
* to SIMD8 with half-float
* to SIMD8 with half-float.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (devinfo->gen < 7)
|
|
|
|
|
|
return MIN2(8, inst->exec_size);
|
|
|
|
|
|
if (inst->dst.type == BRW_REGISTER_TYPE_HF)
|
|
|
|
|
|
return MIN2(8, inst->exec_size);
|
|
|
|
|
|
return MIN2(16, inst->exec_size);
|
|
|
|
|
|
}
|
2016-05-20 13:14:20 -07:00
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_INT_QUOTIENT:
|
|
|
|
|
|
case SHADER_OPCODE_INT_REMAINDER:
|
|
|
|
|
|
/* Integer division is limited to SIMD8 on all generations. */
|
|
|
|
|
|
return MIN2(8, inst->exec_size);
|
|
|
|
|
|
|
2016-05-20 13:34:46 -07:00
|
|
|
|
case FS_OPCODE_LINTERP:
|
2017-12-10 17:03:32 -08:00
|
|
|
|
case SHADER_OPCODE_GET_BUFFER_SIZE:
|
2016-05-20 13:34:46 -07:00
|
|
|
|
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
|
|
|
|
|
|
case FS_OPCODE_PACK_HALF_2x16_SPLIT:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
|
|
|
|
|
|
return MIN2(16, inst->exec_size);
|
|
|
|
|
|
|
2016-05-18 01:26:03 -07:00
|
|
|
|
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
|
|
|
|
|
|
/* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
|
|
|
|
|
|
* message used to implement varying pull constant loads, so expand it
|
|
|
|
|
|
* to SIMD16. An alternative with longer message payload length but
|
|
|
|
|
|
* shorter return payload would be to use the SIMD8 sampler message that
|
|
|
|
|
|
* takes (header, u, v, r) as parameters instead of (header, u).
|
|
|
|
|
|
*/
|
|
|
|
|
|
return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size));
|
|
|
|
|
|
|
2019-07-25 18:28:44 -05:00
|
|
|
|
case FS_OPCODE_DDX_COARSE:
|
|
|
|
|
|
case FS_OPCODE_DDX_FINE:
|
|
|
|
|
|
case FS_OPCODE_DDY_COARSE:
|
2016-05-17 16:27:09 -07:00
|
|
|
|
case FS_OPCODE_DDY_FINE:
|
|
|
|
|
|
/* The implementation of this virtual opcode may require emitting
|
|
|
|
|
|
* compressed Align16 instructions, which are severely limited on some
|
|
|
|
|
|
* generations.
|
|
|
|
|
|
*
|
|
|
|
|
|
* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
|
|
|
|
|
|
* Region Restrictions):
|
|
|
|
|
|
*
|
|
|
|
|
|
* "In Align16 access mode, SIMD16 is not allowed for DW operations
|
|
|
|
|
|
* and SIMD8 is not allowed for DF operations."
|
|
|
|
|
|
*
|
|
|
|
|
|
* In this context, "DW operations" means "operations acting on 32-bit
|
|
|
|
|
|
* values", so it includes operations on floats.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Gen4 has a similar restriction. From the i965 PRM, section 11.5.3
|
|
|
|
|
|
* (Instruction Compression -> Rules and Restrictions):
|
|
|
|
|
|
*
|
|
|
|
|
|
* "A compressed instruction must be in Align1 access mode. Align16
|
|
|
|
|
|
* mode instructions cannot be compressed."
|
|
|
|
|
|
*
|
|
|
|
|
|
* Similar text exists in the g45 PRM.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Empirically, compressed align16 instructions using odd register
|
|
|
|
|
|
* numbers don't appear to work on Sandybridge either.
|
|
|
|
|
|
*/
|
|
|
|
|
|
return (devinfo->gen == 4 || devinfo->gen == 6 ||
|
|
|
|
|
|
(devinfo->gen == 7 && !devinfo->is_haswell) ?
|
|
|
|
|
|
MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
|
|
|
|
|
|
|
2015-08-06 14:04:00 +03:00
|
|
|
|
case SHADER_OPCODE_MULH:
|
|
|
|
|
|
/* MULH is lowered to the MUL/MACH sequence using the accumulator, which
|
|
|
|
|
|
* is 8-wide on Gen7+.
|
|
|
|
|
|
*/
|
2016-05-17 16:43:05 -07:00
|
|
|
|
return (devinfo->gen >= 7 ? 8 :
|
|
|
|
|
|
get_fpu_lowered_simd_width(devinfo, inst));
|
2015-08-06 14:04:00 +03:00
|
|
|
|
|
2015-07-13 21:19:28 +03:00
|
|
|
|
case FS_OPCODE_FB_WRITE_LOGICAL:
|
|
|
|
|
|
/* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
|
|
|
|
|
|
* here.
|
|
|
|
|
|
*/
|
2015-10-20 14:29:37 -07:00
|
|
|
|
assert(devinfo->gen != 6 ||
|
|
|
|
|
|
inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
|
2015-07-13 21:19:28 +03:00
|
|
|
|
inst->exec_size == 8);
|
|
|
|
|
|
/* Dual-source FB writes are unsupported in SIMD16 mode. */
|
2015-10-20 14:29:37 -07:00
|
|
|
|
return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
|
2016-05-20 13:34:46 -07:00
|
|
|
|
8 : MIN2(16, inst->exec_size));
|
2015-07-13 21:19:28 +03:00
|
|
|
|
|
2016-07-21 16:55:45 -07:00
|
|
|
|
case FS_OPCODE_FB_READ_LOGICAL:
|
|
|
|
|
|
return MIN2(16, inst->exec_size);
|
|
|
|
|
|
|
2016-05-20 13:34:46 -07:00
|
|
|
|
case SHADER_OPCODE_TEX_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_CMS_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_UMS_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXF_MCS_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_LOD_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TG4_LOGICAL:
|
2016-05-20 00:37:37 -07:00
|
|
|
|
case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
|
2016-08-12 14:05:19 -07:00
|
|
|
|
case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
|
|
|
|
|
|
return get_sampler_lowered_simd_width(devinfo, inst);
|
2016-05-20 00:37:37 -07:00
|
|
|
|
|
2015-07-13 21:19:52 +03:00
|
|
|
|
case SHADER_OPCODE_TXD_LOGICAL:
|
|
|
|
|
|
/* TXD is unsupported in SIMD16 mode. */
|
|
|
|
|
|
return 8;
|
|
|
|
|
|
|
|
|
|
|
|
case SHADER_OPCODE_TXL_LOGICAL:
|
2016-08-12 14:05:19 -07:00
|
|
|
|
case FS_OPCODE_TXB_LOGICAL:
|
|
|
|
|
|
/* Only one execution size is representable pre-ILK depending on whether
|
|
|
|
|
|
* the shadow reference argument is present.
|
2015-07-13 21:19:52 +03:00
|
|
|
|
*/
|
2016-08-12 14:05:19 -07:00
|
|
|
|
if (devinfo->gen == 4)
|
|
|
|
|
|
return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
|
2015-07-13 21:19:52 +03:00
|
|
|
|
else
|
2016-08-12 14:05:19 -07:00
|
|
|
|
return get_sampler_lowered_simd_width(devinfo, inst);
|
|
|
|
|
|
|
2015-07-13 21:19:52 +03:00
|
|
|
|
case SHADER_OPCODE_TXF_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TXS_LOGICAL:
|
|
|
|
|
|
/* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
|
|
|
|
|
|
* messages. Use SIMD16 instead.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (devinfo->gen == 4)
|
|
|
|
|
|
return 16;
|
|
|
|
|
|
else
|
2016-08-12 14:05:19 -07:00
|
|
|
|
return get_sampler_lowered_simd_width(devinfo, inst);
|
2015-09-08 15:52:09 +01:00
|
|
|
|
|
2015-07-18 16:16:19 +03:00
|
|
|
|
case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
|
|
|
|
|
|
return 8;
|
|
|
|
|
|
|
2016-05-20 13:34:46 -07:00
|
|
|
|
case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
|
2018-04-18 14:02:33 -07:00
|
|
|
|
case SHADER_OPCODE_UNTYPED_ATOMIC_FLOAT_LOGICAL:
|
2016-05-20 13:34:46 -07:00
|
|
|
|
case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
|
2017-07-01 08:16:01 +02:00
|
|
|
|
case SHADER_OPCODE_BYTE_SCATTERED_WRITE_LOGICAL:
|
2017-07-01 08:19:17 +02:00
|
|
|
|
case SHADER_OPCODE_BYTE_SCATTERED_READ_LOGICAL:
|
2016-05-20 13:34:46 -07:00
|
|
|
|
return MIN2(16, inst->exec_size);
|
|
|
|
|
|
|
2018-11-14 17:13:57 -06:00
|
|
|
|
case SHADER_OPCODE_A64_UNTYPED_WRITE_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_A64_UNTYPED_READ_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_A64_BYTE_SCATTERED_WRITE_LOGICAL:
|
|
|
|
|
|
case SHADER_OPCODE_A64_BYTE_SCATTERED_READ_LOGICAL:
|
|
|
|
|
|
return devinfo->gen <= 8 ? 8 : MIN2(16, inst->exec_size);
|
|
|
|
|
|
|
2018-11-26 15:15:04 -06:00
|
|
|
|
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_LOGICAL:
|
2019-01-12 18:30:47 -06:00
|
|
|
|
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_INT64_LOGICAL:
|
2018-11-26 15:15:04 -06:00
|
|
|
|
case SHADER_OPCODE_A64_UNTYPED_ATOMIC_FLOAT_LOGICAL:
|
|
|
|
|
|
return 8;
|
|
|
|
|
|
|
2016-05-20 13:34:46 -07:00
|
|
|
|
case SHADER_OPCODE_URB_READ_SIMD8:
|
|
|
|
|
|
case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
|
|
|
|
|
|
case SHADER_OPCODE_URB_WRITE_SIMD8:
|
|
|
|
|
|
case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
|
|
|
|
|
|
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
|
|
|
|
|
|
case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
|
|
|
|
|
|
return MIN2(8, inst->exec_size);
|
|
|
|
|
|
|
2018-12-06 14:11:34 -08:00
|
|
|
|
case SHADER_OPCODE_QUAD_SWIZZLE: {
|
|
|
|
|
|
const unsigned swiz = inst->src[1].ud;
|
|
|
|
|
|
return (is_uniform(inst->src[0]) ?
|
|
|
|
|
|
get_fpu_lowered_simd_width(devinfo, inst) :
|
|
|
|
|
|
devinfo->gen < 11 && type_sz(inst->src[0].type) == 4 ? 8 :
|
|
|
|
|
|
swiz == BRW_SWIZZLE_XYXY || swiz == BRW_SWIZZLE_ZWZW ? 4 :
|
|
|
|
|
|
get_fpu_lowered_simd_width(devinfo, inst));
|
|
|
|
|
|
}
|
2016-08-03 11:51:44 +00:00
|
|
|
|
case SHADER_OPCODE_MOV_INDIRECT: {
|
|
|
|
|
|
/* From IVB and HSW PRMs:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "2.When the destination requires two registers and the sources are
|
|
|
|
|
|
* indirect, the sources must use 1x1 regioning mode.
|
|
|
|
|
|
*
|
|
|
|
|
|
* In case of DF instructions in HSW/IVB, the exec_size is limited by
|
|
|
|
|
|
* the EU decompression logic not handling VxH indirect addressing
|
|
|
|
|
|
* correctly.
|
|
|
|
|
|
*/
|
|
|
|
|
|
const unsigned max_size = (devinfo->gen >= 8 ? 2 : 1) * REG_SIZE;
|
|
|
|
|
|
/* Prior to Broadwell, we only have 8 address subregisters. */
|
2016-05-17 16:10:38 -07:00
|
|
|
|
return MIN3(devinfo->gen >= 8 ? 16 : 8,
|
2016-08-03 11:51:44 +00:00
|
|
|
|
max_size / (inst->dst.stride * type_sz(inst->dst.type)),
|
2016-05-17 16:10:38 -07:00
|
|
|
|
inst->exec_size);
|
2016-08-03 11:51:44 +00:00
|
|
|
|
}
|
2015-11-24 09:01:11 -08:00
|
|
|
|
|
2016-05-19 23:44:23 -07:00
|
|
|
|
case SHADER_OPCODE_LOAD_PAYLOAD: {
|
|
|
|
|
|
const unsigned reg_count =
|
|
|
|
|
|
DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
|
|
|
|
|
|
|
|
|
|
|
|
if (reg_count > 2) {
|
|
|
|
|
|
/* Only LOAD_PAYLOAD instructions with per-channel destination region
|
|
|
|
|
|
* can be easily lowered (which excludes headers and heterogeneous
|
|
|
|
|
|
* types).
|
|
|
|
|
|
*/
|
|
|
|
|
|
assert(!inst->header_size);
|
|
|
|
|
|
for (unsigned i = 0; i < inst->sources; i++)
|
|
|
|
|
|
assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
|
|
|
|
|
|
inst->src[i].file == BAD_FILE);
|
|
|
|
|
|
|
|
|
|
|
|
return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
return inst->exec_size;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2015-07-13 21:15:31 +03:00
|
|
|
|
default:
|
|
|
|
|
|
return inst->exec_size;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
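
/* For example (illustrative only): with the cases above, a SIMD16 CMP that
 * writes a GRF on Ivybridge is limited to 8 channels by the
 * WaCMPInstFlagDepClearedEarly workaround, whereas the same CMP with a null
 * destination, or any CMP on Haswell, keeps its full 16-channel execution
 * size subject only to the generic FPU region restrictions.
 */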
|
|
|
|
|
|
|
2016-05-28 22:44:13 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Return true if splitting out the group of channels of instruction \p inst
|
|
|
|
|
|
* given by lbld.group() requires allocating a temporary for the i-th source
|
|
|
|
|
|
* of the lowered instruction.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static inline bool
|
|
|
|
|
|
needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
|
|
|
|
|
|
{
|
|
|
|
|
|
return !(is_periodic(inst->src[i], lbld.dispatch_width()) ||
|
|
|
|
|
|
(inst->components_read(i) == 1 &&
|
2017-09-06 18:33:38 -07:00
|
|
|
|
lbld.dispatch_width() <= inst->exec_size)) ||
|
|
|
|
|
|
(inst->flags_written() &
|
|
|
|
|
|
flag_mask(inst->src[i], type_sz(inst->src[i].type)));
|
2016-05-28 22:44:13 -07:00
|
|
|
|
}
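
/* Concretely (illustrative): a uniform or otherwise periodic source never
 * needs a copy, and neither does a one-component source when the lowered
 * group is no wider than the original instruction.  A source with two or
 * more components does need a packed copy, because each component of the
 * original region stores all groups' channels together, so the data read by
 * a single group is not contiguous; so does any source that overlaps a flag
 * register written by the instruction.
 */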
|
|
|
|
|
|
|
2016-05-26 23:07:58 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Extract the data that would be consumed by the channel group given by
|
|
|
|
|
|
* lbld.group() from the i-th source region of instruction \p inst and return
|
2017-09-06 18:24:17 -07:00
|
|
|
|
* it as result in packed form.
|
2016-05-26 23:07:58 -07:00
|
|
|
|
*/
|
|
|
|
|
|
static fs_reg
|
2017-09-06 18:24:17 -07:00
|
|
|
|
emit_unzip(const fs_builder &lbld, fs_inst *inst, unsigned i)
|
2016-05-26 23:07:58 -07:00
|
|
|
|
{
|
2018-12-07 14:15:50 -08:00
|
|
|
|
assert(lbld.group() >= inst->group);
|
|
|
|
|
|
|
2016-05-26 23:07:58 -07:00
|
|
|
|
/* Specified channel group from the source region. */
|
2018-12-07 14:15:50 -08:00
|
|
|
|
const fs_reg src = horiz_offset(inst->src[i], lbld.group() - inst->group);
|
2016-05-26 23:07:58 -07:00
|
|
|
|
|
2016-05-28 22:44:13 -07:00
|
|
|
|
if (needs_src_copy(lbld, inst, i)) {
|
2016-05-26 23:07:58 -07:00
|
|
|
|
/* Builder of the right width to perform the copy avoiding uninitialized
|
|
|
|
|
|
* data if the lowered execution size is greater than the original
|
|
|
|
|
|
* execution size of the instruction.
|
|
|
|
|
|
*/
|
|
|
|
|
|
const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
|
|
|
|
|
|
inst->exec_size), 0);
|
|
|
|
|
|
const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
|
|
|
|
|
|
|
|
|
|
|
|
for (unsigned k = 0; k < inst->components_read(i); ++k)
|
2017-09-06 18:24:17 -07:00
|
|
|
|
cbld.MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
|
2016-05-26 23:07:58 -07:00
|
|
|
|
|
|
|
|
|
|
return tmp;
|
|
|
|
|
|
|
2016-05-28 22:44:13 -07:00
|
|
|
|
} else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
|
2016-05-26 23:07:58 -07:00
|
|
|
|
/* The source is invariant for all dispatch_width-wide groups of the
|
|
|
|
|
|
* original region.
|
|
|
|
|
|
*/
|
|
|
|
|
|
return inst->src[i];
|
2016-05-28 22:44:13 -07:00
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* We can just point the lowered instruction at the right channel group
|
|
|
|
|
|
* from the original region.
|
|
|
|
|
|
*/
|
|
|
|
|
|
return src;
|
2016-05-26 23:07:58 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
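
/* Sketch of the copy case (hypothetical IR, a SIMD16 instruction split into
 * SIMD8 groups, source i with two components), for the second group:
 *
 *    mov(8) tmp.0, src_i.0 <channels 8..15>
 *    mov(8) tmp.1, src_i.1 <channels 8..15>
 *
 * which leaves tmp packed at the lowered width so the split instruction can
 * read it with a simple per-channel region.
 */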
|
|
|
|
|
|
|
2016-05-27 00:45:04 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Return true if splitting out the group of channels of instruction \p inst
|
|
|
|
|
|
* given by lbld.group() requires allocating a temporary for the destination
|
|
|
|
|
|
* of the lowered instruction and copying the data back to the original
|
|
|
|
|
|
* destination region.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static inline bool
|
|
|
|
|
|
needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
|
|
|
|
|
|
{
|
|
|
|
|
|
/* If the instruction writes more than one component we'll have to shuffle
|
|
|
|
|
|
* the results of multiple lowered instructions in order to make sure that
|
|
|
|
|
|
* they end up arranged correctly in the original destination region.
|
|
|
|
|
|
*/
|
2016-09-07 13:38:20 -07:00
|
|
|
|
if (inst->size_written > inst->dst.component_size(inst->exec_size))
|
2016-05-27 00:45:04 -07:00
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
/* If the lowered execution size is larger than the original the result of
|
|
|
|
|
|
* the instruction won't fit in the original destination, so we'll have to
|
|
|
|
|
|
* allocate a temporary in any case.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (lbld.dispatch_width() > inst->exec_size)
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
for (unsigned i = 0; i < inst->sources; i++) {
|
|
|
|
|
|
/* If we already made a copy of the source for other reasons there won't
|
|
|
|
|
|
* be any overlap with the destination.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (needs_src_copy(lbld, inst, i))
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
/* In order to keep the logic simple we emit a copy whenever the
|
|
|
|
|
|
* destination region doesn't exactly match an overlapping source, which
|
|
|
|
|
|
* may point at the source and destination not being aligned group by
|
|
|
|
|
|
* group which could cause one of the lowered instructions to overwrite
|
|
|
|
|
|
* the data read from the same source by other lowered instructions.
|
|
|
|
|
|
*/
|
2016-09-07 13:38:20 -07:00
|
|
|
|
if (regions_overlap(inst->dst, inst->size_written,
|
2016-09-07 17:00:07 -07:00
|
|
|
|
inst->src[i], inst->size_read(i)) &&
|
2016-05-27 00:45:04 -07:00
|
|
|
|
!inst->dst.equals(inst->src[i]))
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
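
/* Put differently (illustrative): a single-component destination written by
 * a narrower group can be addressed in place with horiz_offset(), but a
 * destination with several components, a lowered width larger than the
 * original execution size, or a destination overlapping one of the uncopied
 * sources all force the split instructions to write a packed temporary that
 * is zipped back into place afterwards.
 */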
|
|
|
|
|
|
|
2016-05-26 23:07:58 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Insert data from a packed temporary into the channel group given by
|
|
|
|
|
|
* lbld.group() of the destination region of instruction \p inst and return
|
2017-09-06 18:24:17 -07:00
|
|
|
|
* the temporary as result. Any copy instructions that are required for
|
|
|
|
|
|
* unzipping the previous value (in the case of partial writes) will be
|
|
|
|
|
|
* inserted using \p lbld_before and any copy instructions required for
|
|
|
|
|
|
* zipping up the destination of \p inst will be inserted using \p lbld_after.
|
2016-05-26 23:07:58 -07:00
|
|
|
|
*/
|
|
|
|
|
|
static fs_reg
|
2017-09-06 18:24:17 -07:00
|
|
|
|
emit_zip(const fs_builder &lbld_before, const fs_builder &lbld_after,
|
|
|
|
|
|
fs_inst *inst)
|
2016-05-26 23:07:58 -07:00
|
|
|
|
{
|
2017-09-06 18:24:17 -07:00
|
|
|
|
assert(lbld_before.dispatch_width() == lbld_after.dispatch_width());
|
|
|
|
|
|
assert(lbld_before.group() == lbld_after.group());
|
2018-12-07 14:15:50 -08:00
|
|
|
|
assert(lbld_after.group() >= inst->group);
|
2016-05-26 23:07:58 -07:00
|
|
|
|
|
|
|
|
|
|
/* Specified channel group from the destination region. */
|
2018-12-07 14:15:50 -08:00
|
|
|
|
const fs_reg dst = horiz_offset(inst->dst, lbld_after.group() - inst->group);
|
2016-09-07 13:38:20 -07:00
|
|
|
|
const unsigned dst_size = inst->size_written /
|
|
|
|
|
|
inst->dst.component_size(inst->exec_size);
|
2016-05-27 00:45:04 -07:00
|
|
|
|
|
2017-09-06 18:24:17 -07:00
|
|
|
|
if (needs_dst_copy(lbld_after, inst)) {
|
|
|
|
|
|
const fs_reg tmp = lbld_after.vgrf(inst->dst.type, dst_size);
|
2016-05-26 23:07:58 -07:00
|
|
|
|
|
2016-05-27 01:02:19 -07:00
|
|
|
|
if (inst->predicate) {
|
|
|
|
|
|
/* Handle predication by copying the original contents of
|
|
|
|
|
|
* the destination into the temporary before emitting the
|
|
|
|
|
|
* lowered instruction.
|
|
|
|
|
|
*/
|
2017-09-06 18:24:17 -07:00
|
|
|
|
const fs_builder gbld_before =
|
|
|
|
|
|
lbld_before.group(MIN2(lbld_before.dispatch_width(),
|
|
|
|
|
|
inst->exec_size), 0);
|
|
|
|
|
|
for (unsigned k = 0; k < dst_size; ++k) {
|
|
|
|
|
|
gbld_before.MOV(offset(tmp, lbld_before, k),
|
|
|
|
|
|
offset(dst, inst->exec_size, k));
|
|
|
|
|
|
}
|
2016-05-27 01:02:19 -07:00
|
|
|
|
}
|
2016-05-26 23:07:58 -07:00
|
|
|
|
|
2017-09-06 18:24:17 -07:00
|
|
|
|
const fs_builder gbld_after =
|
|
|
|
|
|
lbld_after.group(MIN2(lbld_after.dispatch_width(),
|
|
|
|
|
|
inst->exec_size), 0);
|
|
|
|
|
|
for (unsigned k = 0; k < dst_size; ++k) {
|
|
|
|
|
|
/* Use a builder of the right width to perform the copy avoiding
|
|
|
|
|
|
* uninitialized data if the lowered execution size is greater than
|
|
|
|
|
|
* the original execution size of the instruction.
|
|
|
|
|
|
*/
|
|
|
|
|
|
gbld_after.MOV(offset(dst, inst->exec_size, k),
|
|
|
|
|
|
offset(tmp, lbld_after, k));
|
|
|
|
|
|
}
|
2016-05-26 23:07:58 -07:00
|
|
|
|
|
2016-05-27 01:02:19 -07:00
|
|
|
|
return tmp;
|
2016-05-27 00:45:04 -07:00
|
|
|
|
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* No need to allocate a temporary for the lowered instruction, just
|
|
|
|
|
|
* take the right group of channels from the original region.
|
|
|
|
|
|
*/
|
|
|
|
|
|
return dst;
|
|
|
|
|
|
}
|
2016-05-26 23:07:58 -07:00
|
|
|
|
}
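
/* Note on the predicated case above (illustrative): a predicated split
 * instruction only writes the enabled channels of the temporary, so the
 * previous contents of the corresponding destination channels are copied
 * into the temporary first; that way the final zip MOVs cannot clobber
 * channels the original instruction would have left untouched.
 */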
|
|
|
|
|
|
|
2015-07-13 21:15:31 +03:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::lower_simd_width()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
|
|
|
|
|
const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
|
|
|
|
|
|
|
|
|
|
|
|
if (lower_width != inst->exec_size) {
|
2015-07-27 18:42:31 +03:00
|
|
|
|
/* Builder matching the original instruction. We may also need to
|
|
|
|
|
|
* emit an instruction of width larger than the original, set the
|
|
|
|
|
|
* execution size of the builder to the highest of both for now so
|
|
|
|
|
|
* we're sure that both cases can be handled.
|
|
|
|
|
|
*/
|
2016-05-20 16:14:13 -07:00
|
|
|
|
const unsigned max_width = MAX2(inst->exec_size, lower_width);
|
2015-07-13 21:15:31 +03:00
|
|
|
|
const fs_builder ibld = bld.at(block, inst)
|
|
|
|
|
|
.exec_all(inst->force_writemask_all)
|
2016-05-20 16:14:13 -07:00
|
|
|
|
.group(max_width, inst->group / max_width);
|
2015-07-13 21:15:31 +03:00
|
|
|
|
|
|
|
|
|
|
/* Split the copies in chunks of the execution width of either the
|
|
|
|
|
|
* original or the lowered instruction, whichever is lower.
|
|
|
|
|
|
*/
|
2016-05-26 23:07:58 -07:00
|
|
|
|
const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
|
2016-09-07 13:38:20 -07:00
|
|
|
|
const unsigned dst_size = inst->size_written /
|
2015-07-13 21:15:31 +03:00
|
|
|
|
inst->dst.component_size(inst->exec_size);
|
|
|
|
|
|
|
2016-05-26 23:07:58 -07:00
|
|
|
|
assert(!inst->writes_accumulator && !inst->mlen);
|
2015-07-13 21:15:31 +03:00
|
|
|
|
|
2017-09-06 18:31:11 -07:00
|
|
|
|
/* Inserting the zip, unzip, and duplicated instructions in all of
|
|
|
|
|
|
* the right spots is somewhat tricky. All of the unzip and any
|
|
|
|
|
|
* instructions from the zip which unzip the destination prior to
|
|
|
|
|
|
* writing need to happen before all of the per-group instructions
|
|
|
|
|
|
* and the zip instructions need to happen after. In order to sort
|
|
|
|
|
|
* this all out, we insert the unzip instructions before \p inst,
|
|
|
|
|
|
* insert the per-group instructions after \p inst (i.e. before
|
|
|
|
|
|
* inst->next), and insert the zip instructions before the
|
|
|
|
|
|
* instruction after \p inst. Since we are inserting instructions
|
|
|
|
|
|
* after \p inst, inst->next is a moving target and we need to save
|
|
|
|
|
|
* it off here so that we insert the zip instructions in the right
|
|
|
|
|
|
* place.
|
2018-05-21 09:51:50 -07:00
|
|
|
|
*
|
|
|
|
|
|
* Since we're inserting split instructions after after_inst, the
|
|
|
|
|
|
* instructions will end up in the reverse order that we insert them.
|
|
|
|
|
|
* However, certain render target writes require that the low group
|
|
|
|
|
|
* instructions come before the high group. From the Ivy Bridge PRM
|
|
|
|
|
|
* Vol. 4, Pt. 1, Section 3.9.11:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "If multiple SIMD8 Dual Source messages are delivered by the
|
|
|
|
|
|
* pixel shader thread, each SIMD8_DUALSRC_LO message must be
|
|
|
|
|
|
* issued before the SIMD8_DUALSRC_HI message with the same Slot
|
|
|
|
|
|
* Group Select setting."
|
|
|
|
|
|
*
|
|
|
|
|
|
* And, from Section 3.9.11.1 of the same PRM:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "When SIMD32 or SIMD16 PS threads send render target writes
|
|
|
|
|
|
* with multiple SIMD8 and SIMD16 messages, the following must
|
|
|
|
|
|
* hold:
|
|
|
|
|
|
*
|
|
|
|
|
|
* All the slots (as described above) must have a corresponding
|
|
|
|
|
|
* render target write irrespective of the slot's validity. A slot
|
|
|
|
|
|
* is considered valid when at least one sample is enabled. For
|
|
|
|
|
|
* example, a SIMD16 PS thread must send two SIMD8 render target
|
|
|
|
|
|
* writes to cover all the slots.
|
|
|
|
|
|
*
|
|
|
|
|
|
* PS thread must send SIMD render target write messages with
|
|
|
|
|
|
* increasing slot numbers. For example, SIMD16 thread has
|
|
|
|
|
|
* Slot[15:0] and if two SIMD8 render target writes are used, the
|
|
|
|
|
|
* first SIMD8 render target write must send Slot[7:0] and the
|
|
|
|
|
|
* next one must send Slot[15:8]."
|
|
|
|
|
|
*
|
|
|
|
|
|
* In order to make low group instructions come before high group
|
|
|
|
|
|
* instructions (this is required for some render target writes), we
|
|
|
|
|
|
* split from the highest group to lowest.
|
2017-09-06 18:31:11 -07:00
|
|
|
|
*/
|
|
|
|
|
|
exec_node *const after_inst = inst->next;
|
2018-05-21 09:51:50 -07:00
|
|
|
|
for (int i = n - 1; i >= 0; i--) {
|
2015-07-13 21:15:31 +03:00
|
|
|
|
/* Emit a copy of the original instruction with the lowered width.
|
|
|
|
|
|
* If the EOT flag was set throw it away except for the last
|
|
|
|
|
|
* instruction to avoid killing the thread prematurely.
|
|
|
|
|
|
*/
|
|
|
|
|
|
fs_inst split_inst = *inst;
|
|
|
|
|
|
split_inst.exec_size = lower_width;
|
2018-07-16 13:32:36 -07:00
|
|
|
|
split_inst.eot = inst->eot && i == int(n - 1);
|
2015-07-13 21:15:31 +03:00
|
|
|
|
|
2015-07-27 18:42:31 +03:00
|
|
|
|
/* Select the correct channel enables for the i-th group, then
|
|
|
|
|
|
* transform the sources and destination and emit the lowered
|
|
|
|
|
|
* instruction.
|
2015-07-13 21:15:31 +03:00
|
|
|
|
*/
|
2015-07-27 18:42:31 +03:00
|
|
|
|
const fs_builder lbld = ibld.group(lower_width, i);
|
2015-07-13 21:15:31 +03:00
|
|
|
|
|
2016-05-26 23:07:58 -07:00
|
|
|
|
for (unsigned j = 0; j < inst->sources; j++)
|
2017-09-06 18:24:17 -07:00
|
|
|
|
split_inst.src[j] = emit_unzip(lbld.at(block, inst), inst, j);
|
2015-07-13 21:15:31 +03:00
|
|
|
|
|
2017-09-06 18:24:17 -07:00
|
|
|
|
split_inst.dst = emit_zip(lbld.at(block, inst),
|
2017-09-06 18:31:11 -07:00
|
|
|
|
lbld.at(block, after_inst), inst);
|
2016-09-07 13:38:20 -07:00
|
|
|
|
split_inst.size_written =
|
|
|
|
|
|
split_inst.dst.component_size(lower_width) * dst_size;
|
2015-07-13 21:15:31 +03:00
|
|
|
|
|
2017-09-06 18:31:11 -07:00
|
|
|
|
lbld.at(block, inst->next).emit(split_inst);
|
2015-07-13 21:15:31 +03:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
inst->remove(block);
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
|
invalidate_live_intervals();
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
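
/* End-to-end illustration (hypothetical): a SIMD16 SHADER_OPCODE_TXD_LOGICAL
 * has a lowered width of 8, so the loop above emits two SIMD8 copies of the
 * instruction plus the unzip copies for their sources and the zip copies for
 * their destination halves.  Because each split instruction is inserted
 * immediately after the original, iterating from the highest group to the
 * lowest leaves the group-0 instruction before the group-1 instruction in
 * program order, which satisfies the render-target-write ordering rule
 * quoted above; the original SIMD16 instruction is then removed.
 */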
|
|
|
|
|
|
|
2013-08-04 23:34:01 -07:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::dump_instructions()
|
2014-05-29 13:08:59 -07:00
|
|
|
|
{
|
|
|
|
|
|
dump_instructions(NULL);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::dump_instructions(const char *name)
|
2013-08-04 23:34:01 -07:00
|
|
|
|
{
|
2014-05-29 13:08:59 -07:00
|
|
|
|
FILE *file = stderr;
|
|
|
|
|
|
if (name && geteuid() != 0) {
|
|
|
|
|
|
file = fopen(name, "w");
|
|
|
|
|
|
if (!file)
|
|
|
|
|
|
file = stderr;
|
|
|
|
|
|
}
|
2013-08-04 23:34:01 -07:00
|
|
|
|
|
2015-02-13 10:46:32 -08:00
|
|
|
|
if (cfg) {
|
|
|
|
|
|
calculate_register_pressure();
|
|
|
|
|
|
int ip = 0, max_pressure = 0;
|
|
|
|
|
|
foreach_block_and_inst(block, backend_instruction, inst, cfg) {
|
|
|
|
|
|
max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
|
|
|
|
|
|
fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
|
|
|
|
|
|
dump_instruction(inst, file);
|
|
|
|
|
|
ip++;
|
|
|
|
|
|
}
|
|
|
|
|
|
fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
int ip = 0;
|
|
|
|
|
|
foreach_in_list(backend_instruction, inst, &instructions) {
|
|
|
|
|
|
fprintf(file, "%4d: ", ip++);
|
|
|
|
|
|
dump_instruction(inst, file);
|
|
|
|
|
|
}
|
2013-08-04 23:34:01 -07:00
|
|
|
|
}
|
2014-05-29 13:08:59 -07:00
|
|
|
|
|
|
|
|
|
|
if (file != stderr) {
|
|
|
|
|
|
fclose(file);
|
|
|
|
|
|
}
|
2013-08-04 23:34:01 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2012-10-30 15:35:44 -07:00
|
|
|
|
void
|
2013-04-29 14:21:14 -07:00
|
|
|
|
fs_visitor::dump_instruction(backend_instruction *be_inst)
|
2014-05-29 11:45:15 -07:00
|
|
|
|
{
|
|
|
|
|
|
dump_instruction(be_inst, stderr);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
|
2012-10-30 15:35:44 -07:00
|
|
|
|
{
|
2013-04-29 14:21:14 -07:00
|
|
|
|
fs_inst *inst = (fs_inst *)be_inst;
|
|
|
|
|
|
|
2012-12-06 10:36:11 -08:00
|
|
|
|
if (inst->predicate) {
|
2017-12-12 12:05:02 -08:00
|
|
|
|
fprintf(file, "(%cf%d.%d) ",
|
|
|
|
|
|
inst->predicate_inverse ? '-' : '+',
|
|
|
|
|
|
inst->flag_subreg / 2,
|
|
|
|
|
|
inst->flag_subreg % 2);
|
2012-12-06 10:36:11 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2016-04-28 00:19:12 -07:00
|
|
|
|
fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
|
2012-10-30 15:35:44 -07:00
|
|
|
|
if (inst->saturate)
|
2014-05-29 11:45:15 -07:00
|
|
|
|
fprintf(file, ".sat");
|
2012-12-06 10:36:11 -08:00
|
|
|
|
if (inst->conditional_mod) {
|
2014-05-29 11:45:15 -07:00
|
|
|
|
fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
|
2012-12-06 10:36:11 -08:00
|
|
|
|
if (!inst->predicate &&
|
2015-04-15 18:00:05 -07:00
|
|
|
|
(devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
|
2015-11-22 20:12:17 -08:00
|
|
|
|
inst->opcode != BRW_OPCODE_CSEL &&
|
2017-06-06 16:24:14 -07:00
|
|
|
|
inst->opcode != BRW_OPCODE_IF &&
|
|
|
|
|
|
inst->opcode != BRW_OPCODE_WHILE))) {
|
2017-12-12 12:05:02 -08:00
|
|
|
|
fprintf(file, ".f%d.%d", inst->flag_subreg / 2,
|
|
|
|
|
|
inst->flag_subreg % 2);
|
2012-12-06 10:36:11 -08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2014-09-16 18:02:52 -07:00
|
|
|
|
fprintf(file, "(%d) ", inst->exec_size);
|
2012-10-30 15:35:44 -07:00
|
|
|
|
|
2015-06-02 20:40:54 -07:00
|
|
|
|
if (inst->mlen) {
|
|
|
|
|
|
fprintf(file, "(mlen: %d) ", inst->mlen);
|
|
|
|
|
|
}
|
2012-12-06 10:36:11 -08:00
|
|
|
|
|
2018-10-29 15:06:14 -05:00
|
|
|
|
if (inst->ex_mlen) {
|
|
|
|
|
|
fprintf(file, "(ex_mlen: %d) ", inst->ex_mlen);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-06-26 00:39:32 -07:00
|
|
|
|
if (inst->eot) {
|
|
|
|
|
|
fprintf(file, "(EOT) ");
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2012-10-30 15:35:44 -07:00
|
|
|
|
switch (inst->dst.file) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
case VGRF:
|
2015-10-26 04:35:14 -07:00
|
|
|
|
fprintf(file, "vgrf%d", inst->dst.nr);
|
2012-10-30 15:35:44 -07:00
|
|
|
|
break;
|
2015-10-26 17:52:57 -07:00
|
|
|
|
case FIXED_GRF:
|
|
|
|
|
|
fprintf(file, "g%d", inst->dst.nr);
|
|
|
|
|
|
break;
|
2012-10-30 15:35:44 -07:00
|
|
|
|
case MRF:
|
2015-10-26 04:35:14 -07:00
|
|
|
|
fprintf(file, "m%d", inst->dst.nr);
|
2012-10-30 15:35:44 -07:00
|
|
|
|
break;
|
|
|
|
|
|
case BAD_FILE:
|
2014-05-29 11:45:15 -07:00
|
|
|
|
fprintf(file, "(null)");
|
2012-10-30 15:35:44 -07:00
|
|
|
|
break;
|
|
|
|
|
|
case UNIFORM:
|
2016-09-01 20:31:47 -07:00
|
|
|
|
fprintf(file, "***u%d***", inst->dst.nr);
|
2012-10-30 15:35:44 -07:00
|
|
|
|
break;
|
2014-10-20 23:16:48 -07:00
|
|
|
|
case ATTR:
|
2016-09-01 20:31:47 -07:00
|
|
|
|
fprintf(file, "***attr%d***", inst->dst.nr);
|
2014-10-20 23:16:48 -07:00
|
|
|
|
break;
|
2015-10-26 17:52:57 -07:00
|
|
|
|
case ARF:
|
|
|
|
|
|
switch (inst->dst.nr) {
|
|
|
|
|
|
case BRW_ARF_NULL:
|
|
|
|
|
|
fprintf(file, "null");
|
|
|
|
|
|
break;
|
|
|
|
|
|
case BRW_ARF_ADDRESS:
|
|
|
|
|
|
fprintf(file, "a0.%d", inst->dst.subnr);
|
|
|
|
|
|
break;
|
|
|
|
|
|
case BRW_ARF_ACCUMULATOR:
|
|
|
|
|
|
fprintf(file, "acc%d", inst->dst.subnr);
|
|
|
|
|
|
break;
|
|
|
|
|
|
case BRW_ARF_FLAG:
|
|
|
|
|
|
fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
|
|
|
|
|
|
break;
|
|
|
|
|
|
default:
|
|
|
|
|
|
fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
|
|
|
|
|
|
break;
|
2013-11-25 15:37:18 -08:00
|
|
|
|
}
|
2013-10-08 23:30:08 -07:00
|
|
|
|
break;
|
2015-10-26 06:58:56 -07:00
|
|
|
|
case IMM:
|
|
|
|
|
|
unreachable("not reached");
|
2012-10-30 15:35:44 -07:00
|
|
|
|
}
|
2016-09-01 20:31:47 -07:00
|
|
|
|
|
|
|
|
|
|
if (inst->dst.offset ||
|
|
|
|
|
|
(inst->dst.file == VGRF &&
|
|
|
|
|
|
alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
|
|
|
|
|
|
const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
|
|
|
|
|
|
fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
|
|
|
|
|
|
inst->dst.offset % reg_size);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-08-05 09:41:18 -07:00
|
|
|
|
if (inst->dst.stride != 1)
|
|
|
|
|
|
fprintf(file, "<%u>", inst->dst.stride);
|
2017-07-26 17:31:36 -07:00
|
|
|
|
fprintf(file, ":%s, ", brw_reg_type_to_letters(inst->dst.type));
|
2012-10-30 15:35:44 -07:00
|
|
|
|
|
2014-09-16 15:56:47 -07:00
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2012-10-30 15:35:44 -07:00
|
|
|
|
if (inst->src[i].negate)
|
2014-05-29 11:45:15 -07:00
|
|
|
|
fprintf(file, "-");
|
2012-10-30 15:35:44 -07:00
|
|
|
|
if (inst->src[i].abs)
|
2014-05-29 11:45:15 -07:00
|
|
|
|
fprintf(file, "|");
|
2012-10-30 15:35:44 -07:00
|
|
|
|
switch (inst->src[i].file) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
case VGRF:
|
2015-10-26 04:35:14 -07:00
|
|
|
|
fprintf(file, "vgrf%d", inst->src[i].nr);
|
2012-10-30 15:35:44 -07:00
|
|
|
|
break;
|
2015-10-26 17:52:57 -07:00
|
|
|
|
case FIXED_GRF:
|
|
|
|
|
|
fprintf(file, "g%d", inst->src[i].nr);
|
|
|
|
|
|
break;
|
2012-10-30 15:35:44 -07:00
|
|
|
|
case MRF:
|
2015-10-26 04:35:14 -07:00
|
|
|
|
fprintf(file, "***m%d***", inst->src[i].nr);
|
2012-10-30 15:35:44 -07:00
|
|
|
|
break;
|
2014-10-20 23:16:48 -07:00
|
|
|
|
case ATTR:
|
2016-09-01 20:31:47 -07:00
|
|
|
|
fprintf(file, "attr%d", inst->src[i].nr);
|
2014-10-20 23:16:48 -07:00
|
|
|
|
break;
|
2012-10-30 15:35:44 -07:00
|
|
|
|
case UNIFORM:
|
2016-09-01 20:31:47 -07:00
|
|
|
|
fprintf(file, "u%d", inst->src[i].nr);
|
2012-10-30 15:35:44 -07:00
|
|
|
|
break;
|
|
|
|
|
|
case BAD_FILE:
|
2014-05-29 11:45:15 -07:00
|
|
|
|
fprintf(file, "(null)");
|
2012-10-30 15:35:44 -07:00
|
|
|
|
break;
|
2013-02-15 19:55:46 -08:00
|
|
|
|
case IMM:
|
|
|
|
|
|
switch (inst->src[i].type) {
|
|
|
|
|
|
case BRW_REGISTER_TYPE_F:
|
2016-02-11 19:03:56 +13:00
|
|
|
|
fprintf(file, "%-gf", inst->src[i].f);
|
2013-02-15 19:55:46 -08:00
|
|
|
|
break;
|
2015-08-03 15:00:51 -07:00
|
|
|
|
case BRW_REGISTER_TYPE_DF:
|
|
|
|
|
|
fprintf(file, "%fdf", inst->src[i].df);
|
|
|
|
|
|
break;
|
2015-01-08 22:55:16 -08:00
|
|
|
|
case BRW_REGISTER_TYPE_W:
|
2013-02-15 19:55:46 -08:00
|
|
|
|
case BRW_REGISTER_TYPE_D:
|
2015-10-24 14:55:57 -07:00
|
|
|
|
fprintf(file, "%dd", inst->src[i].d);
|
2013-02-15 19:55:46 -08:00
|
|
|
|
break;
|
2015-01-08 22:55:16 -08:00
|
|
|
|
case BRW_REGISTER_TYPE_UW:
|
2013-02-15 19:55:46 -08:00
|
|
|
|
case BRW_REGISTER_TYPE_UD:
|
2015-10-24 14:55:57 -07:00
|
|
|
|
fprintf(file, "%uu", inst->src[i].ud);
|
2013-02-15 19:55:46 -08:00
|
|
|
|
break;
|
2018-09-06 11:15:55 -07:00
|
|
|
|
case BRW_REGISTER_TYPE_Q:
|
|
|
|
|
|
fprintf(file, "%" PRId64 "q", inst->src[i].d64);
|
|
|
|
|
|
break;
|
|
|
|
|
|
case BRW_REGISTER_TYPE_UQ:
|
|
|
|
|
|
fprintf(file, "%" PRIu64 "uq", inst->src[i].u64);
|
|
|
|
|
|
break;
|
2014-03-08 17:25:34 -08:00
|
|
|
|
case BRW_REGISTER_TYPE_VF:
|
2014-12-31 16:54:44 -08:00
|
|
|
|
fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
|
2015-10-24 14:55:57 -07:00
|
|
|
|
brw_vf_to_float((inst->src[i].ud >> 0) & 0xff),
|
|
|
|
|
|
brw_vf_to_float((inst->src[i].ud >> 8) & 0xff),
|
|
|
|
|
|
brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
|
|
|
|
|
|
brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
|
2014-03-08 17:25:34 -08:00
|
|
|
|
break;
|
2018-11-05 09:52:09 -08:00
|
|
|
|
case BRW_REGISTER_TYPE_V:
|
|
|
|
|
|
case BRW_REGISTER_TYPE_UV:
|
|
|
|
|
|
fprintf(file, "%08x%s", inst->src[i].ud,
|
|
|
|
|
|
inst->src[i].type == BRW_REGISTER_TYPE_V ? "V" : "UV");
|
|
|
|
|
|
break;
|
2013-02-15 19:55:46 -08:00
|
|
|
|
default:
|
2014-05-29 11:45:15 -07:00
|
|
|
|
fprintf(file, "???");
|
2013-02-15 19:55:46 -08:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
2015-10-26 17:52:57 -07:00
|
|
|
|
case ARF:
|
|
|
|
|
|
switch (inst->src[i].nr) {
|
|
|
|
|
|
case BRW_ARF_NULL:
|
|
|
|
|
|
fprintf(file, "null");
|
|
|
|
|
|
break;
|
|
|
|
|
|
case BRW_ARF_ADDRESS:
|
|
|
|
|
|
fprintf(file, "a0.%d", inst->src[i].subnr);
|
|
|
|
|
|
break;
|
|
|
|
|
|
case BRW_ARF_ACCUMULATOR:
|
|
|
|
|
|
fprintf(file, "acc%d", inst->src[i].subnr);
|
|
|
|
|
|
break;
|
|
|
|
|
|
case BRW_ARF_FLAG:
|
|
|
|
|
|
fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
|
|
|
|
|
|
break;
|
|
|
|
|
|
default:
|
|
|
|
|
|
fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
|
|
|
|
|
|
break;
|
2013-11-25 15:37:18 -08:00
|
|
|
|
}
|
2013-10-08 23:30:08 -07:00
|
|
|
|
break;
|
2012-10-30 15:35:44 -07:00
|
|
|
|
}
|
2016-09-01 20:31:47 -07:00
|
|
|
|
|
|
|
|
|
|
if (inst->src[i].offset ||
|
|
|
|
|
|
(inst->src[i].file == VGRF &&
|
|
|
|
|
|
alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
|
|
|
|
|
|
const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
|
|
|
|
|
|
fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
|
|
|
|
|
|
inst->src[i].offset % reg_size);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2012-10-30 15:35:44 -07:00
|
|
|
|
if (inst->src[i].abs)
|
2014-05-29 11:45:15 -07:00
|
|
|
|
fprintf(file, "|");
|
2012-10-30 15:35:44 -07:00
|
|
|
|
|
2013-12-02 13:10:29 -08:00
|
|
|
|
if (inst->src[i].file != IMM) {
|
2015-08-05 09:41:18 -07:00
|
|
|
|
unsigned stride;
|
|
|
|
|
|
if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
|
|
|
|
|
|
unsigned hstride = inst->src[i].hstride;
|
|
|
|
|
|
stride = (hstride == 0 ? 0 : (1 << (hstride - 1)));
|
|
|
|
|
|
} else {
|
|
|
|
|
|
stride = inst->src[i].stride;
|
|
|
|
|
|
}
|
|
|
|
|
|
if (stride != 1)
|
|
|
|
|
|
fprintf(file, "<%u>", stride);
|
|
|
|
|
|
|
2017-07-26 17:31:36 -07:00
|
|
|
|
fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type));
|
2013-12-02 13:10:29 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-03-17 10:39:43 -07:00
|
|
|
|
if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
|
2014-05-29 11:45:15 -07:00
|
|
|
|
fprintf(file, ", ");
|
2012-10-30 15:35:44 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-05-29 11:45:15 -07:00
|
|
|
|
fprintf(file, " ");
|
2012-10-30 15:35:44 -07:00
|
|
|
|
|
2015-11-09 23:55:58 -08:00
|
|
|
|
if (inst->force_writemask_all)
|
|
|
|
|
|
fprintf(file, "NoMask ");
|
|
|
|
|
|
|
2016-05-20 16:14:13 -07:00
|
|
|
|
if (inst->exec_size != dispatch_width)
|
|
|
|
|
|
fprintf(file, "group%d ", inst->group);
|
2012-10-30 15:35:44 -07:00
|
|
|
|
|
2014-05-29 11:45:15 -07:00
|
|
|
|
fprintf(file, "\n");
|
2012-10-30 15:35:44 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2012-11-13 19:36:18 -08:00
|
|
|
|
void
|
2016-02-10 21:20:01 -08:00
|
|
|
|
fs_visitor::setup_fs_payload_gen6()
|
2012-11-13 19:36:18 -08:00
|
|
|
|
{
|
2016-02-10 21:20:01 -08:00
|
|
|
|
assert(stage == MESA_SHADER_FRAGMENT);
|
2016-09-08 23:48:51 -07:00
|
|
|
|
struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
|
2017-01-13 15:40:38 -08:00
|
|
|
|
const unsigned payload_width = MIN2(16, dispatch_width);
|
|
|
|
|
|
assert(dispatch_width % payload_width == 0);
|
2015-04-15 18:00:05 -07:00
|
|
|
|
assert(devinfo->gen >= 6);
|
2012-11-13 19:36:18 -08:00
|
|
|
|
|
2017-01-13 15:40:38 -08:00
|
|
|
|
prog_data->uses_src_depth = prog_data->uses_src_w =
|
2019-07-18 09:59:44 -05:00
|
|
|
|
(nir->info.system_values_read & (1ull << SYSTEM_VALUE_FRAG_COORD)) != 0;
|
2017-01-13 15:40:38 -08:00
|
|
|
|
|
|
|
|
|
|
prog_data->uses_sample_mask =
|
|
|
|
|
|
(nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
|
|
|
|
|
|
|
|
|
|
|
|
/* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "MSDISPMODE_PERSAMPLE is required in order to select
|
|
|
|
|
|
* POSOFFSET_SAMPLE"
|
|
|
|
|
|
*
|
|
|
|
|
|
* So we can only really get sample positions if we are doing real
|
|
|
|
|
|
* per-sample dispatch. If we need gl_SamplePosition and we don't have
|
|
|
|
|
|
* persample dispatch, we hard-code it to 0.5.
|
2012-11-13 19:36:18 -08:00
|
|
|
|
*/
|
2017-01-13 15:40:38 -08:00
|
|
|
|
prog_data->uses_pos_offset = prog_data->persample_dispatch &&
|
|
|
|
|
|
(nir->info.system_values_read & SYSTEM_BIT_SAMPLE_POS);
|
|
|
|
|
|
|
|
|
|
|
|
/* R0: PS thread payload header. */
|
|
|
|
|
|
payload.num_regs++;
|
|
|
|
|
|
|
|
|
|
|
|
for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
|
|
|
|
|
|
/* R1: masks, pixel X/Y coordinates. */
|
|
|
|
|
|
payload.subspan_coord_reg[j] = payload.num_regs++;
|
2012-11-13 19:36:18 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2017-01-13 15:40:38 -08:00
|
|
|
|
for (unsigned j = 0; j < dispatch_width / payload_width; j++) {
|
|
|
|
|
|
/* R3-26: barycentric interpolation coordinates. These appear in the
|
|
|
|
|
|
* same order that they appear in the brw_barycentric_mode enum. Each
|
|
|
|
|
|
* set of coordinates occupies 2 registers if dispatch width == 8 and 4
|
|
|
|
|
|
* registers if dispatch width == 16. Coordinates only appear if they
|
|
|
|
|
|
* were enabled using the "Barycentric Interpolation Mode" bits in
|
|
|
|
|
|
* WM_STATE.
|
|
|
|
|
|
*/
|
|
|
|
|
|
for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
|
|
|
|
|
|
if (prog_data->barycentric_interp_modes & (1 << i)) {
|
|
|
|
|
|
payload.barycentric_coord_reg[i][j] = payload.num_regs;
|
|
|
|
|
|
payload.num_regs += payload_width / 4;
|
|
|
|
|
|
}
|
2012-11-13 19:36:18 -08:00
|
|
|
|
}
|
2016-02-10 21:27:57 -08:00
|
|
|
|
|
2017-01-13 15:40:38 -08:00
|
|
|
|
/* R27-28: interpolated depth if uses source depth */
|
|
|
|
|
|
if (prog_data->uses_src_depth) {
|
|
|
|
|
|
payload.source_depth_reg[j] = payload.num_regs;
|
|
|
|
|
|
payload.num_regs += payload_width / 8;
|
2012-11-13 19:36:18 -08:00
|
|
|
|
}
|
2013-10-24 15:53:05 -07:00
|
|
|
|
|
2017-01-13 15:40:38 -08:00
|
|
|
|
/* R29-30: interpolated W set if GEN6_WM_USES_SOURCE_W. */
|
|
|
|
|
|
if (prog_data->uses_src_w) {
|
|
|
|
|
|
payload.source_w_reg[j] = payload.num_regs;
|
|
|
|
|
|
payload.num_regs += payload_width / 8;
|
|
|
|
|
|
}
|
2013-10-24 15:53:05 -07:00
|
|
|
|
|
2017-01-13 15:40:38 -08:00
|
|
|
|
/* R31: MSAA position offsets. */
|
|
|
|
|
|
if (prog_data->uses_pos_offset) {
|
|
|
|
|
|
payload.sample_pos_reg[j] = payload.num_regs;
|
2014-05-13 21:52:51 -07:00
|
|
|
|
payload.num_regs++;
|
2013-12-08 20:29:43 +13:00
|
|
|
|
}
|
|
|
|
|
|
|
2017-01-13 15:40:38 -08:00
|
|
|
|
/* R32-33: MSAA input coverage mask */
|
|
|
|
|
|
if (prog_data->uses_sample_mask) {
|
|
|
|
|
|
assert(devinfo->gen >= 7);
|
|
|
|
|
|
payload.sample_mask_in_reg[j] = payload.num_regs;
|
|
|
|
|
|
payload.num_regs += payload_width / 8;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2012-11-13 19:36:18 -08:00
|
|
|
|
|
2017-05-08 09:20:21 -07:00
|
|
|
|
if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
|
2014-05-14 00:08:58 -07:00
|
|
|
|
source_depth_to_render_target = true;
|
2012-11-13 19:36:18 -08:00
|
|
|
|
}
|
|
|
|
|
|
}
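
/* Example layout (hypothetical SIMD16 shader on Gen7 with one barycentric
 * mode, source depth and no other payload inputs), following the loops
 * above: g0 = thread header, g1 = masks and pixel X/Y, g2..g5 = one set of
 * barycentric coordinates (payload_width / 4 = 4 registers), g6..g7 =
 * interpolated depth (payload_width / 8 = 2 registers), giving
 * payload.num_regs = 8.
 */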
|
|
|
|
|
|
|
2014-10-27 22:42:50 -07:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::setup_vs_payload()
|
|
|
|
|
|
{
|
|
|
|
|
|
/* R0: thread header, R1: urb handles */
|
|
|
|
|
|
payload.num_regs = 2;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::setup_gs_payload()
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(stage == MESA_SHADER_GEOMETRY);
|
|
|
|
|
|
|
2016-09-08 23:48:51 -07:00
|
|
|
|
struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
|
|
|
|
|
|
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
|
|
/* R0: thread header, R1: output URB handles */
|
|
|
|
|
|
payload.num_regs = 2;
|
|
|
|
|
|
|
|
|
|
|
|
if (gs_prog_data->include_primitive_id) {
|
|
|
|
|
|
/* R2: Primitive ID 0..7 */
|
|
|
|
|
|
payload.num_regs++;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2017-09-27 11:36:31 +02:00
|
|
|
|
/* Always enable VUE handles so we can safely use pull model if needed.
|
|
|
|
|
|
*
|
|
|
|
|
|
* The push model for a GS uses a ton of register space even for trivial
|
|
|
|
|
|
* scenarios with just a few inputs, so just make things easier and a bit
|
|
|
|
|
|
* safer by always having pull model available.
|
|
|
|
|
|
*/
|
|
|
|
|
|
gs_prog_data->base.include_vue_handles = true;
|
|
|
|
|
|
|
|
|
|
|
|
/* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
|
|
|
|
|
|
payload.num_regs += nir->info.gs.vertices_in;
|
|
|
|
|
|
|
2016-05-09 18:11:00 -07:00
|
|
|
|
/* Use a maximum of 24 registers for push-model inputs. */
|
|
|
|
|
|
const unsigned max_push_components = 24;
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
|
|
/* If pushing our inputs would take too many registers, reduce the URB read
|
|
|
|
|
|
* length (which is in HWords, or 8 registers), and resort to pulling.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Note that the GS reads <URB Read Length> HWords for every vertex - so we
|
|
|
|
|
|
* have to multiply by VerticesIn to obtain the total storage requirement.
|
|
|
|
|
|
*/
|
2017-05-08 09:20:21 -07:00
|
|
|
|
if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
|
2017-09-27 11:36:31 +02:00
|
|
|
|
max_push_components) {
|
2015-03-11 23:14:31 -07:00
|
|
|
|
vue_prog_data->urb_read_length =
|
2017-05-08 09:20:21 -07:00
|
|
|
|
ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
|
2015-03-11 23:14:31 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
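
/* Worked example of the clamp above (hypothetical values): with
 * nir->info.gs.vertices_in == 3 and an initial urb_read_length of 2 HWords,
 * the push-model footprint is 8 * 2 * 3 = 48 registers, which exceeds the
 * 24-register budget, so urb_read_length is reduced to
 * ROUND_DOWN_TO(24 / 3, 8) / 8 = 1 HWord per vertex and the remaining inputs
 * are fetched through the pull model using the ICP handles set up above.
 */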
|
|
|
|
|
|
|
2014-08-30 19:57:39 -07:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::setup_cs_payload()
|
|
|
|
|
|
{
|
2015-06-19 17:19:38 -07:00
|
|
|
|
assert(devinfo->gen >= 7);
|
2014-08-30 19:57:39 -07:00
|
|
|
|
payload.num_regs = 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2013-08-04 23:27:14 -07:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::calculate_register_pressure()
|
|
|
|
|
|
{
|
2014-09-01 10:54:00 -07:00
|
|
|
|
invalidate_live_intervals();
|
2013-08-04 23:27:14 -07:00
|
|
|
|
calculate_live_intervals();
|
|
|
|
|
|
|
2014-09-26 16:08:52 -07:00
|
|
|
|
unsigned num_instructions = 0;
|
|
|
|
|
|
foreach_block(block, cfg)
|
|
|
|
|
|
num_instructions += block->instructions.length();
|
2013-08-04 23:27:14 -07:00
|
|
|
|
|
|
|
|
|
|
regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
|
|
|
|
|
|
|
2015-02-10 15:51:34 +02:00
|
|
|
|
for (unsigned reg = 0; reg < alloc.count; reg++) {
|
2013-08-04 23:27:14 -07:00
|
|
|
|
for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
|
2015-02-10 15:51:34 +02:00
|
|
|
|
regs_live_at_ip[ip] += alloc.sizes[reg];
|
2013-08-04 23:27:14 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
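
/* For instance (illustrative): a virtual GRF that occupies two physical
 * registers (alloc.sizes[reg] == 2) and is live from instruction 3 through
 * instruction 10 adds 2 to each of regs_live_at_ip[3] .. regs_live_at_ip[10];
 * the maximum of the resulting array is what dump_instructions() reports as
 * the peak register pressure.
 */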
|
|
|
|
|
|
|
2014-11-13 16:28:18 -08:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::optimize()
|
|
|
|
|
|
{
|
2015-07-02 15:41:02 -07:00
|
|
|
|
/* Start by validating the shader we currently have. */
|
|
|
|
|
|
validate();
|
|
|
|
|
|
|
2015-06-03 19:59:44 +03:00
|
|
|
|
/* bld is the common builder object pointing at the end of the program we
|
|
|
|
|
|
* used to translate it into i965 IR. For the optimization and lowering
|
|
|
|
|
|
* passes coming next, any code added after the end of the program without
|
|
|
|
|
|
* having explicitly called fs_builder::at() clearly points at a mistake.
|
|
|
|
|
|
* Ideally optimization passes wouldn't be part of the visitor so they
|
|
|
|
|
|
* wouldn't have access to bld at all, but they do, so just in case some
|
|
|
|
|
|
* pass forgets to ask for a location explicitly set it to NULL here to
|
2015-07-27 18:51:01 +03:00
|
|
|
|
* make it trip. The dispatch width is initialized to a bogus value to
|
|
|
|
|
|
* make sure that optimizations set the execution controls explicitly to
|
|
|
|
|
|
* match the code they are manipulating instead of relying on the defaults.
|
2015-06-03 19:59:44 +03:00
|
|
|
|
*/
|
2015-07-27 18:51:01 +03:00
|
|
|
|
bld = fs_builder(this, 64);
|
2015-06-03 19:59:44 +03:00
|
|
|
|
|
2014-11-13 16:28:18 -08:00
|
|
|
|
assign_constant_locations();
|
2015-12-08 17:14:49 -08:00
|
|
|
|
lower_constant_loads();
|
2014-11-13 16:28:18 -08:00
|
|
|
|
|
2015-07-02 15:41:02 -07:00
|
|
|
|
validate();
|
|
|
|
|
|
|
2015-08-19 14:29:53 -07:00
|
|
|
|
split_virtual_grfs();
|
2015-07-02 15:41:02 -07:00
|
|
|
|
validate();
|
2015-08-19 14:29:53 -07:00
|
|
|
|
|
2015-01-16 01:05:21 -08:00
|
|
|
|
#define OPT(pass, args...) ({ \
|
2014-11-13 16:28:18 -08:00
|
|
|
|
pass_num++; \
|
|
|
|
|
|
bool this_progress = pass(args); \
|
|
|
|
|
|
\
|
|
|
|
|
|
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
|
|
|
|
|
|
char filename[64]; \
|
2015-10-01 15:12:59 -07:00
|
|
|
|
snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass, \
|
2017-05-08 09:20:21 -07:00
|
|
|
|
stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
|
2014-11-13 16:28:18 -08:00
|
|
|
|
\
|
2015-05-20 09:44:01 -07:00
|
|
|
|
backend_shader::dump_instructions(filename); \
|
2014-11-13 16:28:18 -08:00
|
|
|
|
} \
|
|
|
|
|
|
\
|
2015-07-02 15:41:02 -07:00
|
|
|
|
validate(); \
|
|
|
|
|
|
\
|
2014-11-13 16:28:18 -08:00
|
|
|
|
progress = progress || this_progress; \
|
2015-01-16 01:05:21 -08:00
|
|
|
|
this_progress; \
|
|
|
|
|
|
})
|
2014-11-13 16:28:18 -08:00
|
|
|
|
|
|
|
|
|
|
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
|
|
|
|
|
|
char filename[64];
|
2016-02-11 10:55:48 -08:00
|
|
|
|
snprintf(filename, 64, "%s%d-%s-00-00-start",
|
2017-05-08 09:20:21 -07:00
|
|
|
|
stage_abbrev, dispatch_width, nir->info.name);
|
2014-11-13 16:28:18 -08:00
|
|
|
|
|
2015-05-20 09:44:01 -07:00
|
|
|
|
backend_shader::dump_instructions(filename);
|
2014-11-13 16:28:18 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2015-07-13 17:44:58 +03:00
|
|
|
|
bool progress = false;
|
2014-11-13 16:28:18 -08:00
|
|
|
|
int iteration = 0;
|
2015-01-16 01:05:21 -08:00
|
|
|
|
int pass_num = 0;
|
2015-07-13 17:44:58 +03:00
|
|
|
|
|
2018-09-09 11:37:24 -07:00
|
|
|
|
/* Before anything else, eliminate dead code. The results of some NIR
|
|
|
|
|
|
* instructions may effectively be calculated twice. Once when the
|
|
|
|
|
|
* instruction is encountered, and again when the user of that result is
|
|
|
|
|
|
* encountered. Wipe those away before algebraic optimizations and
|
|
|
|
|
|
* especially copy propagation can mix things up.
|
|
|
|
|
|
*/
|
|
|
|
|
|
OPT(dead_code_eliminate);
|
|
|
|
|
|
|
2017-07-01 08:14:56 +02:00
|
|
|
|
OPT(remove_extra_rounding_modes);
|
2016-04-20 14:22:53 -07:00
|
|
|
|
|
2014-11-13 16:28:18 -08:00
|
|
|
|
do {
|
|
|
|
|
|
progress = false;
|
2015-01-16 01:05:21 -08:00
|
|
|
|
pass_num = 0;
|
2014-11-13 16:28:18 -08:00
|
|
|
|
iteration++;
|
|
|
|
|
|
|
|
|
|
|
|
OPT(remove_duplicate_mrf_writes);
|
|
|
|
|
|
|
|
|
|
|
|
OPT(opt_algebraic);
|
|
|
|
|
|
OPT(opt_cse);
|
2016-11-28 10:45:08 -08:00
|
|
|
|
OPT(opt_copy_propagation);
|
2015-10-02 20:30:41 -07:00
|
|
|
|
OPT(opt_predicated_break, this);
|
2014-08-22 10:54:43 -07:00
|
|
|
|
OPT(opt_cmod_propagation);
|
2014-11-13 16:28:18 -08:00
|
|
|
|
OPT(dead_code_eliminate);
|
|
|
|
|
|
OPT(opt_peephole_sel);
|
|
|
|
|
|
OPT(dead_control_flow_eliminate, this);
|
|
|
|
|
|
OPT(opt_register_renaming);
|
|
|
|
|
|
OPT(opt_saturate_propagation);
|
|
|
|
|
|
OPT(register_coalesce);
|
|
|
|
|
|
OPT(compute_to_mrf);
|
2015-02-20 20:25:04 +02:00
|
|
|
|
OPT(eliminate_find_live_channel);
|
2014-11-13 16:28:18 -08:00
|
|
|
|
|
|
|
|
|
|
OPT(compact_virtual_grfs);
|
|
|
|
|
|
} while (progress);
|
|
|
|
|
|
|
2018-02-21 18:06:56 -08:00
|
|
|
|
/* Do this after cmod propagation has had every possible opportunity to
|
|
|
|
|
|
* propagate results into SEL instructions.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (OPT(opt_peephole_csel))
|
|
|
|
|
|
OPT(dead_code_eliminate);
|
|
|
|
|
|
|
2016-04-30 15:08:29 -07:00
|
|
|
|
progress = false;
|
2015-01-16 01:05:21 -08:00
|
|
|
|
pass_num = 0;
|
|
|
|
|
|
|
2016-04-01 11:54:47 +02:00
|
|
|
|
if (OPT(lower_pack)) {
|
|
|
|
|
|
OPT(register_coalesce);
|
|
|
|
|
|
OPT(dead_code_eliminate);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-04-30 15:08:29 -07:00
|
|
|
|
OPT(lower_simd_width);
|
2016-05-20 00:38:17 -07:00
|
|
|
|
|
|
|
|
|
|
/* After SIMD lowering just in case we had to unroll the EOT send. */
|
|
|
|
|
|
OPT(opt_sampler_eot);
|
|
|
|
|
|
|
2016-04-30 15:08:29 -07:00
|
|
|
|
OPT(lower_logical_sends);
|
|
|
|
|
|
|
|
|
|
|
|
if (progress) {
|
2016-11-28 10:45:08 -08:00
|
|
|
|
OPT(opt_copy_propagation);
|
2016-04-30 15:08:29 -07:00
|
|
|
|
/* Only run after logical send lowering because it's easier to implement
|
|
|
|
|
|
* in terms of physical sends.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (OPT(opt_zero_samples))
|
2016-11-28 10:45:08 -08:00
|
|
|
|
OPT(opt_copy_propagation);
|
2016-04-30 15:08:29 -07:00
|
|
|
|
/* Run after logical send lowering to give it a chance to CSE the
|
|
|
|
|
|
* LOAD_PAYLOAD instructions created to construct the payloads of
|
|
|
|
|
|
* e.g. texturing messages in cases where it wasn't possible to CSE the
|
|
|
|
|
|
* whole logical instruction.
|
|
|
|
|
|
*/
|
|
|
|
|
|
OPT(opt_cse);
|
|
|
|
|
|
OPT(register_coalesce);
|
|
|
|
|
|
OPT(compute_to_mrf);
|
|
|
|
|
|
OPT(dead_code_eliminate);
|
|
|
|
|
|
OPT(remove_duplicate_mrf_writes);
|
|
|
|
|
|
OPT(opt_peephole_sel);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-05-03 18:36:02 -07:00
|
|
|
|
OPT(opt_redundant_discard_jumps);
|
2015-02-08 13:59:57 -08:00
|
|
|
|
|
2015-01-16 01:05:21 -08:00
|
|
|
|
if (OPT(lower_load_payload)) {
|
2014-11-13 16:28:18 -08:00
|
|
|
|
split_virtual_grfs();
|
2015-01-16 01:05:21 -08:00
|
|
|
|
OPT(register_coalesce);
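
      /* Running SIMD lowering again here matters on Gen4-5: the fixed
       * fragment shader payload can leave the destination depth at an odd
       * GRF, and the compressed MOVs that lower_load_payload() emits for the
       * render target write payload are illegal when a source region
       * spanning two registers starts at an odd register number, so those
       * MOVs are split back down to SIMD8.
       */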
|
intel: Fix SIMD16 unaligned payload GRF reads on Gen4-5.
When the SIMD16 Gen4-5 fragment shader payload contains source depth
(g2-3), destination stencil (g4), and destination depth (g5-6), the
single register of stencil makes the destination depth unaligned.
We were generating this instruction in the RT write payload setup:
mov(16) m14<1>F g5<8,8,1>F { align1 compr };
which is illegal, instructions with a source region spanning more than
one register need to be aligned to even registers. This is because the
hardware implicitly does (nr | 1) instead of (nr + 1) when splitting the
compressed instruction into two mov(8)'s.
I believe this would cause the hardware to load g5 twice, replicating
subspan 0-1's destination depth to subspan 2-3. This showed up as 2x2
artifact blocks in both TIS-100 and Reicast.
Normally, we rely on the register allocator to even-align our virtual
GRFs. But we don't control the payload, so we need to lower SIMD widths
to make it work. To fix this, we teach lower_simd_width about the
restriction, and then call it again after lower_load_payload (which is
what generates the offending MOV).
Fixes: 8aee87fe4cce0a883867df3546db0e0a36908086 (i965: Use SIMD16 instead of SIMD8 on Gen4 when possible.)
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=107212
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=13728
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
Tested-by: Diego Viola <diego.viola@gmail.com>
2018-08-02 15:02:18 -07:00
|
|
|
|
OPT(lower_simd_width);
|
2015-01-16 01:05:21 -08:00
|
|
|
|
OPT(compute_to_mrf);
|
|
|
|
|
|
OPT(dead_code_eliminate);
|
2014-11-13 16:28:18 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-02-12 11:00:46 -08:00
|
|
|
|
OPT(opt_combine_constants);
|
2015-05-11 09:29:56 -07:00
|
|
|
|
OPT(lower_integer_multiplication);
|
2014-02-12 11:00:46 -08:00
|
|
|
|
|
2016-02-11 12:27:02 -08:00
|
|
|
|
if (devinfo->gen <= 5 && OPT(lower_minmax)) {
|
|
|
|
|
|
OPT(opt_cmod_propagation);
|
|
|
|
|
|
OPT(opt_cse);
|
2016-11-28 10:45:08 -08:00
|
|
|
|
OPT(opt_copy_propagation);
|
2016-02-11 12:27:02 -08:00
|
|
|
|
OPT(dead_code_eliminate);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2018-12-07 23:56:27 -08:00
|
|
|
|
if (OPT(lower_regioning)) {
|
2017-01-20 08:47:05 +01:00
|
|
|
|
OPT(opt_copy_propagation);
|
|
|
|
|
|
OPT(dead_code_eliminate);
|
|
|
|
|
|
OPT(lower_simd_width);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2018-11-15 21:05:08 -06:00
|
|
|
|
OPT(fixup_sends_duplicate_payload);
|
|
|
|
|
|
|
2014-11-13 16:28:18 -08:00
|
|
|
|
lower_uniform_pull_constant_loads();
|
2015-07-02 15:41:02 -07:00
|
|
|
|
|
|
|
|
|
|
validate();
|
2014-11-13 16:28:18 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2018-11-15 21:05:08 -06:00
|
|
|
|
/**
|
|
|
|
|
|
* From the Skylake PRM Vol. 2a docs for sends:
|
|
|
|
|
|
*
|
|
|
|
|
|
* "It is required that the second block of GRFs does not overlap with the
|
|
|
|
|
|
* first block."
|
|
|
|
|
|
*
|
|
|
|
|
|
* There are plenty of cases where we may accidentally violate this due to
|
|
|
|
|
|
* having, for instance, both sources be the constant 0. This little pass
|
|
|
|
|
|
* just adds a new vgrf for the second payload and copies it over.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::fixup_sends_duplicate_payload()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
|
|
|
|
|
|
if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 &&
|
|
|
|
|
|
regions_overlap(inst->src[2], inst->mlen * REG_SIZE,
|
|
|
|
|
|
inst->src[3], inst->ex_mlen * REG_SIZE)) {
|
|
|
|
|
|
fs_reg tmp = fs_reg(VGRF, alloc.allocate(inst->ex_mlen),
|
|
|
|
|
|
BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
/* Sadly, we've lost all notion of channels and bit sizes at this
|
|
|
|
|
|
* point. Just WE_all it.
|
|
|
|
|
|
*/
|
|
|
|
|
|
const fs_builder ibld = bld.at(block, inst).exec_all().group(16, 0);
|
|
|
|
|
|
fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
fs_reg copy_dst = tmp;
|
|
|
|
|
|
for (unsigned i = 0; i < inst->ex_mlen; i += 2) {
|
|
|
|
|
|
if (inst->ex_mlen == i + 1) {
|
|
|
|
|
|
/* Only one register left; do SIMD8 */
|
|
|
|
|
|
ibld.group(8, 0).MOV(copy_dst, copy_src);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
ibld.MOV(copy_dst, copy_src);
|
|
|
|
|
|
}
|
|
|
|
|
|
copy_src = offset(copy_src, ibld, 1);
|
|
|
|
|
|
copy_dst = offset(copy_dst, ibld, 1);
|
|
|
|
|
|
}
|
|
|
|
|
|
inst->src[3] = tmp;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
|
invalidate_live_intervals();
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-12-29 20:33:12 -08:00
|
|
|
|
/**
|
|
|
|
|
|
 * Three-source instructions must have a GRF/MRF destination register.
|
|
|
|
|
|
* ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::fixup_3src_null_dest()
|
|
|
|
|
|
{
|
2016-03-11 15:27:22 -08:00
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
2014-12-29 20:33:12 -08:00
|
|
|
|
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
|
2016-04-28 00:19:13 -07:00
|
|
|
|
if (inst->is_3src(devinfo) && inst->dst.is_null()) {
|
2015-10-26 17:09:25 -07:00
|
|
|
|
inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
|
2014-12-29 20:33:12 -08:00
|
|
|
|
inst->dst.type);
|
2016-03-11 15:27:22 -08:00
|
|
|
|
progress = true;
|
2014-12-29 20:33:12 -08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2016-03-11 15:27:22 -08:00
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
|
invalidate_live_intervals();
|
2014-12-29 20:33:12 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
|
void
|
2017-08-21 19:16:45 -07:00
|
|
|
|
fs_visitor::allocate_registers(unsigned min_dispatch_width, bool allow_spilling)
|
2014-11-13 16:28:19 -08:00
|
|
|
|
{
|
2019-05-09 14:44:16 -05:00
|
|
|
|
bool allocated;
|
2014-11-13 16:28:19 -08:00
|
|
|
|
|
2014-12-19 12:55:13 -08:00
|
|
|
|
static const enum instruction_scheduler_mode pre_modes[] = {
|
2014-11-13 16:28:19 -08:00
|
|
|
|
SCHEDULE_PRE,
|
|
|
|
|
|
SCHEDULE_PRE_NON_LIFO,
|
|
|
|
|
|
SCHEDULE_PRE_LIFO,
|
|
|
|
|
|
};
|
|
|
|
|
|
|
2016-10-17 14:12:28 -07:00
|
|
|
|
static const char *scheduler_mode_name[] = {
|
|
|
|
|
|
"top-down",
|
|
|
|
|
|
"non-lifo",
|
|
|
|
|
|
"lifo"
|
|
|
|
|
|
};
|
|
|
|
|
|
|
2016-05-16 14:30:25 -07:00
|
|
|
|
bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);
|
|
|
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
|
/* Try each scheduling heuristic to see if it can successfully register
|
|
|
|
|
|
* allocate without spilling. They should be ordered by decreasing
|
|
|
|
|
|
* performance but increasing likelihood of allocating.
|
|
|
|
|
|
*/
|
|
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
|
|
|
|
|
|
schedule_instructions(pre_modes[i]);
|
2016-10-17 14:12:28 -07:00
|
|
|
|
this->shader_stats.scheduler_mode = scheduler_mode_name[i];
|
2014-11-13 16:28:19 -08:00
|
|
|
|
|
|
|
|
|
|
if (0) {
|
|
|
|
|
|
assign_regs_trivial();
|
2019-05-09 14:44:16 -05:00
|
|
|
|
allocated = true;
|
|
|
|
|
|
break;
|
2014-11-13 16:28:19 -08:00
|
|
|
|
}
|
2019-05-09 14:44:16 -05:00
|
|
|
|
|
|
|
|
|
|
/* We only allow spilling for the last schedule mode and only if the
|
|
|
|
|
|
* allow_spilling parameter and dispatch width work out ok.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool can_spill = allow_spilling &&
|
|
|
|
|
|
(i == ARRAY_SIZE(pre_modes) - 1) &&
|
|
|
|
|
|
dispatch_width == min_dispatch_width;
|
|
|
|
|
|
|
|
|
|
|
|
/* We should only spill registers on the last scheduling. */
|
|
|
|
|
|
assert(!spilled_any_registers);
|
|
|
|
|
|
|
2019-05-07 18:14:46 -05:00
|
|
|
|
allocated = assign_regs(can_spill, spill_all);
|
2019-05-09 14:44:16 -05:00
|
|
|
|
if (allocated)
|
2014-11-13 16:28:19 -08:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-05-09 14:44:16 -05:00
|
|
|
|
if (!allocated) {
|
2016-09-08 14:08:02 -07:00
|
|
|
|
if (!allow_spilling)
|
|
|
|
|
|
fail("Failure to register allocate and spilling is not allowed.");
|
|
|
|
|
|
|
2014-11-13 16:28:19 -08:00
|
|
|
|
/* We assume that any spilling is worse than just dropping back to
|
|
|
|
|
|
* SIMD8. There's probably actually some intermediate point where
|
|
|
|
|
|
* SIMD16 with a couple of spills is still better.
|
|
|
|
|
|
*/
|
2016-05-18 13:52:25 -07:00
|
|
|
|
if (dispatch_width > min_dispatch_width) {
|
2014-11-13 16:28:19 -08:00
|
|
|
|
fail("Failure to register allocate. Reduce number of "
|
|
|
|
|
|
"live scalar values to avoid this.");
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-05-09 14:44:16 -05:00
|
|
|
|
/* If we failed to allocate, we must have a reason */
|
|
|
|
|
|
assert(failed);
|
|
|
|
|
|
} else if (spilled_any_registers) {
|
|
|
|
|
|
compiler->shader_perf_log(log_data,
|
|
|
|
|
|
"%s shader triggered register spilling. "
|
|
|
|
|
|
"Try reducing the number of live scalar "
|
|
|
|
|
|
"values to improve performance.\n",
|
|
|
|
|
|
stage_name);
|
2014-11-13 16:28:19 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* This must come after all optimization and register allocation, since
|
|
|
|
|
|
* it inserts dead code that happens to have side effects, and it does
|
|
|
|
|
|
* so based on the actual physical registers in use.
|
|
|
|
|
|
*/
|
|
|
|
|
|
insert_gen4_send_dependency_workarounds();
|
|
|
|
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
2017-06-15 15:23:57 -07:00
|
|
|
|
opt_bank_conflicts();
|
|
|
|
|
|
|
2015-06-06 13:32:21 -04:00
|
|
|
|
schedule_instructions(SCHEDULE_POST);
|
2014-11-13 16:28:19 -08:00
|
|
|
|
|
2016-06-09 16:56:31 -07:00
|
|
|
|
if (last_scratch > 0) {
|
2019-06-19 12:47:19 +01:00
|
|
|
|
ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
|
2016-06-09 16:56:31 -07:00
|
|
|
|
|
2016-06-13 23:09:31 -07:00
|
|
|
|
prog_data->total_scratch = brw_get_scratch_size(last_scratch);
|
2016-06-09 18:13:26 -07:00
|
|
|
|
|
2016-06-13 23:09:31 -07:00
|
|
|
|
if (stage == MESA_SHADER_COMPUTE) {
|
|
|
|
|
|
if (devinfo->is_haswell) {
|
|
|
|
|
|
/* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
|
|
|
|
|
|
* field documentation, Haswell supports a minimum of 2kB of
|
|
|
|
|
|
* scratch space for compute shaders, unlike every other stage
|
|
|
|
|
|
* and platform.
|
|
|
|
|
|
*/
|
|
|
|
|
|
prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
|
|
|
|
|
|
} else if (devinfo->gen <= 7) {
|
|
|
|
|
|
/* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
|
|
|
|
|
|
* field documentation, platforms prior to Haswell measure scratch
|
|
|
|
|
|
* size linearly with a range of [1kB, 12kB] and 1kB granularity.
|
|
|
|
|
|
*/
|
|
|
|
|
|
prog_data->total_scratch = ALIGN(last_scratch, 1024);
|
|
|
|
|
|
max_scratch_size = 12 * 1024;
|
|
|
|
|
|
}
|
2016-06-09 16:56:31 -07:00
|
|
|
|
}
|
2016-06-09 18:13:26 -07:00
|
|
|
|
|
|
|
|
|
|
/* We currently only support up to 2MB of scratch space. If we
|
|
|
|
|
|
* need to support more eventually, the documentation suggests
|
|
|
|
|
|
* that we could allocate a larger buffer, and partition it out
|
|
|
|
|
|
* ourselves. We'd just have to undo the hardware's address
|
|
|
|
|
|
* calculation by subtracting (FFTID * Per Thread Scratch Space)
|
|
|
|
|
|
* and then add FFTID * (Larger Per Thread Scratch Space).
|
|
|
|
|
|
*
|
|
|
|
|
|
* See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
|
|
|
|
|
|
* Thread Group Tracking > Local Memory/Scratch Space.
|
|
|
|
|
|
*/
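      /* A hypothetical sketch of that adjustment (the numbers below are
       * illustrative, not taken from the documentation): program the
       * hardware with 2MB of per-thread scratch but allocate 4MB per
       * thread ourselves.  The hardware computes (base + FFTID * 2MB);
       * the compiler would then subtract FFTID * 2MB and add FFTID * 4MB
       * so that each thread lands in its own private 4MB slot.
       */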
|
2016-06-13 23:09:31 -07:00
|
|
|
|
assert(prog_data->total_scratch < max_scratch_size);
|
2016-06-09 16:56:31 -07:00
|
|
|
|
}
|
2018-11-09 14:13:37 -08:00
|
|
|
|
|
|
|
|
|
|
lower_scoreboard();
|
2014-11-13 16:28:19 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-10-27 22:42:50 -07:00
|
|
|
|
bool
|
2017-09-28 16:25:31 -07:00
|
|
|
|
fs_visitor::run_vs()
|
2014-10-27 22:42:50 -07:00
|
|
|
|
{
|
|
|
|
|
|
assert(stage == MESA_SHADER_VERTEX);
|
|
|
|
|
|
|
|
|
|
|
|
setup_vs_payload();
|
|
|
|
|
|
|
2015-06-19 15:40:09 -07:00
|
|
|
|
if (shader_time_index >= 0)
|
2014-10-27 22:42:50 -07:00
|
|
|
|
emit_shader_time_begin();
|
|
|
|
|
|
|
2015-05-20 10:03:50 -07:00
|
|
|
|
emit_nir_code();
|
2015-03-09 01:58:59 -07:00
|
|
|
|
|
2014-10-27 22:42:50 -07:00
|
|
|
|
if (failed)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
2015-06-26 15:05:13 -07:00
|
|
|
|
emit_urb_writes();
|
2014-10-27 22:42:50 -07:00
|
|
|
|
|
2015-06-19 15:40:09 -07:00
|
|
|
|
if (shader_time_index >= 0)
|
2015-04-12 03:52:39 -07:00
|
|
|
|
emit_shader_time_end();
|
|
|
|
|
|
|
2015-02-13 10:34:39 -08:00
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
|
2014-10-27 22:42:50 -07:00
|
|
|
|
optimize();
|
|
|
|
|
|
|
|
|
|
|
|
assign_curb_setup();
|
|
|
|
|
|
assign_vs_urb_setup();
|
|
|
|
|
|
|
2014-12-29 20:33:12 -08:00
|
|
|
|
fixup_3src_null_dest();
|
2017-08-21 19:16:45 -07:00
|
|
|
|
allocate_registers(8, true);
|
2015-11-10 14:35:27 -08:00
|
|
|
|
|
|
|
|
|
|
return !failed;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-05-03 14:20:00 -07:00
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::set_tcs_invocation_id()
|
2015-11-14 17:40:43 -08:00
|
|
|
|
{
|
2016-09-08 23:48:51 -07:00
|
|
|
|
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
|
intel/compiler: Implement TCS 8_PATCH mode and INTEL_DEBUG=tcs8
Our tessellation control shaders can be dispatched in several modes.
- SINGLE_PATCH (Gen7+) processes a single patch per thread, with each
channel corresponding to a different patch vertex. PATCHLIST_N will
launch (N / 8) threads. If N is less than 8, some channels will be
disabled, leaving some untapped hardware capabilities. Conditionals
based on gl_InvocationID are non-uniform, which means that they'll
often have to execute both paths. However, if there are fewer than
8 vertices, all invocations will happen within a single thread, so
barriers can become no-ops, which is nice. We also burn a maximum
of 4 registers for ICP handles, so we can compile without regard for
the value of N. It also works in all cases.
- DUAL_PATCH mode processes up to two patches at a time, where the first
four channels come from patch 1, and the second group of four come
from patch 2. This tries to provide better EU utilization for small
patches (N <= 4). It cannot be used in all cases.
- 8_PATCH mode processes 8 patches at a time, with a thread launched per
vertex in the patch. Each channel corresponds to the same vertex, but
in each of the 8 patches. This utilizes all channels even for small
patches. It also makes conditions on gl_InvocationID uniform, leading
to proper jumps. Barriers, unfortunately, become real. Worse, for
PATCHLIST_N, the thread payload burns N registers for ICP handles.
This can burn up to 32 registers, or 1/4 of our register file, for
URB handles. For Vulkan (and DX), we know the number of vertices at
compile time, so we can limit the amount of waste. In GL, the patch
dimension is dynamic state, so we either would have to waste all 32
(not reasonable) or guess (badly) and recompile. This is unfortunate.
Because we can only spawn 16 thread instances, we can only use this
mode for PATCHLIST_16 and smaller. The rest must use SINGLE_PATCH.
This patch implements the new 8_PATCH TCS mode, but leaves us using
SINGLE_PATCH by default. A new INTEL_DEBUG=tcs8 flag will switch to
using 8_PATCH mode for testing and benchmarking purposes. We may
want to consider using 8_PATCH mode in Vulkan in some cases.
The data I've seen shows that 8_PATCH mode can be more efficient in
some cases, but SINGLE_PATCH mode (the one we use today) is faster
in other cases. Ultimately, the TES matters much more than the TCS
for performance, so the decision may not matter much.
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2019-05-03 14:57:54 -07:00
|
|
|
|
struct brw_vue_prog_data *vue_prog_data = &tcs_prog_data->base;
|
2015-11-14 17:40:43 -08:00
|
|
|
|
|
2019-05-03 14:24:49 -07:00
|
|
|
|
const unsigned instance_id_mask =
|
2019-05-03 14:20:00 -07:00
|
|
|
|
devinfo->gen >= 11 ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
|
2019-05-03 14:24:49 -07:00
|
|
|
|
const unsigned instance_id_shift =
|
2019-05-03 14:20:00 -07:00
|
|
|
|
devinfo->gen >= 11 ? 16 : 17;
|
2015-11-14 17:40:43 -08:00
|
|
|
|
|
2019-05-03 14:57:54 -07:00
|
|
|
|
/* Get instance number from g0.2 bits 22:16 or 23:17 */
|
|
|
|
|
|
fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
|
|
|
|
|
|
brw_imm_ud(instance_id_mask));
|
|
|
|
|
|
|
|
|
|
|
|
invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
|
|
|
|
|
|
if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH) {
|
|
|
|
|
|
/* gl_InvocationID is just the thread number */
|
|
|
|
|
|
bld.SHR(invocation_id, t, brw_imm_ud(instance_id_shift));
|
|
|
|
|
|
return;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH);
|
|
|
|
|
|
|
2015-11-14 17:40:43 -08:00
|
|
|
|
fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
|
|
|
|
|
|
fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
|
|
|
|
|
bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
|
|
|
|
|
|
bld.MOV(channels_ud, channels_uw);
|
|
|
|
|
|
|
|
|
|
|
|
if (tcs_prog_data->instances == 1) {
|
|
|
|
|
|
invocation_id = channels_ud;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
2019-05-03 14:24:49 -07:00
|
|
|
|
bld.SHR(instance_times_8, t, brw_imm_ud(instance_id_shift - 3));
|
2015-11-14 17:40:43 -08:00
|
|
|
|
bld.ADD(invocation_id, instance_times_8, channels_ud);
|
|
|
|
|
|
}
|
2019-05-03 14:20:00 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
bool
|
2019-05-03 14:57:54 -07:00
|
|
|
|
fs_visitor::run_tcs()
|
2019-05-03 14:20:00 -07:00
|
|
|
|
{
|
|
|
|
|
|
assert(stage == MESA_SHADER_TESS_CTRL);
|
|
|
|
|
|
|
2019-05-03 14:57:54 -07:00
|
|
|
|
struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
|
|
|
|
|
|
struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
|
|
|
|
|
|
struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
|
|
|
|
|
|
|
|
|
|
|
|
assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH ||
|
|
|
|
|
|
vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
|
|
|
|
|
|
|
|
|
|
|
|
if (vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH) {
|
|
|
|
|
|
/* r1-r4 contain the ICP handles. */
|
|
|
|
|
|
payload.num_regs = 5;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
assert(vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_8_PATCH);
|
|
|
|
|
|
assert(tcs_key->input_vertices > 0);
|
|
|
|
|
|
/* r1 contains output handles, r2 may contain primitive ID, then the
|
|
|
|
|
|
* ICP handles occupy the next 1-32 registers.
|
|
|
|
|
|
*/
|
|
|
|
|
|
payload.num_regs = 2 + tcs_prog_data->include_primitive_id +
|
|
|
|
|
|
tcs_key->input_vertices;
|
|
|
|
|
|
}
|
2019-05-03 14:20:00 -07:00
|
|
|
|
|
|
|
|
|
|
if (shader_time_index >= 0)
|
|
|
|
|
|
emit_shader_time_begin();
|
|
|
|
|
|
|
|
|
|
|
|
/* Initialize gl_InvocationID */
|
|
|
|
|
|
set_tcs_invocation_id();
|
2015-11-14 17:40:43 -08:00
|
|
|
|
|
2019-05-03 14:28:51 -07:00
|
|
|
|
const bool fix_dispatch_mask =
|
2019-05-03 14:57:54 -07:00
|
|
|
|
vue_prog_data->dispatch_mode == DISPATCH_MODE_TCS_SINGLE_PATCH &&
|
2019-05-03 14:28:51 -07:00
|
|
|
|
(nir->info.tess.tcs_vertices_out % 8) != 0;
|
|
|
|
|
|
|
2015-11-14 17:40:43 -08:00
|
|
|
|
   /* Fix the dispatch mask */
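   /* Illustrative example (an assumed shader, not taken from this file): in
    * SINGLE_PATCH mode with layout(vertices = 3), only channels 0-2
    * correspond to real invocations, so the CMP/IF below predicates the
    * shader body on invocation_id < tcs_vertices_out.
    */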
|
2019-05-03 14:28:51 -07:00
|
|
|
|
if (fix_dispatch_mask) {
|
2015-11-14 17:40:43 -08:00
|
|
|
|
bld.CMP(bld.null_reg_ud(), invocation_id,
|
2017-05-08 09:20:21 -07:00
|
|
|
|
brw_imm_ud(nir->info.tess.tcs_vertices_out), BRW_CONDITIONAL_L);
|
2015-11-14 17:40:43 -08:00
|
|
|
|
bld.IF(BRW_PREDICATE_NORMAL);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
emit_nir_code();
|
|
|
|
|
|
|
2019-05-03 14:28:51 -07:00
|
|
|
|
if (fix_dispatch_mask) {
|
2015-11-14 17:40:43 -08:00
|
|
|
|
bld.emit(BRW_OPCODE_ENDIF);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Emit EOT write; set TR DS Cache bit */
|
|
|
|
|
|
fs_reg srcs[3] = {
|
2019-05-03 14:57:54 -07:00
|
|
|
|
fs_reg(get_tcs_output_urb_handle()),
|
2015-11-14 17:40:43 -08:00
|
|
|
|
fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
|
|
|
|
|
|
fs_reg(brw_imm_ud(0)),
|
|
|
|
|
|
};
|
|
|
|
|
|
fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
|
|
|
|
|
|
bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
|
|
|
|
|
|
|
|
|
|
|
|
fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
|
|
|
|
|
|
bld.null_reg_ud(), payload);
|
|
|
|
|
|
inst->mlen = 3;
|
|
|
|
|
|
inst->eot = true;
|
|
|
|
|
|
|
|
|
|
|
|
if (shader_time_index >= 0)
|
|
|
|
|
|
emit_shader_time_end();
|
|
|
|
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
|
|
|
|
|
|
optimize();
|
|
|
|
|
|
|
|
|
|
|
|
assign_curb_setup();
|
2019-05-03 14:57:54 -07:00
|
|
|
|
assign_tcs_urb_setup();
|
2015-11-14 17:40:43 -08:00
|
|
|
|
|
|
|
|
|
|
fixup_3src_null_dest();
|
2017-08-21 19:16:45 -07:00
|
|
|
|
allocate_registers(8, true);
|
2015-11-14 17:40:43 -08:00
|
|
|
|
|
|
|
|
|
|
return !failed;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-11-10 14:35:27 -08:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::run_tes()
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(stage == MESA_SHADER_TESS_EVAL);
|
|
|
|
|
|
|
|
|
|
|
|
/* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
|
|
|
|
|
|
payload.num_regs = 5;
|
|
|
|
|
|
|
|
|
|
|
|
if (shader_time_index >= 0)
|
|
|
|
|
|
emit_shader_time_begin();
|
|
|
|
|
|
|
|
|
|
|
|
emit_nir_code();
|
|
|
|
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
emit_urb_writes();
|
|
|
|
|
|
|
|
|
|
|
|
if (shader_time_index >= 0)
|
|
|
|
|
|
emit_shader_time_end();
|
|
|
|
|
|
|
|
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
|
|
|
|
|
|
optimize();
|
|
|
|
|
|
|
|
|
|
|
|
assign_curb_setup();
|
|
|
|
|
|
assign_tes_urb_setup();
|
|
|
|
|
|
|
|
|
|
|
|
fixup_3src_null_dest();
|
2017-08-21 19:16:45 -07:00
|
|
|
|
allocate_registers(8, true);
|
2014-10-27 22:42:50 -07:00
|
|
|
|
|
|
|
|
|
|
return !failed;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-03-11 23:14:31 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::run_gs()
|
|
|
|
|
|
{
|
|
|
|
|
|
assert(stage == MESA_SHADER_GEOMETRY);
|
|
|
|
|
|
|
|
|
|
|
|
setup_gs_payload();
|
|
|
|
|
|
|
|
|
|
|
|
this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
|
|
|
|
|
|
|
|
|
|
|
|
if (gs_compile->control_data_header_size_bits > 0) {
|
|
|
|
|
|
/* Create a VGRF to store accumulated control data bits. */
|
|
|
|
|
|
this->control_data_bits = vgrf(glsl_type::uint_type);
|
|
|
|
|
|
|
|
|
|
|
|
/* If we're outputting more than 32 control data bits, then EmitVertex()
|
|
|
|
|
|
* will set control_data_bits to 0 after emitting the first vertex.
|
|
|
|
|
|
* Otherwise, we need to initialize it to 0 here.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (gs_compile->control_data_header_size_bits <= 32) {
|
|
|
|
|
|
const fs_builder abld = bld.annotate("initialize control data bits");
|
2015-11-02 11:26:16 -08:00
|
|
|
|
abld.MOV(this->control_data_bits, brw_imm_ud(0u));
|
2015-03-11 23:14:31 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (shader_time_index >= 0)
|
|
|
|
|
|
emit_shader_time_begin();
|
|
|
|
|
|
|
|
|
|
|
|
emit_nir_code();
|
|
|
|
|
|
|
|
|
|
|
|
emit_gs_thread_end();
|
|
|
|
|
|
|
|
|
|
|
|
if (shader_time_index >= 0)
|
|
|
|
|
|
emit_shader_time_end();
|
|
|
|
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
|
|
|
|
|
|
optimize();
|
|
|
|
|
|
|
|
|
|
|
|
assign_curb_setup();
|
|
|
|
|
|
assign_gs_urb_setup();
|
|
|
|
|
|
|
|
|
|
|
|
fixup_3src_null_dest();
|
2017-08-21 19:16:45 -07:00
|
|
|
|
allocate_registers(8, true);
|
2015-03-11 23:14:31 -07:00
|
|
|
|
|
|
|
|
|
|
return !failed;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2017-10-25 16:50:11 +03:00
|
|
|
|
/* From the SKL PRM, Volume 16, Workarounds:
|
|
|
|
|
|
*
|
|
|
|
|
|
* 0877 3D Pixel Shader Hang possible when pixel shader dispatched with
|
|
|
|
|
|
* only header phases (R0-R2)
|
|
|
|
|
|
*
|
|
|
|
|
|
* WA: Enable a non-header phase (e.g. push constant) when dispatch would
|
|
|
|
|
|
* have been header only.
|
|
|
|
|
|
*
|
|
|
|
|
|
 * Instead of enabling push constants, one can alternatively enable one of the
|
|
|
|
|
|
 * inputs. Here one simply chooses "layer", which shouldn't impose much
|
|
|
|
|
|
* overhead.
|
|
|
|
|
|
*/
|
|
|
|
|
|
static void
|
|
|
|
|
|
gen9_ps_header_only_workaround(struct brw_wm_prog_data *wm_prog_data)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (wm_prog_data->num_varying_inputs)
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
|
|
if (wm_prog_data->base.curb_read_length)
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
|
|
wm_prog_data->urb_setup[VARYING_SLOT_LAYER] = 0;
|
|
|
|
|
|
wm_prog_data->num_varying_inputs = 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
bool
|
2016-05-16 14:30:25 -07:00
|
|
|
|
fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
|
2010-08-26 12:12:00 -07:00
|
|
|
|
{
|
2016-09-08 23:48:51 -07:00
|
|
|
|
struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
|
2014-10-27 23:36:31 -07:00
|
|
|
|
brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
|
|
|
|
|
|
|
|
|
|
|
|
assert(stage == MESA_SHADER_FRAGMENT);
|
|
|
|
|
|
|
2015-04-15 18:00:05 -07:00
|
|
|
|
if (devinfo->gen >= 6)
|
2016-02-10 21:20:01 -08:00
|
|
|
|
setup_fs_payload_gen6();
|
2012-11-13 19:36:18 -08:00
|
|
|
|
else
|
2016-02-10 21:20:01 -08:00
|
|
|
|
setup_fs_payload_gen4();
|
2010-08-26 12:12:00 -07:00
|
|
|
|
|
2010-08-15 18:58:58 -07:00
|
|
|
|
if (0) {
|
2011-03-11 19:19:01 -08:00
|
|
|
|
emit_dummy_fs();
|
2015-06-19 17:25:28 -07:00
|
|
|
|
} else if (do_rep_send) {
|
|
|
|
|
|
assert(dispatch_width == 16);
|
2014-09-26 14:47:03 -07:00
|
|
|
|
emit_repclear_shader();
|
2010-08-15 18:58:58 -07:00
|
|
|
|
} else {
|
2015-06-19 15:40:09 -07:00
|
|
|
|
if (shader_time_index >= 0)
|
2012-11-27 14:10:52 -08:00
|
|
|
|
emit_shader_time_begin();
|
|
|
|
|
|
|
2017-05-08 09:20:21 -07:00
|
|
|
|
if (nir->info.inputs_read > 0 ||
|
2019-07-18 09:59:44 -05:00
|
|
|
|
(nir->info.system_values_read & (1ull << SYSTEM_VALUE_FRAG_COORD)) ||
|
2017-05-08 09:20:21 -07:00
|
|
|
|
(nir->info.outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
|
2015-04-15 18:00:05 -07:00
|
|
|
|
if (devinfo->gen < 6)
|
2013-10-19 21:27:37 -07:00
|
|
|
|
emit_interpolation_setup_gen4();
|
|
|
|
|
|
else
|
|
|
|
|
|
emit_interpolation_setup_gen6();
|
|
|
|
|
|
}
|
2010-08-16 21:53:02 -07:00
|
|
|
|
|
2012-12-06 12:15:13 -08:00
|
|
|
|
/* We handle discards by keeping track of the still-live pixels in f0.1.
|
|
|
|
|
|
* Initialize it with the dispatched pixels.
|
|
|
|
|
|
*/
|
2014-10-27 23:36:31 -07:00
|
|
|
|
if (wm_prog_data->uses_kill) {
|
2018-05-23 18:09:48 -07:00
|
|
|
|
const fs_reg dispatch_mask =
|
|
|
|
|
|
devinfo->gen >= 6 ? brw_vec1_grf(1, 7) : brw_vec1_grf(0, 0);
|
|
|
|
|
|
bld.exec_all().group(1, 0)
|
|
|
|
|
|
.MOV(retype(brw_flag_reg(0, 1), BRW_REGISTER_TYPE_UW),
|
|
|
|
|
|
retype(dispatch_mask, BRW_REGISTER_TYPE_UW));
|
2012-12-06 12:15:13 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2015-05-20 10:03:50 -07:00
|
|
|
|
emit_nir_code();
|
|
|
|
|
|
|
2011-06-10 16:00:03 -07:00
|
|
|
|
if (failed)
|
|
|
|
|
|
return false;
|
2010-08-15 18:58:58 -07:00
|
|
|
|
|
2015-04-10 10:04:55 -07:00
|
|
|
|
if (wm_prog_data->uses_kill)
|
2015-06-03 20:45:54 +03:00
|
|
|
|
bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
|
2013-03-27 23:19:39 -07:00
|
|
|
|
|
2014-10-27 23:36:31 -07:00
|
|
|
|
if (wm_key->alpha_test_func)
|
2013-10-27 12:32:03 +13:00
|
|
|
|
emit_alpha_test();
|
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
emit_fb_writes();
|
2010-10-13 20:17:15 -07:00
|
|
|
|
|
2015-06-19 15:40:09 -07:00
|
|
|
|
if (shader_time_index >= 0)
|
2015-02-26 22:55:54 -08:00
|
|
|
|
emit_shader_time_end();
|
|
|
|
|
|
|
2015-02-13 10:34:39 -08:00
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
|
2014-11-13 16:28:18 -08:00
|
|
|
|
optimize();
|
2013-02-15 19:26:48 -08:00
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
assign_curb_setup();
|
2017-10-25 16:50:11 +03:00
|
|
|
|
|
|
|
|
|
|
if (devinfo->gen >= 9)
|
|
|
|
|
|
gen9_ps_header_only_workaround(wm_prog_data);
|
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
assign_urb_setup();
|
2011-01-18 22:03:34 -08:00
|
|
|
|
|
2014-12-29 20:33:12 -08:00
|
|
|
|
fixup_3src_null_dest();
|
2017-08-21 19:16:45 -07:00
|
|
|
|
allocate_registers(8, allow_spilling);
|
2013-02-05 15:46:22 -08:00
|
|
|
|
|
2014-11-13 16:28:17 -08:00
|
|
|
|
if (failed)
|
|
|
|
|
|
return false;
|
2014-05-13 20:51:32 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
return !failed;
|
|
|
|
|
|
}
|
2010-08-26 12:12:00 -07:00
|
|
|
|
|
2014-08-30 19:57:39 -07:00
|
|
|
|
bool
|
2017-08-21 19:16:45 -07:00
|
|
|
|
fs_visitor::run_cs(unsigned min_dispatch_width)
|
2014-08-30 19:57:39 -07:00
|
|
|
|
{
|
|
|
|
|
|
assert(stage == MESA_SHADER_COMPUTE);
|
2017-08-21 19:16:45 -07:00
|
|
|
|
assert(dispatch_width >= min_dispatch_width);
|
2014-08-30 19:57:39 -07:00
|
|
|
|
|
|
|
|
|
|
setup_cs_payload();
|
|
|
|
|
|
|
2015-06-19 15:40:09 -07:00
|
|
|
|
if (shader_time_index >= 0)
|
2015-04-15 18:27:50 -07:00
|
|
|
|
emit_shader_time_begin();
|
|
|
|
|
|
|
2016-02-20 01:22:08 -08:00
|
|
|
|
if (devinfo->is_haswell && prog_data->total_shared > 0) {
|
|
|
|
|
|
/* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
|
|
|
|
|
|
const fs_builder abld = bld.exec_all().group(1, 0);
|
2016-09-14 15:09:32 -07:00
|
|
|
|
abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
|
2016-02-20 01:22:08 -08:00
|
|
|
|
suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-08-30 19:57:39 -07:00
|
|
|
|
emit_nir_code();
|
|
|
|
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
emit_cs_terminate();
|
|
|
|
|
|
|
2015-06-19 15:40:09 -07:00
|
|
|
|
if (shader_time_index >= 0)
|
2015-04-15 18:27:50 -07:00
|
|
|
|
emit_shader_time_end();
|
|
|
|
|
|
|
2014-08-30 19:57:39 -07:00
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
|
|
|
|
|
|
optimize();
|
|
|
|
|
|
|
|
|
|
|
|
assign_curb_setup();
|
|
|
|
|
|
|
|
|
|
|
|
fixup_3src_null_dest();
|
2017-08-21 19:16:45 -07:00
|
|
|
|
allocate_registers(min_dispatch_width, true);
|
2014-08-30 19:57:39 -07:00
|
|
|
|
|
|
|
|
|
|
if (failed)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
return !failed;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2019-04-11 14:12:58 -05:00
|
|
|
|
static bool
|
|
|
|
|
|
is_used_in_not_interp_frag_coord(nir_ssa_def *def)
|
|
|
|
|
|
{
|
|
|
|
|
|
nir_foreach_use(src, def) {
|
|
|
|
|
|
if (src->parent_instr->type != nir_instr_type_intrinsic)
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src->parent_instr);
|
2019-07-18 09:59:44 -05:00
|
|
|
|
if (intrin->intrinsic != nir_intrinsic_load_frag_coord)
|
2019-04-11 14:12:58 -05:00
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
nir_foreach_if_use(src, def)
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-10-08 16:01:44 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Return a bitfield where bit n is set if barycentric interpolation mode n
|
2016-07-11 16:24:12 -07:00
|
|
|
|
* (see enum brw_barycentric_mode) is needed by the fragment shader.
|
2016-07-12 03:57:25 -07:00
|
|
|
|
*
|
|
|
|
|
|
* We examine the load_barycentric intrinsics rather than looking at input
|
|
|
|
|
|
* variables so that we catch interpolateAtCentroid() messages too, which
|
|
|
|
|
|
* also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
|
2015-10-08 16:01:44 -07:00
|
|
|
|
*/
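/* As a rough sketch of the mapping below (a hypothetical shader, not a case
 * taken from this file): a smooth input read via interpolateAtCentroid()
 * shows up as a load_barycentric_centroid intrinsic with INTERP_MODE_SMOOTH,
 * which sets the BRW_BARYCENTRIC_PERSPECTIVE_CENTROID bit; on hardware with
 * the unlit-centroid workaround the corresponding PIXEL-mode bit is set too.
 */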
|
|
|
|
|
|
static unsigned
|
2016-08-22 15:01:08 -07:00
|
|
|
|
brw_compute_barycentric_interp_modes(const struct gen_device_info *devinfo,
|
2015-10-08 16:01:44 -07:00
|
|
|
|
const nir_shader *shader)
|
|
|
|
|
|
{
|
|
|
|
|
|
unsigned barycentric_interp_modes = 0;
|
|
|
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
|
nir_foreach_function(f, shader) {
|
|
|
|
|
|
if (!f->impl)
|
2015-10-08 16:01:44 -07:00
|
|
|
|
continue;
|
|
|
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
|
nir_foreach_block(block, f->impl) {
|
|
|
|
|
|
nir_foreach_instr(instr, block) {
|
|
|
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
|
|
continue;
|
2016-07-11 15:00:37 -07:00
|
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
2019-04-11 14:12:58 -05:00
|
|
|
|
switch (intrin->intrinsic) {
|
|
|
|
|
|
case nir_intrinsic_load_barycentric_pixel:
|
|
|
|
|
|
case nir_intrinsic_load_barycentric_centroid:
|
|
|
|
|
|
case nir_intrinsic_load_barycentric_sample:
|
|
|
|
|
|
break;
|
|
|
|
|
|
default:
|
2016-07-12 03:57:25 -07:00
|
|
|
|
continue;
|
2019-04-11 14:12:58 -05:00
|
|
|
|
}
|
2016-07-12 03:57:25 -07:00
|
|
|
|
|
|
|
|
|
|
/* Ignore WPOS; it doesn't require interpolation. */
|
2019-04-11 14:12:58 -05:00
|
|
|
|
assert(intrin->dest.is_ssa);
|
|
|
|
|
|
if (!is_used_in_not_interp_frag_coord(&intrin->dest.ssa))
|
2016-07-12 03:57:25 -07:00
|
|
|
|
continue;
|
2016-07-11 15:00:37 -07:00
|
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
|
enum glsl_interp_mode interp = (enum glsl_interp_mode)
|
|
|
|
|
|
nir_intrinsic_interp_mode(intrin);
|
|
|
|
|
|
nir_intrinsic_op bary_op = intrin->intrinsic;
|
|
|
|
|
|
enum brw_barycentric_mode bary =
|
|
|
|
|
|
brw_barycentric_mode(interp, bary_op);
|
2016-07-11 15:00:37 -07:00
|
|
|
|
|
2016-07-12 03:57:25 -07:00
|
|
|
|
barycentric_interp_modes |= 1 << bary;
|
|
|
|
|
|
|
|
|
|
|
|
if (devinfo->needs_unlit_centroid_workaround &&
|
|
|
|
|
|
bary_op == nir_intrinsic_load_barycentric_centroid)
|
|
|
|
|
|
barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2015-10-08 16:01:44 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return barycentric_interp_modes;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2016-04-05 18:19:34 -07:00
|
|
|
|
static void
|
|
|
|
|
|
brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
|
2016-07-07 00:47:18 -07:00
|
|
|
|
const nir_shader *shader)
|
2016-04-05 18:19:34 -07:00
|
|
|
|
{
|
|
|
|
|
|
prog_data->flat_inputs = 0;
|
|
|
|
|
|
|
|
|
|
|
|
nir_foreach_variable(var, &shader->inputs) {
|
2018-07-31 05:31:47 -07:00
|
|
|
|
unsigned slots = glsl_count_attribute_slots(var->type, false);
|
|
|
|
|
|
for (unsigned s = 0; s < slots; s++) {
|
|
|
|
|
|
int input_index = prog_data->urb_setup[var->data.location + s];
|
2016-04-05 18:19:34 -07:00
|
|
|
|
|
2018-07-31 05:31:47 -07:00
|
|
|
|
if (input_index < 0)
|
|
|
|
|
|
continue;
|
2016-04-05 18:19:34 -07:00
|
|
|
|
|
2018-07-31 05:31:47 -07:00
|
|
|
|
/* flat shading */
|
|
|
|
|
|
if (var->data.interpolation == INTERP_MODE_FLAT)
|
|
|
|
|
|
prog_data->flat_inputs |= 1 << input_index;
|
|
|
|
|
|
}
|
2016-04-05 18:19:34 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-10-08 16:01:44 -07:00
|
|
|
|
static uint8_t
|
|
|
|
|
|
computed_depth_mode(const nir_shader *shader)
|
|
|
|
|
|
{
|
2017-05-08 09:20:21 -07:00
|
|
|
|
if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
|
|
|
|
|
|
switch (shader->info.fs.depth_layout) {
|
2015-10-08 16:01:44 -07:00
|
|
|
|
case FRAG_DEPTH_LAYOUT_NONE:
|
|
|
|
|
|
case FRAG_DEPTH_LAYOUT_ANY:
|
|
|
|
|
|
return BRW_PSCDEPTH_ON;
|
|
|
|
|
|
case FRAG_DEPTH_LAYOUT_GREATER:
|
|
|
|
|
|
return BRW_PSCDEPTH_ON_GE;
|
|
|
|
|
|
case FRAG_DEPTH_LAYOUT_LESS:
|
|
|
|
|
|
return BRW_PSCDEPTH_ON_LE;
|
|
|
|
|
|
case FRAG_DEPTH_LAYOUT_UNCHANGED:
|
|
|
|
|
|
return BRW_PSCDEPTH_OFF;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
return BRW_PSCDEPTH_OFF;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
i965: Move load_interpolated_input/barycentric_* intrinsics to the top.
Currently, i965 interpolates all FS inputs at the top of the program.
This has advantages and disadvantages, but I'd like to keep that policy
while reworking this code. We can consider changing it independently.
The next patch will make the compiler generate PLN instructions "on the
fly", when it encounters an input load intrinsic, rather than doing it
for all inputs at the start of the program.
To emulate this behavior, we introduce an ugly pass to move all NIR
load_interpolated_input and payload-based (not interpolator message)
load_barycentric_* intrinsics to the shader's start block.
This helps avoid regressions in shader-db for cases such as:
if (...) {
...load some input...
} else {
...load that same input...
}
which CSE can't handle, because there's no dominance relationship
between the two loads. Because the start block dominates all others,
we can CSE all inputs and emit PLNs exactly once, as we did before.
Ideally, global value numbering would eliminate these redundant loads,
while not forcing them all the way to the start block. When that lands,
we should consider dropping this hacky pass.
Again, this pass currently does nothing, as i965 doesn't generate these
intrinsics yet. But it will shortly, and I figured I'd separate this
code as it's relatively self-contained.
v2: Dramatically simplify pass - instead of creating new instructions,
just remove/re-insert their list nodes (suggested by Jason Ekstrand).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisforbes@google.com> [v1]
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-17 18:44:58 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Move load_interpolated_input with simple (payload-based) barycentric modes
|
|
|
|
|
|
* to the top of the program so we don't emit multiple PLNs for the same input.
|
|
|
|
|
|
*
|
|
|
|
|
|
* This works around CSE not being able to handle non-dominating cases
|
|
|
|
|
|
* such as:
|
|
|
|
|
|
*
|
|
|
|
|
|
* if (...) {
|
|
|
|
|
|
* interpolate input
|
|
|
|
|
|
* } else {
|
|
|
|
|
|
* interpolate the same exact input
|
|
|
|
|
|
* }
|
|
|
|
|
|
*
|
|
|
|
|
|
* This should be replaced by global value numbering someday.
|
|
|
|
|
|
*/
|
2017-03-09 11:05:08 -08:00
|
|
|
|
static bool
|
2016-07-17 18:44:58 -07:00
|
|
|
|
move_interpolation_to_top(nir_shader *nir)
|
|
|
|
|
|
{
|
2017-03-09 11:05:08 -08:00
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
2016-07-17 18:44:58 -07:00
|
|
|
|
nir_foreach_function(f, nir) {
|
|
|
|
|
|
if (!f->impl)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
nir_block *top = nir_start_block(f->impl);
|
2016-07-26 13:19:46 -07:00
|
|
|
|
exec_node *cursor_node = NULL;
|
2016-07-17 18:44:58 -07:00
|
|
|
|
|
|
|
|
|
|
nir_foreach_block(block, f->impl) {
|
|
|
|
|
|
if (block == top)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
2016-07-26 13:19:46 -07:00
|
|
|
|
nir_foreach_instr_safe(instr, block) {
|
2016-07-17 18:44:58 -07:00
|
|
|
|
if (instr->type != nir_instr_type_intrinsic)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
|
2016-07-26 13:19:46 -07:00
|
|
|
|
if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
|
i965: Move load_interpolated_input/barycentric_* intrinsics to the top.
Currently, i965 interpolates all FS inputs at the top of the program.
This has advantages and disadvantages, but I'd like to keep that policy
while reworking this code. We can consider changing it independently.
The next patch will make the compiler generate PLN instructions "on the
fly", when it encounters an input load intrinsic, rather than doing it
for all inputs at the start of the program.
To emulate this behavior, we introduce an ugly pass to move all NIR
load_interpolated_input and payload-based (not interpolator message)
load_barycentric_* intrinsics to the shader's start block.
This helps avoid regressions in shader-db for cases such as:
if (...) {
...load some input...
} else {
...load that same input...
}
which CSE can't handle, because there's no dominance relationship
between the two loads. Because the start block dominates all others,
we can CSE all inputs and emit PLNs exactly once, as we did before.
Ideally, global value numbering would eliminate these redundant loads,
while not forcing them all the way to the start block. When that lands,
we should consider dropping this hacky pass.
Again, this pass currently does nothing, as i965 doesn't generate these
intrinsics yet. But it will shortly, and I figured I'd separate this
code as it's relatively self-contained.
v2: Dramatically simplify pass - instead of creating new instructions,
just remove/re-insert their list nodes (suggested by Jason Ekstrand).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisforbes@google.com> [v1]
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-17 18:44:58 -07:00
|
|
|
|
continue;
|
2016-07-26 13:19:46 -07:00
|
|
|
|
nir_intrinsic_instr *bary_intrinsic =
|
|
|
|
|
|
nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
|
|
|
|
|
|
nir_intrinsic_op op = bary_intrinsic->intrinsic;
|
i965: Move load_interpolated_input/barycentric_* intrinsics to the top.
Currently, i965 interpolates all FS inputs at the top of the program.
This has advantages and disadvantages, but I'd like to keep that policy
while reworking this code. We can consider changing it independently.
The next patch will make the compiler generate PLN instructions "on the
fly", when it encounters an input load intrinsic, rather than doing it
for all inputs at the start of the program.
To emulate this behavior, we introduce an ugly pass to move all NIR
load_interpolated_input and payload-based (not interpolator message)
load_barycentric_* intrinsics to the shader's start block.
This helps avoid regressions in shader-db for cases such as:
if (...) {
...load some input...
} else {
...load that same input...
}
which CSE can't handle, because there's no dominance relationship
between the two loads. Because the start block dominates all others,
we can CSE all inputs and emit PLNs exactly once, as we did before.
Ideally, global value numbering would eliminate these redundant loads,
while not forcing them all the way to the start block. When that lands,
we should consider dropping this hacky pass.
Again, this pass currently does nothing, as i965 doesn't generate these
intrinsics yet. But it will shortly, and I figured I'd separate this
code as it's relatively self-contained.
v2: Dramatically simplify pass - instead of creating new instructions,
just remove/re-insert their list nodes (suggested by Jason Ekstrand).
Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Chris Forbes <chrisforbes@google.com> [v1]
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
2016-07-17 18:44:58 -07:00
|
|
|
|
|
2016-07-26 13:19:46 -07:00
|
|
|
|
/* Leave interpolateAtSample/Offset() where they are. */
|
|
|
|
|
|
if (op == nir_intrinsic_load_barycentric_at_sample ||
|
|
|
|
|
|
op == nir_intrinsic_load_barycentric_at_offset)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
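            /* Move the whole chain that feeds this input load: the payload
             * barycentric intrinsic, the SSA def feeding the load's offset
             * source, and the load_interpolated_input itself.
             */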
            nir_instr *move[3] = {
               &bary_intrinsic->instr,
               intrin->src[1].ssa->parent_instr,
               instr
            };

            for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
               if (move[i]->block != top) {
                  move[i]->block = top;
                  exec_node_remove(&move[i]->node);
                  if (cursor_node) {
                     exec_node_insert_after(cursor_node, &move[i]->node);
                  } else {
                     exec_list_push_head(&top->instr_list, &move[i]->node);
                  }
                  cursor_node = &move[i]->node;
                  progress = true;
               }
            }
         }
      }
      nir_metadata_preserve(f->impl, (nir_metadata)
                            ((unsigned) nir_metadata_block_index |
                             (unsigned) nir_metadata_dominance));
   }

   return progress;
}

/**
 * Demote per-sample barycentric intrinsics to centroid.
 *
 * Useful when rendering to a non-multisampled buffer.
 */
static bool
demote_sample_qualifiers(nir_shader *nir)
{
   bool progress = true;

   nir_foreach_function(f, nir) {
      if (!f->impl)
         continue;

      nir_builder b;
      nir_builder_init(&b, f->impl);

      nir_foreach_block(block, f->impl) {
         nir_foreach_instr_safe(instr, block) {
            if (instr->type != nir_instr_type_intrinsic)
               continue;

            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
            if (intrin->intrinsic != nir_intrinsic_load_barycentric_sample &&
                intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample)
               continue;

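            /* Build a centroid barycentric with the same interpolation mode
             * right before this instruction, rewrite all uses to it, and
             * remove the per-sample intrinsic.
             */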
            b.cursor = nir_before_instr(instr);
            nir_ssa_def *centroid =
               nir_load_barycentric(&b, nir_intrinsic_load_barycentric_centroid,
                                    nir_intrinsic_interp_mode(intrin));
            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
                                     nir_src_for_ssa(centroid));
            nir_instr_remove(instr);
            progress = true;
         }
      }

      nir_metadata_preserve(f->impl, (nir_metadata)
                            ((unsigned) nir_metadata_block_index |
                             (unsigned) nir_metadata_dominance));
   }

   return progress;
}

/**
 * Pre-gen6, the register file of the EUs was shared between threads,
 * and each thread used some subset allocated on a 16-register block
 * granularity.  The unit states wanted these block counts.
 */
static inline int
brw_register_blocks(int reg_count)
{
   return ALIGN(reg_count, 16) / 16 - 1;
}

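/**
 * Compile a fragment shader.
 *
 * A SIMD8 variant is always compiled first; SIMD16 and SIMD32 variants are
 * added on top of it when the checks below allow it.
 */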
const unsigned *
brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_wm_prog_key *key,
               struct brw_wm_prog_data *prog_data,
               nir_shader *shader,
               int shader_time_index8, int shader_time_index16,
               int shader_time_index32, bool allow_spilling,
               bool use_rep_send, struct brw_vue_map *vue_map,
               struct brw_compile_stats *stats,
               char **error_str)
{
   const struct gen_device_info *devinfo = compiler->devinfo;

   unsigned max_subgroup_size = unlikely(INTEL_DEBUG & DEBUG_DO32) ? 32 : 16;

   brw_nir_apply_key(shader, compiler, &key->base, max_subgroup_size, true);
   brw_nir_lower_fs_inputs(shader, devinfo, key);
   brw_nir_lower_fs_outputs(shader);

   if (devinfo->gen < 6)
      brw_setup_vue_interpolation(vue_map, shader, prog_data);

   if (!key->multisample_fbo)
      NIR_PASS_V(shader, demote_sample_qualifiers);
   NIR_PASS_V(shader, move_interpolation_to_top);
   brw_postprocess_nir(shader, compiler, true);

   /* key->alpha_test_func means simulating alpha testing via discards,
    * so the shader definitely kills pixels.
    */
   prog_data->uses_kill = shader->info.fs.uses_discard ||
      key->alpha_test_func;
   prog_data->uses_omask = key->multisample_fbo &&
      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
   prog_data->computed_depth_mode = computed_depth_mode(shader);
   prog_data->computed_stencil =
      shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);

   prog_data->persample_dispatch =
      key->multisample_fbo &&
      (key->persample_interp ||
       (shader->info.system_values_read & (SYSTEM_BIT_SAMPLE_ID |
                                           SYSTEM_BIT_SAMPLE_POS)) ||
       shader->info.fs.uses_sample_qualifier ||
       shader->info.outputs_read);

   prog_data->has_render_target_reads = shader->info.outputs_read != 0ull;

   prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
   prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
   prog_data->inner_coverage = shader->info.fs.inner_coverage;

   prog_data->barycentric_interp_modes =
      brw_compute_barycentric_interp_modes(compiler->devinfo, shader);

   calculate_urb_setup(devinfo, key, prog_data, shader);
   brw_compute_flat_inputs(prog_data, shader);

   cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL, *simd32_cfg = NULL;

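   /* The SIMD8 compile is attempted first; the SIMD16/SIMD32 visitors below
    * reuse its uniform layout via import_uniforms().
    */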
   fs_visitor v8(compiler, log_data, mem_ctx, &key->base,
                 &prog_data->base, shader, 8,
                 shader_time_index8);
   if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) {
      if (error_str)
         *error_str = ralloc_strdup(mem_ctx, v8.fail_msg);

      return NULL;
   } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
      simd8_cfg = v8.cfg;
      prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
      prog_data->reg_blocks_8 = brw_register_blocks(v8.grf_used);
   }

   if (v8.max_dispatch_width >= 16 &&
       likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
      /* Try a SIMD16 compile */
      fs_visitor v16(compiler, log_data, mem_ctx, &key->base,
                     &prog_data->base, shader, 16,
                     shader_time_index16);
      v16.import_uniforms(&v8);
      if (!v16.run_fs(allow_spilling, use_rep_send)) {
         compiler->shader_perf_log(log_data,
                                   "SIMD16 shader failed to compile: %s",
                                   v16.fail_msg);
      } else {
         simd16_cfg = v16.cfg;
         prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
         prog_data->reg_blocks_16 = brw_register_blocks(v16.grf_used);
      }
   }

   /* Currently, the compiler only supports SIMD32 on SNB+ */
   if (v8.max_dispatch_width >= 32 && !use_rep_send &&
       compiler->devinfo->gen >= 6 &&
       unlikely(INTEL_DEBUG & DEBUG_DO32)) {
      /* Try a SIMD32 compile */
      fs_visitor v32(compiler, log_data, mem_ctx, &key->base,
                     &prog_data->base, shader, 32,
                     shader_time_index32);
      v32.import_uniforms(&v8);
      if (!v32.run_fs(allow_spilling, false)) {
         compiler->shader_perf_log(log_data,
                                   "SIMD32 shader failed to compile: %s",
                                   v32.fail_msg);
      } else {
         simd32_cfg = v32.cfg;
         prog_data->dispatch_grf_start_reg_32 = v32.payload.num_regs;
         prog_data->reg_blocks_32 = brw_register_blocks(v32.grf_used);
      }
   }

   /* When the caller requests a repclear shader, they want SIMD16-only */
   if (use_rep_send)
      simd8_cfg = NULL;

   /* Prior to Iron Lake, the PS had a single shader offset with a jump table
    * at the top to select the shader.  We've never implemented that.
    * Instead, we just give them exactly one shader and we pick the widest one
    * available.
    */
   if (compiler->devinfo->gen < 5) {
      if (simd32_cfg || simd16_cfg)
         simd8_cfg = NULL;
      if (simd32_cfg)
         simd16_cfg = NULL;
   }

   /* If computed depth is enabled SNB only allows SIMD8. */
   if (compiler->devinfo->gen == 6 &&
       prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF)
      assert(simd16_cfg == NULL && simd32_cfg == NULL);

   if (compiler->devinfo->gen <= 5 && !simd8_cfg) {
      /* Iron Lake and earlier only have one Dispatch GRF start field.  Make
       * the data available in the base prog data struct for convenience.
       */
      if (simd16_cfg) {
         prog_data->base.dispatch_grf_start_reg =
            prog_data->dispatch_grf_start_reg_16;
      } else if (simd32_cfg) {
         prog_data->base.dispatch_grf_start_reg =
            prog_data->dispatch_grf_start_reg_32;
      }
   }

   if (prog_data->persample_dispatch) {
      /* Starting with SandyBridge (where we first get MSAA), the different
       * pixel dispatch combinations are grouped into classifications A
       * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1).  On all hardware
       * generations, the only configurations supporting persample dispatch
       * are those in which only one dispatch width is enabled.
       */
      if (simd32_cfg || simd16_cfg)
         simd8_cfg = NULL;
      if (simd32_cfg)
         simd16_cfg = NULL;
   }

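   /* A single generator emits native code for every dispatch width that
    * compiled; each generate_code() call below consumes one entry of the
    * caller-provided stats array, when present.
    */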
   fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
                  v8.shader_stats, v8.runtime_check_aads_emit,
                  MESA_SHADER_FRAGMENT);

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
                                     shader->info.label ?
                                        shader->info.label : "unnamed",
                                     shader->info.name));
   }

   if (simd8_cfg) {
      prog_data->dispatch_8 = true;
      g.generate_code(simd8_cfg, 8, stats);
      stats = stats ? stats + 1 : NULL;
   }

   if (simd16_cfg) {
      prog_data->dispatch_16 = true;
      prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16, stats);
      stats = stats ? stats + 1 : NULL;
   }

   if (simd32_cfg) {
      prog_data->dispatch_32 = true;
      prog_data->prog_offset_32 = g.generate_code(simd32_cfg, 32, stats);
      stats = stats ? stats + 1 : NULL;
   }

   return g.get_assembly();
}

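/**
 * Assemble gl_WorkGroupID from the fixed r0 payload locations used below:
 * r0.1 (X), r0.6 (Y) and r0.7 (Z).
 */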
fs_reg *
fs_visitor::emit_cs_work_group_id_setup()
{
   assert(stage == MESA_SHADER_COMPUTE);

   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));

   struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
   struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
   struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));

   bld.MOV(*reg, r0_1);
   bld.MOV(offset(*reg, bld, 1), r0_6);
   bld.MOV(offset(*reg, bld, 2), r0_7);

   return reg;
}

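/* Push constant blocks are allocated in 8-dword (32-byte) registers; round
 * the dword count up to whole registers.
 */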
static void
fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
{
   block->dwords = dwords;
   block->regs = DIV_ROUND_UP(dwords, 8);
   block->size = block->regs * 32;
}

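/* Split the CS push constants into a cross-thread block, shared by every
 * thread in the group, and a per-thread block.  When a subgroup ID parameter
 * is present it must land in the per-thread portion; without cross-thread
 * push support everything stays per-thread.
 */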
static void
cs_fill_push_const_info(const struct gen_device_info *devinfo,
                        struct brw_cs_prog_data *cs_prog_data)
{
   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
   int subgroup_id_index = get_subgroup_id_param_index(prog_data);
   bool cross_thread_supported = devinfo->gen > 7 || devinfo->is_haswell;

   /* The thread ID should be stored in the last param dword */
   assert(subgroup_id_index == -1 ||
          subgroup_id_index == (int)prog_data->nr_params - 1);

   unsigned cross_thread_dwords, per_thread_dwords;
   if (!cross_thread_supported) {
      cross_thread_dwords = 0u;
      per_thread_dwords = prog_data->nr_params;
   } else if (subgroup_id_index >= 0) {
      /* Fill all but the last register with cross-thread payload */
      cross_thread_dwords = 8 * (subgroup_id_index / 8);
      per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
      assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
   } else {
      /* Fill all data using cross-thread payload */
      cross_thread_dwords = prog_data->nr_params;
      per_thread_dwords = 0u;
   }

   fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
   fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);

   unsigned total_dwords =
      (cs_prog_data->push.per_thread.size * cs_prog_data->threads +
       cs_prog_data->push.cross_thread.size) / 4;
   fill_push_const_block_info(&cs_prog_data->push.total, total_dwords);

   assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
          cs_prog_data->push.per_thread.size == 0);
   assert(cs_prog_data->push.cross_thread.dwords +
          cs_prog_data->push.per_thread.dwords ==
          prog_data->nr_params);
}

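/* Record the SIMD width and derive the thread count: the workgroup size
 * divided by the SIMD width, rounded up.
 */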
static void
cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
{
   cs_prog_data->simd_size = size;
   unsigned group_size = cs_prog_data->local_size[0] *
      cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
   cs_prog_data->threads = (group_size + size - 1) / size;
}

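/* Clone the NIR and specialize it for one dispatch width, so that the
 * width-dependent lowering done by brw_nir_apply_key() and
 * brw_nir_lower_cs_intrinsics() sees the width it will be compiled at.
 */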
static nir_shader *
compile_cs_to_nir(const struct brw_compiler *compiler,
                  void *mem_ctx,
                  const struct brw_cs_prog_key *key,
                  const nir_shader *src_shader,
                  unsigned dispatch_width)
{
   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
   brw_nir_apply_key(shader, compiler, &key->base, dispatch_width, true);

   NIR_PASS_V(shader, brw_nir_lower_cs_intrinsics, dispatch_width);

   /* Clean up after the local index and ID calculations. */
   NIR_PASS_V(shader, nir_opt_constant_folding);
   NIR_PASS_V(shader, nir_opt_dce);

   brw_postprocess_nir(shader, compiler, true);

   return shader;
}

const unsigned *
brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
               const struct brw_cs_prog_key *key,
               struct brw_cs_prog_data *prog_data,
               const nir_shader *src_shader,
               int shader_time_index,
               struct brw_compile_stats *stats,
               char **error_str)
{
   prog_data->base.total_shared = src_shader->info.cs.shared_size;
   prog_data->local_size[0] = src_shader->info.cs.local_size[0];
   prog_data->local_size[1] = src_shader->info.cs.local_size[1];
   prog_data->local_size[2] = src_shader->info.cs.local_size[2];
   prog_data->slm_size = src_shader->num_shared;
   unsigned local_workgroup_size =
      src_shader->info.cs.local_size[0] * src_shader->info.cs.local_size[1] *
      src_shader->info.cs.local_size[2];

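   /* Pick the smallest SIMD width that still lets max_cs_threads cover the
    * workgroup, rounded up to a power of two and clamped to at least 8.
    */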
   unsigned min_dispatch_width =
      DIV_ROUND_UP(local_workgroup_size, compiler->devinfo->max_cs_threads);
   min_dispatch_width = MAX2(8, min_dispatch_width);
   min_dispatch_width = util_next_power_of_two(min_dispatch_width);
   assert(min_dispatch_width <= 32);
   unsigned max_dispatch_width = 32;

   fs_visitor *v8 = NULL, *v16 = NULL, *v32 = NULL;
   fs_visitor *v = NULL;
   const char *fail_msg = NULL;

   if ((int)key->base.subgroup_size_type >= (int)BRW_SUBGROUP_SIZE_REQUIRE_8) {
      /* These enum values are expressly chosen to be equal to the subgroup
       * size that they require.
       */
      const unsigned required_dispatch_width =
         (unsigned)key->base.subgroup_size_type;
      assert(required_dispatch_width == 8 ||
             required_dispatch_width == 16 ||
             required_dispatch_width == 32);
      if (required_dispatch_width < min_dispatch_width ||
          required_dispatch_width > max_dispatch_width) {
         fail_msg = "Cannot satisfy explicit subgroup size";
      } else {
         min_dispatch_width = max_dispatch_width = required_dispatch_width;
      }
   }

   /* Now the main event: Visit the shader IR and generate our CS IR for it.
    */
   if (!fail_msg && min_dispatch_width <= 8 && max_dispatch_width >= 8) {
      nir_shader *nir8 = compile_cs_to_nir(compiler, mem_ctx, key,
                                           src_shader, 8);
      v8 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
                          &prog_data->base,
                          nir8, 8, shader_time_index);
      if (!v8->run_cs(min_dispatch_width)) {
         fail_msg = v8->fail_msg;
      } else {
         /* We should always be able to do SIMD32 for compute shaders */
         assert(v8->max_dispatch_width >= 32);

         v = v8;
         cs_set_simd_size(prog_data, 8);
         cs_fill_push_const_info(compiler->devinfo, prog_data);
      }
   }

   if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
       !fail_msg && min_dispatch_width <= 16 && max_dispatch_width >= 16) {
      /* Try a SIMD16 compile */
      nir_shader *nir16 = compile_cs_to_nir(compiler, mem_ctx, key,
                                            src_shader, 16);
      v16 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
                           &prog_data->base,
                           nir16, 16, shader_time_index);
      if (v8)
         v16->import_uniforms(v8);

      if (!v16->run_cs(min_dispatch_width)) {
         compiler->shader_perf_log(log_data,
                                   "SIMD16 shader failed to compile: %s",
                                   v16->fail_msg);
         if (!v) {
            fail_msg =
               "Couldn't generate SIMD16 program and not "
               "enough threads for SIMD8";
         }
      } else {
         /* We should always be able to do SIMD32 for compute shaders */
         assert(v16->max_dispatch_width >= 32);

         v = v16;
         cs_set_simd_size(prog_data, 16);
         cs_fill_push_const_info(compiler->devinfo, prog_data);
      }
   }

   /* We should always be able to do SIMD32 for compute shaders */
   assert(!v16 || v16->max_dispatch_width >= 32);

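   /* SIMD32 is only attempted when the workgroup is too big for SIMD16 or
    * when it is forced with INTEL_DEBUG=do32.
    */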
   if (!fail_msg && (min_dispatch_width > 16 || (INTEL_DEBUG & DEBUG_DO32)) &&
       max_dispatch_width >= 32) {
      /* Try a SIMD32 compile */
      nir_shader *nir32 = compile_cs_to_nir(compiler, mem_ctx, key,
                                            src_shader, 32);
      v32 = new fs_visitor(compiler, log_data, mem_ctx, &key->base,
                           &prog_data->base,
                           nir32, 32, shader_time_index);
      if (v8)
         v32->import_uniforms(v8);
      else if (v16)
         v32->import_uniforms(v16);

      if (!v32->run_cs(min_dispatch_width)) {
         compiler->shader_perf_log(log_data,
                                   "SIMD32 shader failed to compile: %s",
                                   v32->fail_msg);
         if (!v) {
            fail_msg =
               "Couldn't generate SIMD32 program and not "
               "enough threads for SIMD16";
         }
      } else {
         v = v32;
         cs_set_simd_size(prog_data, 32);
         cs_fill_push_const_info(compiler->devinfo, prog_data);
      }
   }

   const unsigned *ret = NULL;
   if (unlikely(v == NULL)) {
      assert(fail_msg);
      if (error_str)
         *error_str = ralloc_strdup(mem_ctx, fail_msg);
   } else {
      fs_generator g(compiler, log_data, mem_ctx, &prog_data->base,
                     v->shader_stats, v->runtime_check_aads_emit,
                     MESA_SHADER_COMPUTE);
      if (INTEL_DEBUG & DEBUG_CS) {
         char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
                                      src_shader->info.label ?
                                         src_shader->info.label : "unnamed",
                                      src_shader->info.name);
         g.enable_debug(name);
      }

      g.generate_code(v->cfg, prog_data->simd_size, stats);

      ret = g.get_assembly();
   }

   delete v8;
   delete v16;
   delete v32;

   return ret;
}

/**
 * Test the dispatch mask packing assumptions of
 * brw_stage_has_packed_dispatch().  Call this from e.g. the top of
 * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
 * executed with an unexpected dispatch mask.
 */
static UNUSED void
brw_fs_test_dispatch_packing(const fs_builder &bld)
{
   const gl_shader_stage stage = bld.shader->stage;

   if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
                                     bld.shader->stage_prog_data)) {
      const fs_builder ubld = bld.exec_all().group(1, 0);
      const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
      const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
                           brw_dmask_reg());

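      /* Compute tmp = mask & (mask + 1), which is zero exactly when the mask
       * is a contiguous run of low bits (i.e. of the form 2^n-1).
       */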
      ubld.ADD(tmp, mask, brw_imm_ud(1));
      ubld.AND(tmp, mask, tmp);

      /* This will loop forever if the dispatch mask doesn't have the expected
       * form '2^n-1', in which case tmp will be non-zero.
       */
      bld.emit(BRW_OPCODE_DO);
      bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
      set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
   }
}