/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

#include <sys/types.h>

#include "util/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "util/register_allocate.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "brw_fs.h"
#include "brw_cs.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "main/uniforms.h"
#include "brw_fs_live_variables.h"
#include "glsl/glsl_types.h"
#include "program/sampler.h"

using namespace brw;

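/* Common initializer shared by all fs_inst constructors: zero the
 * instruction, copy the sources into a freshly allocated array, and derive
 * regs_written from the destination region and execution size.
 */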
void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
              const fs_reg *src, unsigned sources)
{
   memset(this, 0, sizeof(*this));

   this->src = new fs_reg[MAX2(sources, 3)];
   for (unsigned i = 0; i < sources; i++)
      this->src[i] = src[i];

   this->opcode = opcode;
   this->dst = dst;
   this->sources = sources;
   this->exec_size = exec_size;

   assert(dst.file != IMM && dst.file != UNIFORM);

   assert(this->exec_size != 0);

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case GRF:
   case HW_REG:
   case MRF:
   case ATTR:
      this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
                                        REG_SIZE);
      break;
   case BAD_FILE:
      this->regs_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   default:
      unreachable("Invalid register file");
   }

   this->writes_accumulator = false;
}

fs_inst::fs_inst()
{
   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   init(opcode, exec_size, reg_undef, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
{
   init(opcode, exec_size, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, exec_size, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, exec_size, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, exec_size, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, exec_width, dst, src, sources);
}

fs_inst::fs_inst(const fs_inst &that)
{
   memcpy(this, &that, sizeof(that));

   this->src = new fs_reg[MAX2(that.sources, 3)];

   for (unsigned i = 0; i < that.sources; i++)
      this->src[i] = that.src[i];
}

fs_inst::~fs_inst()
{
   delete[] this->src;
}

void
fs_inst::resize_sources(uint8_t num_sources)
{
   if (this->sources != num_sources) {
      fs_reg *src = new fs_reg[MAX2(num_sources, 3)];

      for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
         src[i] = this->src[i];

      delete[] this->src;
      this->src = src;
      this->sources = num_sources;
   }
}

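/* Emit a pull-constant load whose offset varies per channel.  For example,
 * a SIMD16 load on gen7 allocates 4 * (16 / 8) = 8 GRFs for the vec4
 * result before a single component is MOVed into dst.
 */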
void
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
                                       const fs_reg &dst,
                                       const fs_reg &surf_index,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset)
{
   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = vgrf(glsl_type::int_type);
   bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));

   int scale = 1;
   if (devinfo->gen == 4 && bld.dispatch_width() == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (devinfo->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;

   int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type);
   fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = regs_written;

   if (devinfo->gen < 7) {
      inst->base_mrf = 13;
      inst->header_size = 1;
      if (devinfo->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + bld.dispatch_width() / 8;
   }

   bld.MOV(dst, offset(vec4_result, bld, (const_offset & 3) * scale));
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
void
fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
{
   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   const fs_builder ubld = bld.annotate("send dependency resolve")
                              .half(0);

   ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
}

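/* Field-by-field identity comparison, used by optimization passes to
 * decide whether two instructions are interchangeable.
 */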
bool
fs_inst::equals(fs_inst *inst) const
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           target == inst->target &&
           eot == inst->eot &&
           header_size == inst->header_size &&
           shadow_compare == inst->shadow_compare &&
           exec_size == inst->exec_size &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg) const
{
   return reg.in_range(dst, regs_written);
}

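/* Whether this instruction's message payload lives in the GRF rather than
 * in MRF space: always true for the listed opcodes, and true for texturing
 * and FB writes whose first source is a GRF.
 */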
bool
fs_inst::is_send_from_grf() const
{
   switch (opcode) {
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
      return true;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return src[1].file == GRF;
   case FS_OPCODE_FB_WRITE:
      return src[0].file == GRF;
   default:
      if (is_tex())
         return src[0].file == GRF;

      return false;
   }
}

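/* Whether this LOAD_PAYLOAD is a plain copy: every source must read one
 * contiguous register range, in order, covering exactly the registers the
 * instruction writes.
 */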
bool
fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
{
   if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
      return false;

   fs_reg reg = this->src[0];
   if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
      return false;

   if (grf_alloc.sizes[reg.reg] != this->regs_written)
      return false;

   for (int i = 0; i < this->sources; i++) {
      reg.type = this->src[i].type;
      if (!this->src[i].equals(reg))
         return false;

      if (i < this->header_size) {
         reg.reg_offset += 1;
      } else {
         reg.reg_offset += this->exec_size / 8;
      }
   }

   return true;
}

bool
fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
{
   if (devinfo->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

bool
fs_inst::has_side_effects() const
{
   return this->eot || backend_instruction::has_side_effects();
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   stride = 1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->stride = 0;
   this->fixed_hw_reg.dw1.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->stride = 0;
   this->fixed_hw_reg.dw1.d = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->stride = 0;
   this->fixed_hw_reg.dw1.ud = u;
}

/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf[4])
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
}

/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
                               (vf1 <<  8) |
                               (vf2 << 16) |
                               (vf3 << 24);
}

/** Fixed brw_reg. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           subreg_offset == r.subreg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           ((file != HW_REG && file != IMM) ||
            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                   sizeof(fixed_hw_reg)) == 0) &&
           stride == r.stride);
}

fs_reg &
fs_reg::set_smear(unsigned subreg)
{
   assert(file != HW_REG && file != IMM);
   subreg_offset = subreg * type_sz(type);
   stride = 0;
   return *this;
}

bool
fs_reg::is_contiguous() const
{
   return stride == 1;
}

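/* Size in bytes of one logical component across `width` channels.  For
 * example, a stride-1 float region at width 16 spans MAX2(16 * 1, 1) * 4 =
 * 64 bytes, i.e. two GRFs.
 */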
unsigned
fs_reg::component_size(unsigned width) const
{
   const unsigned stride = (file != HW_REG ? this->stride :
                            fixed_hw_reg.hstride == 0 ? 0 :
                            1 << (fixed_hw_reg.hstride - 1));
   return MAX2(width * stride, 1) * type_sz(type);
}

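/* Number of scalar slots a GLSL type occupies in the scalar backend: a
 * float or bool counts 1, a vec4 counts 4, arrays and structs sum their
 * members, and samplers occupy no space at all.
 */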
extern "C" int
|
|
|
|
|
|
type_size_scalar(const struct glsl_type *type)
|
2010-08-15 18:58:58 -07:00
|
|
|
|
{
|
|
|
|
|
|
unsigned int size, i;
|
|
|
|
|
|
|
|
|
|
|
|
switch (type->base_type) {
|
|
|
|
|
|
case GLSL_TYPE_UINT:
|
|
|
|
|
|
case GLSL_TYPE_INT:
|
|
|
|
|
|
case GLSL_TYPE_FLOAT:
|
|
|
|
|
|
case GLSL_TYPE_BOOL:
|
2010-08-27 10:44:04 -07:00
|
|
|
|
return type->components();
|
2010-08-15 18:58:58 -07:00
|
|
|
|
case GLSL_TYPE_ARRAY:
|
2015-08-12 14:19:17 -07:00
|
|
|
|
return type_size_scalar(type->fields.array) * type->length;
|
2010-08-15 18:58:58 -07:00
|
|
|
|
case GLSL_TYPE_STRUCT:
|
|
|
|
|
|
size = 0;
|
|
|
|
|
|
for (i = 0; i < type->length; i++) {
|
2015-08-12 14:19:17 -07:00
|
|
|
|
size += type_size_scalar(type->fields.structure[i].type);
|
2010-08-15 18:58:58 -07:00
|
|
|
|
}
|
|
|
|
|
|
return size;
|
|
|
|
|
|
case GLSL_TYPE_SAMPLER:
|
|
|
|
|
|
/* Samplers take up no register space, since they're baked in at
|
|
|
|
|
|
* link time.
|
|
|
|
|
|
*/
|
|
|
|
|
|
return 0;
|
2013-10-20 12:35:47 -07:00
|
|
|
|
case GLSL_TYPE_ATOMIC_UINT:
|
|
|
|
|
|
return 0;
|
2015-07-21 14:22:11 +10:00
|
|
|
|
case GLSL_TYPE_SUBROUTINE:
|
|
|
|
|
|
return 1;
|
2013-11-25 13:50:47 -08:00
|
|
|
|
case GLSL_TYPE_IMAGE:
|
2015-05-05 21:05:45 +03:00
|
|
|
|
return BRW_IMAGE_PARAM_SIZE;
|
2012-12-11 12:56:03 -08:00
|
|
|
|
case GLSL_TYPE_VOID:
|
|
|
|
|
|
case GLSL_TYPE_ERROR:
|
2012-12-11 12:11:16 -08:00
|
|
|
|
case GLSL_TYPE_INTERFACE:
|
2014-08-14 18:49:20 +10:00
|
|
|
|
case GLSL_TYPE_DOUBLE:
|
2014-06-29 14:54:01 -07:00
|
|
|
|
unreachable("not reached");
|
2010-08-15 18:58:58 -07:00
|
|
|
|
}
|
2012-12-11 12:56:03 -08:00
|
|
|
|
|
|
|
|
|
|
return 0;
|
2010-08-15 18:58:58 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2015-02-26 23:51:27 -08:00
|
|
|
|
/**
 * Create a MOV to read the timestamp register.
 *
 * The caller is responsible for emitting the MOV.  The return value is
 * the destination of the MOV, with extra parameters set.
 */
fs_reg
fs_visitor::get_timestamp(const fs_builder &bld)
{
   assert(devinfo->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);

   /* We want to read the 3 fields we care about even if it's not enabled in
    * the dispatch.
    */
   bld.group(4, 0).exec_all().MOV(dst, ts);

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.set_smear(0);

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   shader_start_time = get_timestamp(bld.annotate("shader time start"));
}

void
fs_visitor::emit_shader_time_end()
{
   /* Insert our code just before the final SEND with EOT. */
   exec_node *end = this->instructions.get_tail();
   assert(end && ((fs_inst *) end)->eot);
   const fs_builder ibld = bld.annotate("shader time end")
                              .exec_all().at(NULL, end);

   fs_reg shader_end_time = get_timestamp(ibld);

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.set_smear(2);
   set_condmod(BRW_CONDITIONAL_Z,
               ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
   ibld.IF(BRW_PREDICATE_NORMAL);

   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
   diff.set_smear(0);

   const fs_builder cbld = ibld.group(1, 0);
   cbld.group(1, 0).ADD(diff, start, shader_end_time);

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   cbld.ADD(diff, diff, fs_reg(-2u));
   SHADER_TIME_ADD(cbld, 0, diff);
   SHADER_TIME_ADD(cbld, 1, fs_reg(1u));
   ibld.emit(BRW_OPCODE_ELSE);
   SHADER_TIME_ADD(cbld, 2, fs_reg(1u));
   ibld.emit(BRW_OPCODE_ENDIF);
}

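/* Accumulate `value` into the shader-time buffer slot at index
 * shader_time_index * 3 + shader_time_subindex, with slots spaced
 * SHADER_TIME_STRIDE bytes apart.
 */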
void
fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
                            int shader_time_subindex,
                            fs_reg value)
{
   int index = shader_time_index * 3 + shader_time_subindex;
   fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = vgrf(glsl_type::uvec2_type);
   else
      payload = vgrf(glsl_type::uint_type);

   bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
}

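/* Record a compile failure; every fail() call funnels through here, so the
 * first failure message wins and later ones are ignored.
 */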
void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (debug_enabled) {
      fprintf(stderr, "%s", msg);
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}

/**
 * Mark this program as impossible to compile in SIMD16 mode.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16 mode, so the compiler can skip
 * the SIMD16 compile altogether.
 *
 * During a SIMD16 compile (if one happens anyway), this just calls fail().
 */
void
fs_visitor::no16(const char *msg)
{
   if (dispatch_width == 16) {
      fail("%s", msg);
   } else {
      simd16_unsupported = true;

      compiler->shader_perf_log(log_data,
                                "SIMD16 shader failed to compile: %s", msg);
   }
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write() const
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           (this->exec_size * type_sz(this->dst.type)) < 32 ||
           !this->dst.is_contiguous());
}

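/* Number of vector components read from source `i`.  Most opcodes read one
 * component per source; logical opcodes encode per-source widths in
 * trailing immediate sources, which are decoded here.
 */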
unsigned
fs_inst::components_read(unsigned i) const
{
   switch (opcode) {
   case FS_OPCODE_LINTERP:
      if (i == 0)
         return 2;
      else
         return 1;

   case FS_OPCODE_PIXEL_X:
   case FS_OPCODE_PIXEL_Y:
      assert(i == 0);
      return 2;

   case FS_OPCODE_FB_WRITE_LOGICAL:
      assert(src[6].file == IMM);
      /* First/second FB write color. */
      if (i < 2)
         return src[6].fixed_hw_reg.dw1.ud;
      else
         return 1;

   case SHADER_OPCODE_TEX_LOGICAL:
   case SHADER_OPCODE_TXD_LOGICAL:
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXL_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL:
   case SHADER_OPCODE_TXF_CMS_LOGICAL:
   case SHADER_OPCODE_TXF_UMS_LOGICAL:
   case SHADER_OPCODE_TXF_MCS_LOGICAL:
   case SHADER_OPCODE_LOD_LOGICAL:
   case SHADER_OPCODE_TG4_LOGICAL:
   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
      assert(src[8].file == IMM && src[9].file == IMM);
      /* Texture coordinates. */
      if (i == 0)
         return src[8].fixed_hw_reg.dw1.ud;
      /* Texture derivatives. */
      else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
         return src[9].fixed_hw_reg.dw1.ud;
      /* Texture offset. */
      else if (i == 7)
         return 2;
      else
         return 1;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
      assert(src[3].file == IMM);
      /* Surface coordinates. */
      if (i == 0)
         return src[3].fixed_hw_reg.dw1.ud;
      /* Surface operation source (ignored for reads). */
      else if (i == 1)
         return 0;
      else
         return 1;

   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      assert(src[3].file == IMM &&
             src[4].file == IMM);
      /* Surface coordinates. */
      if (i == 0)
         return src[3].fixed_hw_reg.dw1.ud;
      /* Surface operation source. */
      else if (i == 1)
         return src[4].fixed_hw_reg.dw1.ud;
      else
         return 1;

   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
      assert(src[3].file == IMM &&
             src[4].file == IMM);
      const unsigned op = src[4].fixed_hw_reg.dw1.ud;
      /* Surface coordinates. */
      if (i == 0)
         return src[3].fixed_hw_reg.dw1.ud;
      /* Surface operation source. */
      else if (i == 1 && op == BRW_AOP_CMPWR)
         return 2;
      else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
                          op == BRW_AOP_PREDEC))
         return 0;
      else
         return 1;
   }

   default:
      return 1;
   }
}

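/* Number of hardware GRFs read by source `arg`, accounting for send-like
 * opcodes whose payload source spans mlen registers.
 */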
int
fs_inst::regs_read(int arg) const
{
   switch (opcode) {
   case FS_OPCODE_FB_WRITE:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      if (arg == 0)
         return mlen;
      break;

   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
      /* The payload is actually stored in src1 */
      if (arg == 1)
         return mlen;
      break;

   case FS_OPCODE_LINTERP:
      if (arg == 1)
         return 1;
      break;

   case SHADER_OPCODE_LOAD_PAYLOAD:
      if (arg < this->header_size)
         return 1;
      break;

   case CS_OPCODE_CS_TERMINATE:
   case SHADER_OPCODE_BARRIER:
      return 1;

   default:
      if (is_tex() && arg == 0 && src[0].file == GRF)
         return mlen;
      break;
   }

   switch (src[arg].file) {
   case BAD_FILE:
      return 0;
   case UNIFORM:
   case IMM:
      return 1;
   case GRF:
   case ATTR:
   case HW_REG:
      return DIV_ROUND_UP(components_read(arg) *
                          src[arg].component_size(exec_size),
                          REG_SIZE);
   case MRF:
      unreachable("MRF registers are not allowed as sources");
   default:
      unreachable("Invalid register file");
   }
}

bool
fs_inst::reads_flag() const
{
   return predicate;
}

bool
fs_inst::writes_flag() const
{
   return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
                               opcode != BRW_OPCODE_IF &&
                               opcode != BRW_OPCODE_WHILE)) ||
          opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
}

/**
|
|
|
|
|
|
* Returns how many MRFs an FS opcode will write over.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Note that this is not the 0 or 1 implied writes in an actual gen
|
|
|
|
|
|
* instruction -- the FS opcodes often generate MOVs in addition.
|
|
|
|
|
|
*/
|
|
|
|
|
|
int
|
|
|
|
|
|
fs_visitor::implied_mrf_writes(fs_inst *inst)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (inst->mlen == 0)
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
i965/fs: Convert gen7 to using GRFs for texture messages.
Looking at Lightsmark's shaders, the way we used MRFs (or in gen7's
case, GRFs) was bad in a couple of ways. One was that it prevented
compute-to-MRF for the common case of a texcoord that gets used
exactly once, but where the texcoord setup all gets emitted before the
texture calls (such as when it's a bare fragment shader input, which
gets interpolated before processing main()). Another was that it
introduced a bunch of dependencies that constrained scheduling, and
forced waits for texture operations to be done before they are
required. For example, we can now move the compute-to-MRF
interpolation for the second texture send down after the first send.
The downside is that this generally prevents
remove_duplicate_mrf_writes() from doing anything, whereas previously
it avoided work for the case of sampling from the same texcoord twice.
However, I suspect that most of the win that originally justified that
code was in avoiding the WAR stall on the first send, which this patch
also avoids, rather than the small cost of the extra instruction. We
see instruction count regressions in shaders in unigine, yofrankie,
savage2, hon, and gstreamer.
Improves GLB2.7 performance by 0.633628% +/- 0.491809% (n=121/125, avg of
~66fps, outliers below 61 dropped).
Improves openarena performance by 1.01092% +/- 0.66897% (n=425).
No significant difference on Lightsmark (n=44).
v2: Squash in the fix for register unspilling for send-from-GRF, fixing a
segfault in lightsmark.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Acked-by: Matt Turner <mattst88@gmail.com>
2013-10-09 17:17:59 -07:00
|
|
|
|
if (inst->base_mrf == -1)
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
2010-11-19 15:57:05 +08:00
|
|
|
|
switch (inst->opcode) {
|
2011-08-05 12:38:58 -07:00
|
|
|
|
case SHADER_OPCODE_RCP:
|
|
|
|
|
|
case SHADER_OPCODE_RSQ:
|
|
|
|
|
|
case SHADER_OPCODE_SQRT:
|
|
|
|
|
|
case SHADER_OPCODE_EXP2:
|
|
|
|
|
|
case SHADER_OPCODE_LOG2:
|
|
|
|
|
|
case SHADER_OPCODE_SIN:
|
|
|
|
|
|
case SHADER_OPCODE_COS:
|
2012-11-20 13:50:52 -08:00
|
|
|
|
return 1 * dispatch_width / 8;
|
2011-08-05 12:38:58 -07:00
|
|
|
|
case SHADER_OPCODE_POW:
|
2011-09-28 17:37:54 -07:00
|
|
|
|
case SHADER_OPCODE_INT_QUOTIENT:
|
|
|
|
|
|
case SHADER_OPCODE_INT_REMAINDER:
|
2012-11-20 13:50:52 -08:00
|
|
|
|
return 2 * dispatch_width / 8;
|
2011-10-26 12:58:37 -07:00
|
|
|
|
case SHADER_OPCODE_TEX:
|
2010-11-19 15:57:05 +08:00
|
|
|
|
case FS_OPCODE_TXB:
|
2011-10-26 12:58:37 -07:00
|
|
|
|
case SHADER_OPCODE_TXD:
|
|
|
|
|
|
case SHADER_OPCODE_TXF:
|
2013-12-10 16:36:31 +02:00
|
|
|
|
case SHADER_OPCODE_TXF_CMS:
|
2013-11-30 10:32:16 +13:00
|
|
|
|
case SHADER_OPCODE_TXF_MCS:
|
2013-03-31 21:31:12 +13:00
|
|
|
|
case SHADER_OPCODE_TG4:
|
2013-10-08 21:42:10 +13:00
|
|
|
|
case SHADER_OPCODE_TG4_OFFSET:
|
2011-10-26 12:58:37 -07:00
|
|
|
|
case SHADER_OPCODE_TXL:
|
|
|
|
|
|
case SHADER_OPCODE_TXS:
|
2013-03-06 14:47:01 -08:00
|
|
|
|
case SHADER_OPCODE_LOD:
|
2015-08-11 20:37:32 -04:00
|
|
|
|
case SHADER_OPCODE_SAMPLEINFO:
|
2010-11-19 15:57:05 +08:00
|
|
|
|
return 1;
|
|
|
|
|
|
case FS_OPCODE_FB_WRITE:
|
|
|
|
|
|
return 2;
|
2015-04-13 16:55:49 +02:00
|
|
|
|
case FS_OPCODE_GET_BUFFER_SIZE:
|
2012-11-07 10:42:34 -08:00
|
|
|
|
case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
|
2013-10-16 11:45:06 -07:00
|
|
|
|
case SHADER_OPCODE_GEN4_SCRATCH_READ:
|
2010-11-19 15:57:05 +08:00
|
|
|
|
return 1;
|
2012-11-07 11:18:34 -08:00
|
|
|
|
case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
|
2013-03-18 10:16:42 -07:00
|
|
|
|
return inst->mlen;
|
2013-10-16 11:45:06 -07:00
|
|
|
|
case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
|
2015-05-19 17:35:29 -07:00
|
|
|
|
return inst->mlen;
|
2013-09-11 14:01:50 -07:00
|
|
|
|
case SHADER_OPCODE_UNTYPED_ATOMIC:
|
2013-09-11 14:03:13 -07:00
|
|
|
|
case SHADER_OPCODE_UNTYPED_SURFACE_READ:
|
2015-04-23 14:24:14 +03:00
|
|
|
|
case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
|
2015-04-23 14:28:25 +03:00
|
|
|
|
case SHADER_OPCODE_TYPED_ATOMIC:
|
|
|
|
|
|
case SHADER_OPCODE_TYPED_SURFACE_READ:
|
|
|
|
|
|
case SHADER_OPCODE_TYPED_SURFACE_WRITE:
|
2014-10-20 23:00:50 -07:00
|
|
|
|
case SHADER_OPCODE_URB_WRITE_SIMD8:
|
2013-11-18 21:13:13 +13:00
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_CENTROID:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
|
|
|
|
|
|
case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
|
2013-09-11 14:01:50 -07:00
|
|
|
|
return 0;
|
2010-11-19 15:57:05 +08:00
|
|
|
|
default:
|
2014-06-29 14:54:01 -07:00
|
|
|
|
unreachable("not reached");
|
2010-11-19 15:57:05 +08:00
|
|
|
|
}
|
|
|
|
|
|
}
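
/* A sketch of the arithmetic above: in SIMD16 dispatch a SHADER_OPCODE_POW
 * reports 2 * 16 / 8 = 4 implied MRF writes and the single-operand math
 * opcodes report 2, while the gen4 scratch and varying pull-constant cases
 * simply report the message length already recorded in inst->mlen.
 */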

fs_reg
fs_visitor::vgrf(const glsl_type *const type)
{
   int reg_width = dispatch_width / 8;
   return fs_reg(GRF, alloc.allocate(type_size_scalar(type) * reg_width),
                 brw_type_for_base_type(type));
}
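
/* E.g. (a sketch): vgrf(glsl_type::vec4_type) in SIMD16 dispatch allocates
 * 4 scalar slots * reg_width 2 = 8 VGRF units, typed as float.
 */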

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
   /* UNIFORM registers are read with stride 0: every channel sees the same
    * element, so only the array index advances.  All other files default to
    * stride 1.
    */
   this->stride = (file == UNIFORM ? 0 : 1);
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
   this->stride = (file == UNIFORM ? 0 : 1);
}

/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   this->push_constant_loc = v->push_constant_loc;
   this->pull_constant_loc = v->pull_constant_loc;
   this->uniforms = v->uniforms;
   this->param_size = v->param_size;
}
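
/* Keeping the SIMD16 visitor's push/pull locations identical to the SIMD8
 * ones means both compiles agree on where every uniform lives in the
 * payload.
 */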

fs_reg *
fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
                                         bool origin_upper_left)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
   fs_reg wpos = *reg;
   bool flip = !origin_upper_left ^ key->render_to_fbo;

   /* gl_FragCoord.x */
   if (pixel_center_integer) {
      bld.MOV(wpos, this->pixel_x);
   } else {
      bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
   }
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.y */
   if (!flip && pixel_center_integer) {
      bld.MOV(wpos, this->pixel_y);
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (pixel_center_integer ? 0.0f : 0.5f);

      if (flip) {
         pixel_y.negate = true;
         offset += key->drawable_height - 1.0f;
      }

      bld.ADD(wpos, pixel_y, fs_reg(offset));
   }
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.z */
   if (devinfo->gen >= 6) {
      bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
   } else {
      bld.emit(FS_OPCODE_LINTERP, wpos,
               this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
               interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos = offset(wpos, bld, 1);

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   bld.MOV(wpos, this->wpos_w);

   return reg;
}
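
/* Note on the y flip above (a sketch): with flip set, the negate source
 * modifier plus the added constant compute
 * (drawable_height - 1 + 0.0-or-0.5) - pixel_y, converting between the
 * top-left and bottom-left origin conventions in a single ADD.
 */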

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid, bool is_sample)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (devinfo->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else if (is_sample) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return bld.emit(FS_OPCODE_LINTERP, attr,
                   this->delta_xy[barycoord_mode], interp);
}

void
fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
                                       const glsl_type *type,
                                       glsl_interp_qualifier interpolation_mode,
                                       int location, bool mod_centroid,
                                       bool mod_sample)
{
   attr.type = brw_type_for_base_type(type->get_scalar_type());

   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   unsigned int array_elements;

   if (type->is_array()) {
      array_elements = type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", name);
      }
      type = type->fields.array;
   } else {
      array_elements = 1;
   }

   if (interpolation_mode == INTERP_QUALIFIER_NONE) {
      bool is_gl_Color =
         location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
      if (key->flat_shade && is_gl_Color) {
         interpolation_mode = INTERP_QUALIFIER_FLAT;
      } else {
         interpolation_mode = INTERP_QUALIFIER_SMOOTH;
      }
   }

   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (prog_data->urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr = offset(attr, bld, type->vector_elements);
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = attr.type;
               bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr = offset(attr, bld, 1);
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit. Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);

                  fs_inst *inst;
                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      false, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
                  if (devinfo->has_pln)
                     inst->no_dd_clear = true;

                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      mod_centroid && !key->persample_shading,
                                      mod_sample || key->persample_shading);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = false;
                  if (devinfo->has_pln)
                     inst->no_dd_check = true;
               } else {
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               mod_centroid && !key->persample_shading,
                               mod_sample || key->persample_shading);
               }
               if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  bld.MUL(attr, attr, this->pixel_w);
               }
               attr = offset(attr, bld, 1);
            }
         }
         location++;
      }
   }
}
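
/* Note on the unlit-centroid workaround above: the two predicated PLNs use
 * complementary predicates, so they write disjoint channels of the same
 * destination.  Roughly, the emitted pair looks like
 *    (-f0) pln(8) dst ... { NoDDClr }
 *    (+f0) pln(8) dst ... { NoDDChk }
 * Since at least one channel must be live for the thread to be running,
 * the +f0 instruction can go last with the dependency-control hints set
 * safely.
 */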

fs_reg *
fs_visitor::emit_frontfacing_interpolation()
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));

   if (devinfo->gen >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (~0/true or 0/false).
       *
       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
       * this task in only one instruction:
       *    - a negation source modifier will flip the bit; and
       *    - a W -> D type conversion will sign extend the bit into the high
       *      word of the destination.
       *
       * An ASR 15 fills the low word of the destination.
       */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
      g0.negate = true;

      bld.ASR(*reg, g0, fs_reg(15));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (1/true or 0/false).
       *
       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
       * the negation source modifier to flip it. Unfortunately the SHR
       * instruction only operates on UD (or D with an abs source modifier)
       * sources without negation.
       *
       * Instead, use ASR (which will give ~0/true or 0/false).
       */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
      g1_6.negate = true;

      bld.ASR(*reg, g1_6, fs_reg(31));
   }

   return reg;
}

void
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   assert(dst.type == BRW_REGISTER_TYPE_F);

   if (key->compute_pos_offset) {
      /* Convert int_sample_pos to floating point */
      bld.MOV(dst, int_sample_pos);
      /* Scale to the range [0, 1] */
      bld.MUL(dst, dst, fs_reg(1 / 16.0f));
   } else {
      /* From ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SamplePosition will always be
       *  (0.5, 0.5)."
       */
      bld.MOV(dst, fs_reg(0.5f));
   }
}
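
/* The payload sample positions are small integers in 1/16-pixel units,
 * which is why the MUL by 1/16.0f above maps them into [0, 1).
 */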

fs_reg *
fs_visitor::emit_samplepos_setup()
{
   assert(devinfo->gen >= 6);

   const fs_builder abld = bld.annotate("compute sample position");
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
   fs_reg pos = *reg;
   fs_reg int_sample_x = vgrf(glsl_type::int_type);
   fs_reg int_sample_y = vgrf(glsl_type::int_type);

   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
    * mode will be enabled.
    *
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
    * R31.1:0         Position Offset X/Y for Slot[3:0]
    * R31.3:2         Position Offset X/Y for Slot[7:4]
    * .....
    *
    * The X, Y sample positions come in as bytes in the thread payload. So,
    * read the positions using vstride=16, width=8, hstride=2.
    */
   struct brw_reg sample_pos_reg =
      stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
                    BRW_REGISTER_TYPE_B), 16, 8, 2);

   if (dispatch_width == 8) {
      abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
   } else {
      abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
      abld.half(1).MOV(half(int_sample_x, 1),
                       fs_reg(suboffset(sample_pos_reg, 16)));
   }
   /* Compute gl_SamplePosition.x */
   compute_sample_position(pos, int_sample_x);
   pos = offset(pos, abld, 1);
   if (dispatch_width == 8) {
      abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
   } else {
      abld.half(0).MOV(half(int_sample_y, 0),
                       fs_reg(suboffset(sample_pos_reg, 1)));
      abld.half(1).MOV(half(int_sample_y, 1),
                       fs_reg(suboffset(sample_pos_reg, 17)));
   }
   /* Compute gl_SamplePosition.y */
   compute_sample_position(pos, int_sample_y);
   return reg;
}

fs_reg *
fs_visitor::emit_sampleid_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   assert(devinfo->gen >= 6);

   const fs_builder abld = bld.annotate("compute sample id");
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));

   if (key->compute_sample_id) {
      fs_reg t1 = vgrf(glsl_type::int_type);
      fs_reg t2 = vgrf(glsl_type::int_type);
      t2.type = BRW_REGISTER_TYPE_UW;

      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
       * 8x multisampling, subspan 0 will represent sample N (where N
       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
       * 7. We can find the value of N by looking at R0.0 bits 7:6
       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
       * (since samples are always delivered in pairs). That is, we
       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
       * populating a temporary variable with the sequence (0, 1, 2, 3),
       * and then reading from it using vstride=1, width=4, hstride=0.
       * These computations hold good for 4x multisampling as well.
       *
       * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
       * the first four slots are sample 0 of subspan 0; the next four
       * are sample 1 of subspan 0; the third group is sample 0 of
       * subspan 1, and finally sample 1 of subspan 1.
       */
      abld.exec_all()
          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
               fs_reg(0xc0));
      abld.exec_all().SHR(t1, t1, fs_reg(5));

      /* This works for both SIMD8 and SIMD16 */
      abld.exec_all()
          .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));

      /* This special instruction takes care of setting vstride=1,
       * width=4, hstride=0 of t2 during an ADD instruction.
       */
      abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
   } else {
      /* As per GL_ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SampleID will always be zero."
       */
      abld.MOV(*reg, fs_reg(0));
   }

   return reg;
}
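
/* Worked example of the SSPI math above (a sketch): if R0.0 bits 7:6 hold
 * SSPI = 2, then (R0.0 & 0xc0) >> 5 = 0x80 >> 5 = 4 = 2*SSPI.  Adding the
 * V-immediate sequence (0, 0, 0, 0, 1, 1, 1, 1) yields sample IDs
 * 4,4,4,4,5,5,5,5 across the two SIMD8 subspans, i.e. 2*SSPI and 2*SSPI+1.
 */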

fs_reg
fs_visitor::resolve_source_modifiers(const fs_reg &src)
{
   if (!src.abs && !src.negate)
      return src;

   fs_reg temp = bld.vgrf(src.type);
   bld.MOV(temp, src);

   return temp;
}

void
fs_visitor::emit_discard_jump()
{
   assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);

   /* For performance, after a discard, jump to the end of the
    * shader if all relevant channels have been discarded.
    */
   fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
   discard_jump->flag_subreg = 1;

   discard_jump->predicate = (dispatch_width == 8)
                             ? BRW_PREDICATE_ALIGN1_ANY8H
                             : BRW_PREDICATE_ALIGN1_ANY16H;
   discard_jump->predicate_inverse = true;
}
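
/* Reading the predicate above: ANY8H/ANY16H fires if any channel in the
 * 8- or 16-channel group has its flag bit set; with predicate_inverse the
 * jump is taken only when no channel is still live, i.e. the whole group
 * has been discarded.
 */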

void
fs_visitor::assign_curb_setup()
{
   if (dispatch_width == 8) {
      prog_data->dispatch_grf_start_reg = payload.num_regs;
   } else {
      if (stage == MESA_SHADER_FRAGMENT) {
         brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
         prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
      } else if (stage == MESA_SHADER_COMPUTE) {
         brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
         prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
      } else {
         unreachable("Unsupported shader type!");
      }
   }

   prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
            int constant_nr;
            if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = push_constant_loc[uniform_nr];
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                *  values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            assert(inst->src[i].stride == 0);
            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].subreg_offset);
         }
      }
   }

   /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
   this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
}
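
/* Push constants are packed eight floats to a GRF, so uniform slot N maps
 * to GRF (payload.num_regs + N / 8), subregister N % 8 -- e.g. slot 11
 * lands in the second constant register at subregister 3.
 */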

void
fs_visitor::calculate_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   memset(prog_data->urb_setup, -1,
          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (devinfo->gen >= 6) {
      if (_mesa_bitcount_64(nir->info.inputs_read &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               prog_data->urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         /* For separate shader programs the VUE map uses a fixed,
          * location-based layout, so swapping in a different vertex or
          * geometry shader doesn't force a recompile here.
          */
         brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                             key->input_slots_valid,
                             nir->info.separate_shader);
         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
             * unused.
             */
            if (varying != BRW_VARYING_SLOT_COUNT &&
                (nir->info.inputs_read & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               prog_data->urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (key->input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to. In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader. So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               prog_data->urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /* gl_PointCoord is a FS-only attribute, and the SF thread already
       * interpolated it for us, so count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (nir->info.inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   prog_data->num_varying_inputs = urb_next;
}

void
fs_visitor::assign_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;

   int urb_start = payload.num_regs + prog_data->base.curb_read_length;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[1].file == HW_REG);
         inst->src[1].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
}

void
fs_visitor::assign_vs_urb_setup()
{
   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
   int grf, count, slot, channel, attr;

   assert(stage == MESA_SHADER_VERTEX);
   count = _mesa_bitcount_64(vs_prog_data->inputs_read);
   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
      count++;

   /* Each attribute is 4 regs. */
   this->first_non_payload_grf += count * 4;

   unsigned vue_entries =
      MAX2(count, vs_prog_data->base.vue_map.num_slots);

   vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
   vs_prog_data->base.urb_read_length = (count + 1) / 2;

   assert(vs_prog_data->base.urb_read_length <= 15);

   /* Rewrite all ATTR file references to the hw grf that they land in. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == ATTR) {

            if (inst->src[i].reg == VERT_ATTRIB_MAX) {
               slot = count - 1;
            } else {
               /* Attributes come in a contiguous block, ordered by their
                * gl_vert_attrib value. That means we can compute the slot
                * number for an attribute by masking out the enabled
                * attributes before it and counting the bits.
                */
               attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
               slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
                                        BITFIELD64_MASK(attr));
            }

            channel = inst->src[i].reg_offset & 3;

            grf = payload.num_regs +
                  prog_data->curb_read_length +
                  slot * 4 + channel;

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg =
               stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
                                  inst->src[i].subreg_offset),
                      inst->exec_size * inst->src[i].stride,
                      inst->exec_size, inst->src[i].stride);
         }
      }
   }
}
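
/* Slot computation example (a sketch): with inputs_read = 0b1011 and
 * attr = 3, BITFIELD64_MASK(3) keeps bits 0..2, so
 * popcount(0b1011 & 0b0111) = 2: attribute 3 is preceded by two enabled
 * attributes and lands in slot 2.
 */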

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->alloc.count;

   /* Count the total number of registers */
   int reg_count = 0;
   int vgrf_to_reg[num_vars];
   for (int i = 0; i < num_vars; i++) {
      vgrf_to_reg[i] = reg_count;
      reg_count += alloc.sizes[i];
   }

   /* An array of "split points".  For each register slot, this indicates
    * if this slot can be separated from the previous slot.  Every time an
    * instruction uses multiple elements of a register (as a source or
    * destination), we mark the used slots as inseparable.  Then we go
    * through and split the registers into the smallest pieces we can.
    */
   bool split_points[reg_count];
   memset(split_points, 0, sizeof(split_points));

   /* Mark all used registers as fully splittable */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         int reg = vgrf_to_reg[inst->dst.reg];
         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
            split_points[reg + j] = true;
      }

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            int reg = vgrf_to_reg[inst->src[i].reg];
            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
               split_points[reg + j] = true;
         }
      }
   }

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
         for (int j = 1; j < inst->regs_written; j++)
            split_points[reg + j] = false;
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
            for (int j = 1; j < inst->regs_read(i); j++)
               split_points[reg + j] = false;
         }
      }
   }

   int new_virtual_grf[reg_count];
   int new_reg_offset[reg_count];

   int reg = 0;
   for (int i = 0; i < num_vars; i++) {
      /* The first one should always be 0 as a quick sanity check. */
      assert(split_points[reg] == false);

      /* j = 0 case */
      new_reg_offset[reg] = 0;
      reg++;
      int offset = 1;

      /* j > 0 case */
      for (unsigned j = 1; j < alloc.sizes[i]; j++) {
         /* If this is a split point, reset the offset to 0 and allocate a
          * new virtual GRF for the previous offset many registers
          */
         if (split_points[reg]) {
            assert(offset <= MAX_VGRF_SIZE);
            int grf = alloc.allocate(offset);
            for (int k = reg - offset; k < reg; k++)
               new_virtual_grf[k] = grf;
            offset = 0;
         }
         new_reg_offset[reg] = offset;
         offset++;
         reg++;
      }

      /* The last one gets the original register number */
      assert(offset <= MAX_VGRF_SIZE);
      alloc.sizes[i] = offset;
      for (int k = reg - offset; k < reg; k++)
         new_virtual_grf[k] = i;
   }
   assert(reg == reg_count);

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
         inst->dst.reg = new_virtual_grf[reg];
         inst->dst.reg_offset = new_reg_offset[reg];
         assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
            inst->src[i].reg = new_virtual_grf[reg];
            inst->src[i].reg_offset = new_reg_offset[reg];
            assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
         }
      }
   }
   invalidate_live_intervals();
}
|
|
|
|
|
|
|
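
/* A worked example of the split-point bookkeeping above (illustrative only,
 * not from the original source): suppose VGRF 0 has size 4 and its only
 * multi-register access is an instruction writing 2 registers at
 * reg_offset 0.  The first walk marks slots 1..3 as split points; the
 * second walk clears slot 1, since slots 0..1 are covered by one write.
 * The rewrite then produces a fresh 2-register VGRF for slots 0..1, a
 * fresh 1-register VGRF for slot 2, and keeps the original VGRF number
 * (resized to 1 register) for slot 3, so each piece gets its own live
 * interval.
 */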

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
bool
fs_visitor::compact_virtual_grfs()
{
   bool progress = false;
   int remap_table[this->alloc.count];
   memset(remap_table, -1, sizeof(remap_table));

   /* Mark which virtual GRFs are used. */
   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (unsigned i = 0; i < this->alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register.  This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         alloc.sizes[new_index] = alloc.sizes[i];
         invalidate_live_intervals();
         ++new_index;
      }
   }

   this->alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to delta_xy, since they're used in register
    * allocation.  If they're unused, switch them to BAD_FILE so we don't
    * think some random VGRF is delta_xy.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
      if (delta_xy[i].file == GRF) {
         if (remap_table[delta_xy[i].reg] != -1) {
            delta_xy[i].reg = remap_table[delta_xy[i].reg];
         } else {
            delta_xy[i].file = BAD_FILE;
         }
      }
   }

   return progress;
}
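
/* Illustrative remap (not from the original source): with five VGRFs of
 * which only 0, 2 and 4 are referenced, the mark pass leaves remap_table
 * as { 0, -1, 0, -1, 0 }; the compaction pass turns it into
 * { 0, -1, 1, -1, 2 }, every GRF reference is renumbered accordingly, and
 * alloc.count drops from 5 to 3.
 */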

/**
 * Assign UNIFORM file registers to either push constants or pull constants.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.  We also use pull constants for all
 * indirect constant loads because we don't support indirect accesses in
 * registers yet.
 */
void
fs_visitor::assign_constant_locations()
{
   /* Only the first compile (SIMD8 mode) gets to decide on locations. */
   if (dispatch_width != 8)
      return;

   unsigned int num_pull_constants = 0;

   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
   memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);

   bool is_live[uniforms];
   memset(is_live, 0, sizeof(is_live));

   /* First, we walk through the instructions and do two things:
    *
    *  1) Figure out which uniforms are live.
    *
    *  2) Find all indirect accesses of uniform arrays and flag them as
    *     needing to go into the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         if (inst->src[i].reladdr) {
            int uniform = inst->src[i].reg;

            /* If this array isn't already present in the pull constant
             * buffer, add it.
             */
            if (pull_constant_loc[uniform] == -1) {
               assert(param_size[uniform]);
               for (int j = 0; j < param_size[uniform]; j++)
                  pull_constant_loc[uniform + j] = num_pull_constants++;
            }
         } else {
            /* Mark the one accessed uniform as live. */
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            if (constant_nr >= 0 && constant_nr < (int) uniforms)
               is_live[constant_nr] = true;
         }
      }
   }

   /* Only allow 16 registers (128 uniform components) as push constants.
    *
    * Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c.
    */
   unsigned int max_push_components = 16 * 8;
   unsigned int num_push_constants = 0;

   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);

   for (unsigned int i = 0; i < uniforms; i++) {
      if (!is_live[i] || pull_constant_loc[i] != -1) {
         /* This UNIFORM register is either dead, or has already been demoted
          * to a pull const.  Mark it as no longer living in the param[]
          * array.
          */
         push_constant_loc[i] = -1;
         continue;
      }

      if (num_push_constants < max_push_components) {
         /* Retain as a push constant.  Record the location in the param[]
          * array.
          */
         push_constant_loc[i] = num_push_constants++;
      } else {
         /* Demote to a pull constant. */
         push_constant_loc[i] = -1;
         pull_constant_loc[i] = num_pull_constants++;
      }
   }

   stage_prog_data->nr_params = num_push_constants;
   stage_prog_data->nr_pull_params = num_pull_constants;

   /* Up until now, the param[] array has been indexed by reg + reg_offset
    * of UNIFORM registers.  Move pull constants into pull_param[] and
    * condense param[] to only contain the uniforms we chose to push.
    *
    * NOTE: Because we are condensing the param[] array, we know that
    * push_constant_loc[i] <= i and we can do it in one smooth loop without
    * having to make a copy.
    */
   for (unsigned int i = 0; i < uniforms; i++) {
      const gl_constant_value *value = stage_prog_data->param[i];

      if (pull_constant_loc[i] != -1) {
         stage_prog_data->pull_param[pull_constant_loc[i]] = value;
      } else if (push_constant_loc[i] != -1) {
         stage_prog_data->param[push_constant_loc[i]] = value;
      }
   }
}
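
/* Illustrative budget (numbers not from the original source): with 200
 * live, directly-indexed uniform components and max_push_components =
 * 16 * 8 = 128, the first 128 stay push constants and the remaining 72
 * are demoted to the pull constant buffer, in addition to any uniform
 * arrays already pulled because of reladdr access.
 */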

/**
 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
 */
void
fs_visitor::demote_pull_constants()
{
   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index;
         unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
         if (location >= uniforms) /* Out of bounds access */
            pull_index = -1;
         else
            pull_index = pull_constant_loc[location];

         if (pull_index == -1)
            continue;

         /* Set up the annotation tracking for newly generated instructions. */
         const fs_builder ibld(this, block, inst);
         fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
         fs_reg dst = vgrf(glsl_type::float_type);

         assert(inst->src[i].stride == 0);

         /* Generate a pull load into dst. */
         if (inst->src[i].reladdr) {
            VARYING_PULL_CONSTANT_LOAD(ibld, dst,
                                       surf_index,
                                       *inst->src[i].reladdr,
                                       pull_index);
            inst->src[i].reladdr = NULL;
            inst->src[i].stride = 1;
         } else {
            const fs_builder ubld = ibld.exec_all().group(8, 0);
            fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                      dst, surf_index, offset);
            inst->src[i].set_smear(pull_index & 3);
         }

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
      }
   }
   invalidate_live_intervals();
}
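
/* Worked example of the non-reladdr addressing above (illustrative only):
 * for pull_index = 13 the load offset is (13 * 4) & ~15 = 48, i.e. the
 * 16-byte-aligned block containing the constant, and set_smear(13 & 3) =
 * set_smear(1) makes the rewritten source read the second dword of that
 * block in every channel.
 */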

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (inst->src[0].file != IMM)
            break;

         if (inst->saturate) {
            if (inst->dst.type != inst->src[0].type)
               assert(!"unimplemented: saturate mixed types");

            if (brw_saturate_immediate(inst->dst.type,
                                       &inst->src[0].fixed_hw_reg)) {
               inst->saturate = false;
               progress = true;
            }
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * -1.0 = -a */
         if (inst->src[1].is_negative_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].negate = !inst->src[0].negate;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         if (inst->src[0].file == IMM) {
            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         if (inst->src[0].file == IMM) {
            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_OR:
         if (inst->src[0].equals(inst->src[1])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_LRP:
         if (inst->src[1].equals(inst->src[2])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            inst->src[2] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_CMP:
         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
             inst->src[0].abs &&
             inst->src[0].negate &&
             inst->src[1].is_zero()) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            inst->conditional_mod = BRW_CONDITIONAL_Z;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_SEL:
         if (inst->src[0].equals(inst->src[1])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            inst->predicate = BRW_PREDICATE_NONE;
            inst->predicate_inverse = false;
            progress = true;
         } else if (inst->saturate && inst->src[1].file == IMM) {
            switch (inst->conditional_mod) {
            case BRW_CONDITIONAL_LE:
            case BRW_CONDITIONAL_L:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
               break;
            case BRW_CONDITIONAL_GE:
            case BRW_CONDITIONAL_G:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
            default:
               break;
            }
         }
         break;
      case BRW_OPCODE_MAD:
         if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[0].is_zero()) {
            inst->opcode = BRW_OPCODE_MUL;
            inst->src[0] = inst->src[2];
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_ADD;
            inst->src[1] = inst->src[2];
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[2].is_one()) {
            inst->opcode = BRW_OPCODE_ADD;
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
            inst->opcode = BRW_OPCODE_ADD;
            inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
            inst->src[2] = reg_undef;
            progress = true;
         }
         break;
      case SHADER_OPCODE_RCP: {
         fs_inst *prev = (fs_inst *)inst->prev;
         if (prev->opcode == SHADER_OPCODE_SQRT) {
            if (inst->src[0].equals(prev->dst)) {
               inst->opcode = SHADER_OPCODE_RSQ;
               inst->src[0] = prev->src[0];
               progress = true;
            }
         }
         break;
      }
      case SHADER_OPCODE_BROADCAST:
         if (is_uniform(inst->src[0])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         } else if (inst->src[1].file == IMM) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = component(inst->src[0],
                                     inst->src[1].fixed_hw_reg.dw1.ud);
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }

      /* Swap if src[0] is immediate. */
      if (progress && inst->is_commutative()) {
         if (inst->src[0].file == IMM) {
            fs_reg tmp = inst->src[1];
            inst->src[1] = inst->src[0];
            inst->src[0] = tmp;
         }
      }
   }
   return progress;
}
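
/* A few of the rewrites above, in illustrative pseudo-assembly (operand
 * order src0, src1, src2 for MAD, where dst = src0 + src1 * src2):
 *
 *    mul dst, a, 1.0F          ->  mov dst, a
 *    mad dst, a, b, 0.0F       ->  mov dst, a        (product term is zero)
 *    sqrt t, y; rcp dst, t     ->  sqrt t, y; rsq dst, y
 *
 * The final swap keeps immediates out of src[0] of commutative opcodes,
 * since the hardware only accepts an immediate as the last source operand.
 */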

/**
 * Optimize sample messages that have constant zero values for the trailing
 * texture coordinates.  We can just reduce the message length for these
 * instructions instead of reserving a register for it.  Trailing parameters
 * that aren't sent default to zero anyway.  This will cause the dead code
 * eliminator to remove the MOV instruction that would otherwise be emitted to
 * set up the zero value.
 */
bool
fs_visitor::opt_zero_samples()
{
   /* Gen4 infers the texturing opcode based on the message length so we can't
    * change it.
    */
   if (devinfo->gen < 5)
      return false;

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (!inst->is_tex())
         continue;

      fs_inst *load_payload = (fs_inst *) inst->prev;

      if (load_payload->is_head_sentinel() ||
          load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *     "Parameter 0 is required except for the sampleinfo message,
       *      which has no parameter 0"
       */
      while (inst->mlen > inst->header_size + inst->exec_size / 8 &&
             load_payload->src[(inst->mlen - inst->header_size) /
                               (inst->exec_size / 8) +
                               inst->header_size - 1].is_zero()) {
         inst->mlen -= inst->exec_size / 8;
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
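
/* Illustrative SIMD8 case (exec_size = 8, so each parameter occupies one
 * register; header_size = 1): a sample message with mlen = 4 carries the
 * header plus three parameters in load_payload->src[1..3].  If src[3] is
 * an immediate zero, the loop shrinks mlen to 3 (and keeps going while
 * trailing parameters are zero), after which dead code elimination removes
 * the MOV that set up the zero.
 */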

/**
 * Optimize sample messages which are followed by the final RT write.
 *
 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
 * results sent directly to the framebuffer, bypassing the EU.  Recognize the
 * final texturing results copied to the framebuffer write payload and modify
 * them to write to the framebuffer directly.
 */
bool
fs_visitor::opt_sampler_eot()
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   if (stage != MESA_SHADER_FRAGMENT)
      return false;

   if (devinfo->gen < 9 && !devinfo->is_cherryview)
      return false;

   /* FINISHME: It should be possible to implement this optimization when
    * there are multiple drawbuffers.
    */
   if (key->nr_color_regions != 1)
      return false;

   /* Look for a texturing instruction immediately before the final
    * FB_WRITE.
    */
   bblock_t *block = cfg->blocks[cfg->num_blocks - 1];
   fs_inst *fb_write = (fs_inst *)block->end();
   assert(fb_write->eot);
   assert(fb_write->opcode == FS_OPCODE_FB_WRITE);

   fs_inst *tex_inst = (fs_inst *) fb_write->prev;

   /* There wasn't one; nothing to do. */
   if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
      return false;

   /* This optimisation doesn't seem to work for textureGather for some
    * reason.  I can't find any documentation or known workarounds to
    * indicate that this is expected, but considering that it is probably
    * pretty unlikely that a shader would directly write out the results
    * from textureGather we might as well just disable it.
    */
   if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
       tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
      return false;

   /* If there's no header present, we need to munge the LOAD_PAYLOAD as
    * well.  It's very likely to be the previous instruction.
    */
   fs_inst *load_payload = (fs_inst *) tex_inst->prev;
   if (load_payload->is_head_sentinel() ||
       load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
      return false;

   assert(!tex_inst->eot); /* We can't get here twice */
   assert((tex_inst->offset & (0xff << 24)) == 0);

   const fs_builder ibld(this, block, tex_inst);

   tex_inst->offset |= fb_write->target << 24;
   tex_inst->eot = true;
   tex_inst->dst = ibld.null_reg_ud();
   fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);

   /* If a header is present, marking the eot is sufficient.  Otherwise, we
    * need to create a new LOAD_PAYLOAD command with the same sources and a
    * space saved for the header.  Using a new destination register not only
    * makes sure we have enough space, but it will make sure the dead code
    * eliminator kills the instruction that this will replace.
    */
   if (tex_inst->header_size != 0)
      return true;

   fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F,
                                  load_payload->sources + 1);
   fs_reg *new_sources =
      ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);

   new_sources[0] = fs_reg();
   for (int i = 0; i < load_payload->sources; i++)
      new_sources[i+1] = load_payload->src[i];

   /* The LOAD_PAYLOAD helper seems like the obvious choice here.  However,
    * it requires a lot of information about the sources to appropriately
    * figure out the number of registers needed to be used.  Given this stage
    * in our optimization, we may not have the appropriate GRFs required by
    * LOAD_PAYLOAD at this point (copy propagation).  Therefore, we need to
    * manually emit the instruction.
    */
   fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
                                                    load_payload->exec_size,
                                                    send_header,
                                                    new_sources,
                                                    load_payload->sources + 1);

   new_load_payload->regs_written = load_payload->regs_written + 1;
   new_load_payload->header_size = 1;
   tex_inst->mlen++;
   tex_inst->header_size = 1;
   tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
   tex_inst->src[0] = send_header;

   return true;
}
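
/* At the source level the pattern recognized above is roughly (illustrative
 * GLSL, not from the original file):
 *
 *    gl_FragColor = texture(tex, uv);
 *
 * i.e. the last sampler result feeds the sole framebuffer write, so the
 * sampler SEND can deliver its result straight to the render target with
 * EOT set and the trailing FB_WRITE is deleted.
 */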

bool
fs_visitor::opt_register_renaming()
{
   bool progress = false;
   int depth = 0;

   int remap[alloc.count];
   memset(remap, -1, sizeof(int) * alloc.count);

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
         depth++;
      } else if (inst->opcode == BRW_OPCODE_ENDIF ||
                 inst->opcode == BRW_OPCODE_WHILE) {
         depth--;
      }

      /* Rewrite instruction sources. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF &&
             remap[inst->src[i].reg] != -1 &&
             remap[inst->src[i].reg] != inst->src[i].reg) {
            inst->src[i].reg = remap[inst->src[i].reg];
            progress = true;
         }
      }

      const int dst = inst->dst.reg;

      if (depth == 0 &&
          inst->dst.file == GRF &&
          alloc.sizes[inst->dst.reg] == inst->exec_size / 8 &&
          !inst->is_partial_write()) {
         if (remap[dst] == -1) {
            remap[dst] = dst;
         } else {
            remap[dst] = alloc.allocate(inst->exec_size / 8);
            inst->dst.reg = remap[dst];
            progress = true;
         }
      } else if (inst->dst.file == GRF &&
                 remap[dst] != -1 &&
                 remap[dst] != dst) {
         inst->dst.reg = remap[dst];
         progress = true;
      }
   }

   if (progress) {
      invalidate_live_intervals();

      for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
         if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
            delta_xy[i].reg = remap[delta_xy[i].reg];
         }
      }
   }

   return progress;
}
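
/* Illustrative renaming (not from the original source): outside control
 * flow,
 *
 *    add vgrf5, a, b      <- first full write: remap[5] = 5
 *    ...
 *    mul vgrf5, c, d      <- second full write: vgrf5 gets a fresh VGRF
 *
 * so the two unrelated values end up with independent live intervals,
 * much like SSA renaming, which helps scheduling and register allocation.
 */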

/**
 * Remove redundant or useless discard jumps.
 *
 * For example, we can eliminate jumps in the following sequence:
 *
 *    discard-jump       (redundant with the next jump)
 *    discard-jump       (useless; jumps to the next instruction)
 *    placeholder-halt
 */
bool
fs_visitor::opt_redundant_discard_jumps()
{
   bool progress = false;

   bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];

   fs_inst *placeholder_halt = NULL;
   foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
         placeholder_halt = inst;
         break;
      }
   }

   if (!placeholder_halt)
      return false;

   /* Delete any HALTs immediately before the placeholder halt. */
   for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
        !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
        prev = (fs_inst *) placeholder_halt->prev) {
      prev->remove(last_bblock);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
2010-10-08 14:00:14 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::compute_to_mrf()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
2014-07-15 12:56:37 -07:00
|
|
|
|
int next_ip = 0;
|
2010-10-08 14:00:14 -07:00
|
|
|
|
|
2014-10-29 14:21:14 -07:00
|
|
|
|
/* No MRFs on Gen >= 7. */
|
2015-04-15 18:00:05 -07:00
|
|
|
|
if (devinfo->gen >= 7)
|
2014-10-29 14:21:14 -07:00
|
|
|
|
return false;
|
|
|
|
|
|
|
2011-01-12 10:10:01 -08:00
|
|
|
|
calculate_live_intervals();
|
|
|
|
|
|
|
2014-09-03 23:52:26 -07:00
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
2010-10-08 14:00:14 -07:00
|
|
|
|
int ip = next_ip;
|
|
|
|
|
|
next_ip++;
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->opcode != BRW_OPCODE_MOV ||
|
2013-04-11 09:54:41 -07:00
|
|
|
|
inst->is_partial_write() ||
|
2010-10-08 14:00:14 -07:00
|
|
|
|
inst->dst.file != MRF || inst->src[0].file != GRF ||
|
|
|
|
|
|
inst->dst.type != inst->src[0].type ||
|
2013-12-08 04:57:08 +01:00
|
|
|
|
inst->src[0].abs || inst->src[0].negate ||
|
2014-01-15 22:21:30 +01:00
|
|
|
|
!inst->src[0].is_contiguous() ||
|
2014-07-15 12:56:37 -07:00
|
|
|
|
inst->src[0].subreg_offset)
|
2010-10-08 14:00:14 -07:00
|
|
|
|
continue;
|
|
|
|
|
|
|
2011-03-28 16:54:39 -07:00
|
|
|
|
/* Work out which hardware MRF registers are written by this
|
|
|
|
|
|
* instruction.
|
|
|
|
|
|
*/
|
2011-05-15 09:36:19 -07:00
|
|
|
|
int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
|
2011-03-28 16:54:39 -07:00
|
|
|
|
int mrf_high;
|
2011-05-15 09:36:19 -07:00
|
|
|
|
if (inst->dst.reg & BRW_MRF_COMPR4) {
|
2011-03-28 16:54:39 -07:00
|
|
|
|
mrf_high = mrf_low + 4;
|
2014-08-16 11:34:56 -07:00
|
|
|
|
} else if (inst->exec_size == 16) {
|
2011-03-28 16:54:39 -07:00
|
|
|
|
mrf_high = mrf_low + 1;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
mrf_high = mrf_low;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2010-10-08 14:00:14 -07:00
|
|
|
|
/* Can't compute-to-MRF this GRF if someone else was going to
|
|
|
|
|
|
* read it later.
|
|
|
|
|
|
*/
|
2013-04-30 15:00:40 -07:00
|
|
|
|
if (this->virtual_grf_end[inst->src[0].reg] > ip)
|
2010-10-08 14:00:14 -07:00
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
/* Found a move of a GRF to a MRF. Let's see if we can go
|
|
|
|
|
|
* rewrite the thing that made this GRF to write into the MRF.
|
|
|
|
|
|
*/
|
2014-09-02 10:08:24 -07:00
|
|
|
|
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
|
2010-10-08 14:00:14 -07:00
|
|
|
|
if (scan_inst->dst.file == GRF &&
|
|
|
|
|
|
scan_inst->dst.reg == inst->src[0].reg) {
|
|
|
|
|
|
/* Found the last thing to write our reg we want to turn
|
|
|
|
|
|
* into a compute-to-MRF.
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
2012-06-04 08:59:00 -07:00
|
|
|
|
/* If this one instruction didn't populate all the
|
|
|
|
|
|
* channels, bail. We might be able to rewrite everything
|
2011-03-28 16:54:39 -07:00
|
|
|
|
* that writes that reg, but it would require smarter
|
|
|
|
|
|
* tracking to delay the rewriting until complete success.
|
2010-10-08 14:00:14 -07:00
|
|
|
|
*/
|
2012-06-04 08:59:00 -07:00
|
|
|
|
if (scan_inst->is_partial_write())
|
2010-10-08 14:00:14 -07:00
|
|
|
|
break;
|
|
|
|
|
|
|
2014-07-15 12:56:37 -07:00
|
|
|
|
/* Things returning more than one register would need us to
|
|
|
|
|
|
* understand coalescing out more than one MOV at a time.
|
|
|
|
|
|
*/
|
2015-06-18 13:49:22 -07:00
|
|
|
|
if (scan_inst->regs_written > scan_inst->exec_size / 8)
|
2014-07-15 12:56:37 -07:00
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
/* SEND instructions can't have MRF as a destination. */
|
|
|
|
|
|
if (scan_inst->mlen)
|
2010-10-08 14:00:14 -07:00
|
|
|
|
break;
|
|
|
|
|
|
|
2015-04-15 18:00:05 -07:00
|
|
|
|
if (devinfo->gen == 6) {
|
2010-10-11 13:38:38 -07:00
|
|
|
|
/* gen6 math instructions must have the destination be
|
|
|
|
|
|
* GRF, so no compute-to-MRF for them.
|
|
|
|
|
|
*/
|
2011-01-18 22:48:11 -08:00
|
|
|
|
if (scan_inst->is_math()) {
|
2010-10-11 13:38:38 -07:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2010-10-08 14:00:14 -07:00
|
|
|
|
if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
|
|
|
|
|
|
/* Found the creator of our MRF's source value. */
|
2010-11-18 15:03:50 +08:00
|
|
|
|
scan_inst->dst.file = MRF;
|
2011-05-15 09:36:19 -07:00
|
|
|
|
scan_inst->dst.reg = inst->dst.reg;
|
2010-11-18 15:03:50 +08:00
|
|
|
|
scan_inst->saturate |= inst->saturate;
|
2014-09-03 23:52:26 -07:00
|
|
|
|
inst->remove(block);
|
2010-11-18 15:03:50 +08:00
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2013-02-05 15:36:18 -08:00
|
|
|
|
/* We don't handle control flow here. Most computation of
|
2010-11-18 15:03:50 +08:00
|
|
|
|
* values that end up in MRFs are shortly before the MRF
|
|
|
|
|
|
* write anyway.
|
|
|
|
|
|
*/
|
2014-09-01 15:01:23 -07:00
|
|
|
|
if (block->start() == scan_inst)
|
2010-11-18 15:03:50 +08:00
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
/* You can't read from an MRF, so if someone else reads our
|
|
|
|
|
|
* MRF's source GRF that we wanted to rewrite, that stops us.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool interfered = false;
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0; i < scan_inst->sources; i++) {
|
2010-11-18 15:03:50 +08:00
|
|
|
|
if (scan_inst->src[i].file == GRF &&
|
|
|
|
|
|
scan_inst->src[i].reg == inst->src[0].reg &&
|
|
|
|
|
|
scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
|
|
|
|
|
|
interfered = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if (interfered)
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
2011-03-28 16:54:39 -07:00
|
|
|
|
if (scan_inst->dst.file == MRF) {
|
|
|
|
|
|
/* If somebody else writes our MRF here, we can't
|
2010-11-18 15:03:50 +08:00
|
|
|
|
* compute-to-MRF before that.
|
|
|
|
|
|
*/
|
2011-05-15 09:36:19 -07:00
|
|
|
|
int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
|
2011-03-28 16:54:39 -07:00
|
|
|
|
int scan_mrf_high;
|
|
|
|
|
|
|
2011-05-15 09:36:19 -07:00
|
|
|
|
if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
|
2011-03-28 16:54:39 -07:00
|
|
|
|
scan_mrf_high = scan_mrf_low + 4;
|
2014-08-16 11:34:56 -07:00
|
|
|
|
} else if (scan_inst->exec_size == 16) {
|
2011-03-28 16:54:39 -07:00
|
|
|
|
scan_mrf_high = scan_mrf_low + 1;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
scan_mrf_high = scan_mrf_low;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (mrf_low == scan_mrf_low ||
|
|
|
|
|
|
mrf_low == scan_mrf_high ||
|
|
|
|
|
|
mrf_high == scan_mrf_low ||
|
|
|
|
|
|
mrf_high == scan_mrf_high) {
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2010-11-18 15:03:50 +08:00
|
|
|
|
}

         if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
bool
fs_visitor::eliminate_find_live_channel()
{
   bool progress = false;
   unsigned depth = 0;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case FS_OPCODE_DISCARD_JUMP:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         return progress;
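
      /* Outside of all control flow (depth == 0), every channel that was
       * enabled at thread dispatch is still enabled, so the first live
       * channel is always channel 0 and FIND_LIVE_CHANNEL folds to a MOV of
       * the constant 0 below.
       */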
      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = fs_reg(0);
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   return progress;
}

/**
 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
 * instructions to FS_OPCODE_REP_FB_WRITE.
 */
void
fs_visitor::emit_repclear_shader()
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   int base_mrf = 1;
   int color_mrf = base_mrf + 2;
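
   /* The message starts at MRF 1.  When a header is needed (more than one
    * render target) it occupies the first two message registers, so the
    * replicated color is staged two registers up, at MRF 3.
    */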

   fs_inst *mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
                                     fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));

   fs_inst *write;
   if (key->nr_color_regions == 1) {
      write = bld.emit(FS_OPCODE_REP_FB_WRITE);
      write->saturate = key->clamp_fragment_color;
      write->base_mrf = color_mrf;
      write->target = 0;
      write->header_size = 0;
      write->mlen = 1;
   } else {
      assume(key->nr_color_regions > 0);
      for (int i = 0; i < key->nr_color_regions; ++i) {
         write = bld.emit(FS_OPCODE_REP_FB_WRITE);
         write->saturate = key->clamp_fragment_color;
         write->base_mrf = base_mrf;
         write->target = i;
         write->header_size = 2;
         write->mlen = 3;
      }
   }
   write->eot = true;

   calculate_cfg();

   assign_constant_locations();
   assign_curb_setup();

   /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
   assert(mov->src[0].file == HW_REG);
   mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
}

/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->is_control_flow()) {
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove(block);
            progress = true;
            continue;
         }
      }
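
      /* For example (register numbers here are illustrative only):
       *
       *    mov(8)  m3, vgrf5
       *    ...                   <- no intervening write to m3 or vgrf5
       *    mov(8)  m3, vgrf5     <- removed by the check above
       */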

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->is_partial_write()) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

static void
clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
{
   /* Clear the flag for registers that actually got read (as expected). */
   for (int i = 0; i < inst->sources; i++) {
      int grf;
      if (inst->src[i].file == GRF) {
         grf = inst->src[i].reg;
      } else if (inst->src[i].file == HW_REG &&
                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         grf = inst->src[i].fixed_hw_reg.nr;
      } else {
         continue;
      }

      if (grf >= first_grf &&
          grf < first_grf + grf_len) {
         deps[grf - first_grf] = false;
         if (inst->exec_size == 16)
            deps[grf - first_grf + 1] = false;
      }
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 *      check for post destination dependencies on this instruction, software
 *      must ensure that there is no destination hazard for the case of
 *      ‘write followed by a posted write’ shown in the following example.
 *
 *      1. mov r3 0
 *      2. send r3.xy <rest of send instruction>
 *      3. mov r2 r3
 *
 *      Due to no post-destination dependency check on the ‘send’, the above
 *      code sequence could have two instructions (1 and 2) in flight at the
 *      same time that both consider ‘r3’ as the target of their final
 *      writes."
 */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
                                                        fs_inst *inst)
{
   int write_len = inst->regs_written;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
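
   /* needs_dep[i] tracks whether GRF (first_write_grf + i) still has an
    * unresolved write-after-write hazard; entries are cleared below as the
    * register is seen to be read or a resolving MOV is emitted for it.
    */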

   clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (block->start() == scan_inst) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               DEP_RESOLVE_MOV(fs_builder(this, block, inst),
                               first_write_grf + i);
         }
         return;
      }

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == GRF) {
         for (int i = 0; i < scan_inst->regs_written; i++) {
            int reg = scan_inst->dst.reg + i;

            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
               needs_dep[reg - first_write_grf] = false;
               if (scan_inst->exec_size == 16)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Errata: A destination register from a send can not be
 *      used as a destination register until after it has been sourced by an
 *      instruction with a different destination register."
 */
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block,
                                                         fs_inst *inst)
{
   int write_len = inst->regs_written;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
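
   /* Same bookkeeping as in the pre-send pass: needs_dep[i] stays true
    * until the hazard on GRF (first_write_grf + i) has been resolved.
    */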
   /* Walk forwards looking for writes to registers we're writing which
    * aren't read before being written.
    */
   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (block->end() == scan_inst) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
                               first_write_grf + i);
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == GRF &&
          scan_inst->dst.reg >= first_write_grf &&
          scan_inst->dst.reg < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
         DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
                         scan_inst->dst.reg);
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

void
fs_visitor::insert_gen4_send_dependency_workarounds()
{
   if (devinfo->gen != 4 || devinfo->is_g4x)
      return;

   bool progress = false;

   /* Note that we're done with register allocation, so GRF fs_regs always
    * have a .reg_offset of 0.
    */

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->mlen != 0 && inst->dst.file == GRF) {
         insert_gen4_pre_send_dependency_workarounds(block, inst);
         insert_gen4_post_send_dependency_workarounds(block, inst);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();
}

/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
void
fs_visitor::lower_uniform_pull_constant_loads()
{
   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      if (devinfo->gen >= 7) {
         /* The offset arg before was a vec4-aligned byte offset.  We need to
          * turn it into a dword offset.
          */
         fs_reg const_offset_reg = inst->src[1];
         assert(const_offset_reg.file == IMM &&
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
         const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
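
         /* On Gen >= 7 the load ends up, roughly, as a SET_SIMD4X2_OFFSET
          * writing the dword offset into the first dword of a payload GRF,
          * followed by a send sourcing that payload (with an additional
          * header register ahead of it on Gen 9).
          */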

         fs_reg payload, offset;
         if (devinfo->gen >= 9) {
            /* We have to use a message header on Skylake to get SIMD4x2
             * mode.  Reserve space for the register.
             */
            offset = payload = fs_reg(GRF, alloc.allocate(2));
            offset.reg_offset++;
            inst->mlen = 2;
         } else {
            offset = payload = fs_reg(GRF, alloc.allocate(1));
            inst->mlen = 1;
         }

         /* This is actually going to be a MOV, but since only the first
          * dword is accessed, we have a special opcode to do just that one.
          * Note that this needs to be an operation that will be considered
          * a def by live variable analysis, or register allocation will
          * explode.
          */
         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
                                               8, offset, const_offset_reg);
         setup->force_writemask_all = true;

         setup->ir = inst->ir;
         setup->annotation = inst->annotation;
         inst->insert_before(block, setup);

         /* Similarly, this will only populate the first 4 channels of the
          * result register (since we only use smear values from 0-3), but we
          * don't tell the optimizer.
          */
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;
         inst->base_mrf = -1;

         invalidate_live_intervals();
      } else {
         /* Before register allocation, we didn't tell the scheduler about
          * the MRF we use.  We know it's safe to use this MRF because
          * nothing else does except for register spill/unspill, which
          * generates and uses its MRF within a single IR instruction.
          */
         inst->base_mrf = 14;
         inst->mlen = 1;
      }
   }
}

bool
fs_visitor::lower_load_payload()
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == MRF || inst->dst.file == GRF);
      assert(inst->saturate == false);
      fs_reg dst = inst->dst;

      /* Get rid of COMPR4.  We'll add it back in if we need it */
      if (dst.file == MRF)
         dst.reg = dst.reg & ~BRW_MRF_COMPR4;

      const fs_builder ibld(this, block, inst);
      const fs_builder hbld = ibld.exec_all().group(8, 0);
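
      /* Header sources are required to be a single register each; they are
       * copied with force_writemask_all in SIMD8 (the hbld builder), no
       * matter what the instruction's own execution size is.
       */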
      for (uint8_t i = 0; i < inst->header_size; i++) {
         if (inst->src[i].file != BAD_FILE) {
            fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
            fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
            hbld.MOV(mov_dst, mov_src);
         }
         dst = offset(dst, hbld, 1);
      }

      if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
          inst->exec_size > 8) {
         /* In this case, the payload portion of the LOAD_PAYLOAD isn't
          * a straightforward copy.  Instead, the result of the
          * LOAD_PAYLOAD is treated as interleaved and the first four
          * non-header sources are unpacked as:
          *
          * m + 0: r0
          * m + 1: g0
          * m + 2: b0
          * m + 3: a0
          * m + 4: r1
          * m + 5: g1
          * m + 6: b1
          * m + 7: a1
          *
          * This is used for gen <= 5 fb writes.
          */
         assert(inst->exec_size == 16);
         assert(inst->header_size + 4 <= inst->sources);
         for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
            if (inst->src[i].file != BAD_FILE) {
               if (devinfo->has_compr4) {
                  fs_reg compr4_dst = retype(dst, inst->src[i].type);
                  compr4_dst.reg |= BRW_MRF_COMPR4;
                  ibld.MOV(compr4_dst, inst->src[i]);
               } else {
                  /* Platform doesn't have COMPR4.  We have to fake it */
                  fs_reg mov_dst = retype(dst, inst->src[i].type);
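                  /* Write each SIMD8 half separately; putting the second
                   * half four MRFs up reproduces the interleaved layout
                   * COMPR4 would have produced in hardware.
                   */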
                  ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
                  mov_dst.reg += 4;
                  ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
               }
            }
            dst.reg++;
         }

         /* The loop above only ever incremented us through the first set
          * of 4 registers.  However, thanks to the magic of COMPR4, we
          * actually wrote to the first 8 registers, so we need to take
          * that into account now.
          */
         dst.reg += 4;

         /* The COMPR4 code took care of the first 4 sources.  We'll let
          * the regular path handle any remaining sources.  Yes, we are
          * modifying the instruction but we're about to delete it so
          * this really doesn't hurt anything.
          */
         inst->header_size += 4;
      }

      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         if (inst->src[i].file != BAD_FILE)
            ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
         dst = offset(dst, ibld, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

bool
fs_visitor::lower_integer_multiplication()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const fs_builder ibld(this, block, inst);

      if (inst->opcode == BRW_OPCODE_MUL) {
         if (inst->dst.is_accumulator() ||
             (inst->dst.type != BRW_REGISTER_TYPE_D &&
              inst->dst.type != BRW_REGISTER_TYPE_UD))
            continue;

         /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
          * operation directly, but CHV/BXT cannot.
          */
         if (devinfo->gen >= 8 &&
             !devinfo->is_cherryview && !devinfo->is_broxton)
            continue;

         if (inst->src[1].file == IMM &&
             inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
            /* The MUL instruction isn't commutative.  On Gen <= 6, only
             * the low 16 bits of src0 are read, and on Gen >= 7 only the
             * low 16 bits of src1 are read.
             *
             * If multiplying by an immediate value that fits in 16 bits,
             * do a single MUL instruction with that value in the proper
             * location.
             */
            if (devinfo->gen < 7) {
               fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
                          inst->dst.type);
               ibld.MOV(imm, inst->src[1]);
               ibld.MUL(inst->dst, imm, inst->src[0]);
            } else {
               ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
            }
         } else {
            /* Gen < 8 (and some Gen8+ low-power parts like Cherryview)
             * cannot do 32-bit integer multiplication in one instruction,
             * but instead must do a sequence (which actually calculates a
             * 64-bit result):
             *
             *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
             *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
             *    mov(8)  g2<1>D     acc0<8,8,1>D
             *
             * But on Gen > 6, the ability to use the second accumulator
             * register (acc1) for non-float data types was removed,
             * preventing a simple implementation in SIMD16.  A 16-channel
             * result can be calculated by executing the three instructions
             * twice in SIMD8, once with quarter control of 1Q for the
             * first eight channels and again with 2Q for the second eight
             * channels.
             *
             * Which accumulator register is implicitly accessed (by
             * AccWrEnable for instance) is determined by the quarter
             * control.  Unfortunately Ivybridge (and presumably Baytrail)
             * has a hardware bug in which an implicit accumulator access
             * by an instruction with 2Q will access acc1 regardless of
             * whether the data type is usable in acc1.
             *
             * Specifically, the 2Q mach(8) writes acc1 which does not
             * exist for integer data types.
             *
             * Since we only want the low 32 bits of the result, we can do
             * two 32-bit x 16-bit multiplies (like the mul and mach are
             * doing), and adjust the high result and add them (like the
             * mach is doing):
             *
             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
             *    shl(8)  g9<1>D     g8<8,8,1>D      16D
             *    add(8)  g2<1>D     g7<8,8,1>D      g8<8,8,1>D
             *
             * We avoid the shl instruction by realizing that we only want
             * to add the low 16 bits of the "high" result to the high 16
             * bits of the "low" result and using proper regioning on the
             * add:
             *
             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
             *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
             *
             * Since it does not use the (single) accumulator register, we
             * can schedule multi-component multiplications much better.
             */
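
            /* As a concrete (purely illustrative) sanity check of the
             * decomposition above, consider the scalar case with
             * a = 0x00010003 and b = 0x00020005:
             *
             *    low  = a * (b & 0xffff) = 0x00010003 * 0x0005 = 0x0005000f
             *    high = a * (b >> 16)    = 0x00010003 * 0x0002 = 0x00020006
             *    low += (high & 0xffff) << 16
             *         -> 0x0005000f + 0x00060000 = 0x000b000f
             *
             * which matches the low 32 bits of the full 48-bit product
             * 0x2000b000f.  The strided UW add emitted below performs
             * exactly this "add the low 16 bits of high into the high 16
             * bits of low" step for every channel at once.
             */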

            fs_reg orig_dst = inst->dst;
            if (orig_dst.is_null() || orig_dst.file == MRF) {
               inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
                                  inst->dst.type);
            }
            fs_reg low = inst->dst;
            fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
                        inst->dst.type);

            if (devinfo->gen >= 7) {
               fs_reg src1_0_w = inst->src[1];
               fs_reg src1_1_w = inst->src[1];

               if (inst->src[1].file == IMM) {
                  src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
                  src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
               } else {
                  src1_0_w.type = BRW_REGISTER_TYPE_UW;
                  if (src1_0_w.stride != 0) {
                     assert(src1_0_w.stride == 1);
                     src1_0_w.stride = 2;
                  }

                  src1_1_w.type = BRW_REGISTER_TYPE_UW;
                  if (src1_1_w.stride != 0) {
                     assert(src1_1_w.stride == 1);
                     src1_1_w.stride = 2;
                  }
                  src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
               }
               ibld.MUL(low, inst->src[0], src1_0_w);
               ibld.MUL(high, inst->src[0], src1_1_w);
            } else {
               fs_reg src0_0_w = inst->src[0];
               fs_reg src0_1_w = inst->src[0];

               src0_0_w.type = BRW_REGISTER_TYPE_UW;
               if (src0_0_w.stride != 0) {
                  assert(src0_0_w.stride == 1);
                  src0_0_w.stride = 2;
               }

               src0_1_w.type = BRW_REGISTER_TYPE_UW;
               if (src0_1_w.stride != 0) {
                  assert(src0_1_w.stride == 1);
                  src0_1_w.stride = 2;
               }
               src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);

               ibld.MUL(low, src0_0_w, inst->src[1]);
               ibld.MUL(high, src0_1_w, inst->src[1]);
            }

            fs_reg dst = inst->dst;
            dst.type = BRW_REGISTER_TYPE_UW;
            dst.subreg_offset = 2;
            dst.stride = 2;

            high.type = BRW_REGISTER_TYPE_UW;
            high.stride = 2;

            low.type = BRW_REGISTER_TYPE_UW;
            low.subreg_offset = 2;
            low.stride = 2;

            ibld.ADD(dst, low, high);

            if (inst->conditional_mod || orig_dst.file == MRF) {
               set_condmod(inst->conditional_mod,
                           ibld.MOV(orig_dst, inst->dst));
            }
         }
      } else if (inst->opcode == SHADER_OPCODE_MULH) {
         /* Should have been lowered to 8-wide. */
         assert(inst->exec_size <= 8);
         const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
                                   inst->dst.type);
         fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
         fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);

         if (devinfo->gen >= 8) {
            /* Until Gen8, integer multiplies read 32 bits from one source,
             * 16 bits from the other, and relied on the MACH instruction
             * to generate the high bits of the result.
             *
             * On Gen8, the multiply instruction does a full 32x32-bit
             * multiply, but in order to do a 64-bit multiply we can
             * simulate the previous behavior and then use a MACH
             * instruction.
             *
             * FINISHME: Don't use source modifiers on src1.
             */
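
            /* Illustrative note (an example, not from the PRM): retyping a
             * 32-bit source to W/UW with twice the stride makes each
             * channel read only the low 16 bits of its original dword.
             * E.g. for a UD register holding { 0x00012345, 0x0006789a, ...}
             * per channel, the UW view with stride 2 reads
             * { 0x2345, 0x789a, ... }, which restores the pre-Gen8 32x16
             * multiply behavior that MACH expects.
             */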
            assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
                   mul->src[1].type == BRW_REGISTER_TYPE_UD);
            mul->src[1].type = (type_is_signed(mul->src[1].type) ?
                                BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW);
            mul->src[1].stride *= 2;

         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
                    inst->force_sechalf) {
            /* Among other things the quarter control bits influence which
             * accumulator register is used by the hardware for
             * instructions that access the accumulator implicitly (e.g.
             * MACH).  A second-half instruction would normally map to
             * acc1, which doesn't exist on Gen7 and up (the hardware does
             * emulate it for floating-point instructions *only* by taking
             * advantage of the extra precision of acc0 not normally used
             * for floating-point arithmetic).
             *
             * HSW and up are careful enough not to try to access an
             * accumulator register that doesn't exist, but on earlier Gen7
             * hardware we need to make sure that the quarter control bits
             * are zero to avoid non-deterministic behaviour and emit an
             * extra MOV to get the result masked correctly according to
             * the current channel enables.
             */
            mach->force_sechalf = false;
            mach->force_writemask_all = true;
            mach->dst = ibld.vgrf(inst->dst.type);
            ibld.MOV(inst->dst, mach->dst);
         }
      } else {
         continue;
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

static void
setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
                    fs_reg *dst, fs_reg color, unsigned components)
{
   if (key->clamp_fragment_color) {
      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
      assert(color.type == BRW_REGISTER_TYPE_F);

      for (unsigned i = 0; i < components; i++)
         set_saturate(true,
                      bld.MOV(offset(tmp, bld, i), offset(color, bld, i)));

      color = tmp;
   }

   for (unsigned i = 0; i < components; i++)
      dst[i] = offset(color, bld, i);
}
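
/* Illustrative note: set_saturate(true, MOV) clamps each float component
 * to [0.0, 1.0] on write, so e.g. a color component of 1.5 stored through
 * the fixup above reaches the framebuffer payload as 1.0.
 */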

static void
lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
                            const brw_wm_prog_data *prog_data,
                            const brw_wm_prog_key *key,
                            const fs_visitor::thread_payload &payload)
{
   assert(inst->src[6].file == IMM);
   const brw_device_info *devinfo = bld.shader->devinfo;
   const fs_reg &color0 = inst->src[0];
   const fs_reg &color1 = inst->src[1];
   const fs_reg &src0_alpha = inst->src[2];
   const fs_reg &src_depth = inst->src[3];
   const fs_reg &dst_depth = inst->src[4];
   fs_reg sample_mask = inst->src[5];
   const unsigned components = inst->src[6].fixed_hw_reg.dw1.ud;

   /* We can potentially have a message length of up to 15, so we have to
    * set base_mrf to either 0 or 1 in order to fit in m0..m15.
    */
   fs_reg sources[15];
   int header_size = 2, payload_header_size;
   unsigned length = 0;

   /* From the Sandy Bridge PRM, volume 4, page 198:
    *
    *     "Dispatched Pixel Enables. One bit per pixel indicating
    *      which pixels were originally enabled when the thread was
    *      dispatched. This field is only required for the end-of-
    *      thread message and on all dual-source messages."
    */
   if (devinfo->gen >= 6 &&
       (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
       color1.file == BAD_FILE &&
       key->nr_color_regions == 1) {
      header_size = 0;
   }

   if (header_size != 0) {
      assert(header_size == 2);
      /* Allocate 2 registers for a header */
      length += 2;
   }

   if (payload.aa_dest_stencil_reg) {
      sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1));
      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
         .MOV(sources[length],
              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
      length++;
   }

   if (prog_data->uses_omask) {
      sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1),
                               BRW_REGISTER_TYPE_UD);

      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel
       * are relevant.  Since it's unsigned single words, one vgrf is
       * always 16-wide, but only the lower or higher 8 channels will be
       * used by the hardware when doing a SIMD8 write depending on whether
       * we have selected the subspans for the first or second half
       * respectively.
       */
      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
      sample_mask.type = BRW_REGISTER_TYPE_UW;
      sample_mask.stride *= 2;

      bld.exec_all().annotate("FB write oMask")
         .MOV(half(retype(sources[length], BRW_REGISTER_TYPE_UW),
                   inst->force_sechalf),
              sample_mask);
      length++;
   }

   payload_header_size = length;

   if (src0_alpha.file != BAD_FILE) {
      /* FIXME: This is being passed at the wrong location in the payload
       * and doesn't work when gl_SampleMask and MRTs are used
       * simultaneously.  It's supposed to be immediately before oMask but
       * there seems to be no reasonable way to pass them in the correct
       * order because LOAD_PAYLOAD requires header sources to form a
       * contiguous segment at the beginning of the message and src0_alpha
       * has per-channel semantics.
       */
      setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
      length++;
   }

   setup_color_payload(bld, key, &sources[length], color0, components);
   length += 4;

   if (color1.file != BAD_FILE) {
      setup_color_payload(bld, key, &sources[length], color1, components);
      length += 4;
   }

   if (src_depth.file != BAD_FILE) {
      sources[length] = src_depth;
      length++;
   }

   if (dst_depth.file != BAD_FILE) {
      sources[length] = dst_depth;
      length++;
   }

   fs_inst *load;
   if (devinfo->gen >= 7) {
      /* Send from the GRF */
      fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
      load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
      payload.reg = bld.shader->alloc.allocate(load->regs_written);
      load->dst = payload;

      inst->src[0] = payload;
      inst->resize_sources(1);
      inst->base_mrf = -1;
   } else {
      /* Send from the MRF */
      load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
                              sources, length, payload_header_size);

      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
       * will do this for us if we just give it a COMPR4 destination.
       */
      if (devinfo->gen < 6 && bld.dispatch_width() == 16)
         load->dst.reg |= BRW_MRF_COMPR4;

      inst->resize_sources(0);
      inst->base_mrf = 1;
   }
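
   /* Illustrative example: in SIMD8 with a two-register header, an oMask
    * and four color0 components, the LOAD_PAYLOAD above writes
    * 2 + 1 + 4 = 7 registers, so the FB write below gets mlen == 7.  In
    * SIMD16 each per-channel source occupies two registers instead.
    */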

   inst->opcode = FS_OPCODE_FB_WRITE;
   inst->mlen = load->regs_written;
   inst->header_size = header_size;
}

static void
lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
                                const fs_reg &coordinate,
                                const fs_reg &shadow_c,
                                const fs_reg &lod, const fs_reg &lod2,
                                const fs_reg &sampler,
                                unsigned coord_components,
                                unsigned grad_components)
{
   const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
                         op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
   fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
   fs_reg msg_end = msg_begin;

   /* g0 header. */
   msg_end = offset(msg_end, bld.group(8, 0), 1);

   for (unsigned i = 0; i < coord_components; i++)
      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
              offset(coordinate, bld, i));

   msg_end = offset(msg_end, bld, coord_components);

   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
    * require all three components to be present and zero if they are
    * unused.
    */
   if (coord_components > 0 &&
       (has_lod || shadow_c.file != BAD_FILE ||
        (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
      for (unsigned i = coord_components; i < 3; i++)
         bld.MOV(offset(msg_end, bld, i), fs_reg(0.0f));

      msg_end = offset(msg_end, bld, 3 - coord_components);
   }

   if (op == SHADER_OPCODE_TXD) {
      /* TXD is unsupported in SIMD16 mode. */
      assert(bld.dispatch_width() == 8);

      /* The slots for u and v are always present, but r is optional. */
      if (coord_components < 2)
         msg_end = offset(msg_end, bld, 2 - coord_components);

      /*  P   = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * 1-arg: Does not exist.
       *
       * 2-arg: dudx   dvdx   dudy   dvdy
       *        dPdx.x dPdx.y dPdy.x dPdy.y
       *        m4     m5     m6     m7
       *
       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
       *        m5     m6     m7     m8     m9     m10
       */
      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));

      for (unsigned i = 0; i < grad_components; i++)
         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));

      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
   }

   if (has_lod) {
      /* Bias/LOD with shadow comparator is unsupported in SIMD16 --
       * *without* a shadow comparator (including RESINFO) it's unsupported
       * in SIMD8 mode.
       */
      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
             bld.dispatch_width() == 16);

      const brw_reg_type type =
         (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
          BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
      bld.MOV(retype(msg_end, type), lod);
      msg_end = offset(msg_end, bld, 1);
   }

   if (shadow_c.file != BAD_FILE) {
      if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         bld.MOV(msg_end, fs_reg(0.0f));
         msg_end = offset(msg_end, bld, 1);
      }

      bld.MOV(msg_end, shadow_c);
      msg_end = offset(msg_end, bld, 1);
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = sampler;
   inst->resize_sources(2);
   inst->base_mrf = msg_begin.reg;
   inst->mlen = msg_end.reg - msg_begin.reg;
   inst->header_size = 1;
}
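
/* Illustrative Gen4 SIMD8 example: a TXB with two coordinates and a shadow
 * comparator assembled above uses m1 for the implied header, m2..m4 for u,
 * v and a zeroed third component, m5 for the bias and m6 for the reference
 * value, giving mlen == 6 with header_size == 1.
 */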

static void
lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
                                fs_reg coordinate,
                                const fs_reg &shadow_c,
                                fs_reg lod, fs_reg lod2,
                                const fs_reg &sample_index,
                                const fs_reg &sampler,
                                const fs_reg &offset_value,
                                unsigned coord_components,
                                unsigned grad_components)
{
   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
   fs_reg msg_coords = message;
   unsigned header_size = 0;

   if (offset_value.file != BAD_FILE) {
      /* The offsets set up by the visitor are in the m1 header, so we
       * can't go headerless.
       */
      header_size = 1;
      message.reg--;
   }

   for (unsigned i = 0; i < coord_components; i++) {
      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
              coordinate);
      coordinate = offset(coordinate, bld, 1);
   }
   fs_reg msg_end = offset(msg_coords, bld, coord_components);
   fs_reg msg_lod = offset(msg_coords, bld, 4);

   if (shadow_c.file != BAD_FILE) {
      fs_reg msg_shadow = msg_lod;
      bld.MOV(msg_shadow, shadow_c);
      msg_lod = offset(msg_shadow, bld, 1);
      msg_end = msg_lod;
   }

   switch (op) {
   case SHADER_OPCODE_TXL:
   case FS_OPCODE_TXB:
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXD:
      /*  P   = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * Load up these values:
       * - dudx   dudy   dvdx   dvdy   drdx   drdy
       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
       */
      msg_end = msg_lod;
      for (unsigned i = 0; i < grad_components; i++) {
         bld.MOV(msg_end, lod);
         lod = offset(lod, bld, 1);
         msg_end = offset(msg_end, bld, 1);

         bld.MOV(msg_end, lod2);
         lod2 = offset(lod2, bld, 1);
         msg_end = offset(msg_end, bld, 1);
      }
      break;
   case SHADER_OPCODE_TXS:
      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
      bld.MOV(msg_lod, lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXF:
      msg_lod = offset(msg_coords, bld, 3);
      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
      msg_end = offset(msg_lod, bld, 1);
      break;
   case SHADER_OPCODE_TXF_CMS:
      msg_lod = offset(msg_coords, bld, 3);
      /* lod */
      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
      /* sample index */
      bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD),
              sample_index);
      msg_end = offset(msg_lod, bld, 2);
      break;
   default:
      break;
   }

   inst->opcode = op;
   inst->src[0] = reg_undef;
   inst->src[1] = sampler;
   inst->resize_sources(2);
   inst->base_mrf = message.reg;
   inst->mlen = msg_end.reg - message.reg;
   inst->header_size = header_size;

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}

static bool
is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
{
   if (devinfo->gen < 8 && !devinfo->is_haswell)
      return false;

   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
}

static void
lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
                                fs_reg coordinate,
                                const fs_reg &shadow_c,
                                fs_reg lod, fs_reg lod2,
                                const fs_reg &sample_index,
                                const fs_reg &mcs, const fs_reg &sampler,
                                fs_reg offset_value,
                                unsigned coord_components,
                                unsigned grad_components)
{
   const brw_device_info *devinfo = bld.shader->devinfo;
   int reg_width = bld.dispatch_width() / 8;
   unsigned header_size = 0, length = 0;
   fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
      sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);

   if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
       offset_value.file != BAD_FILE ||
       is_high_sampler(devinfo, sampler)) {
      /* For general texture offsets (no txf workaround), we need a header
       * to put them in.  Note that we're only reserving space for it in
       * the message payload as it will be initialized implicitly by the
       * generator.
       *
       * TG4 needs to place its channel select in the header, for
       * interaction with ARB_texture_swizzle.  The sampler index is only
       * 4 bits, so for larger sampler numbers we need to offset the
       * Sampler State Pointer in the header.
       */
      header_size = 1;
      sources[0] = fs_reg();
      length++;
   }

   if (shadow_c.file != BAD_FILE) {
      bld.MOV(sources[length], shadow_c);
      length++;
   }

   bool coordinate_done = false;

   /* The sampler can only meaningfully compute LOD for fragment shader
    * messages.  For all other stages, we change the opcode to TXL and
    * hardcode the LOD to 0.
    */
   if (bld.shader->stage != MESA_SHADER_FRAGMENT &&
       op == SHADER_OPCODE_TEX) {
      op = SHADER_OPCODE_TXL;
      lod = fs_reg(0.0f);
   }

   /* Set up the LOD info */
   switch (op) {
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXL:
      bld.MOV(sources[length], lod);
      length++;
      break;
   case SHADER_OPCODE_TXD:
      /* TXD should have been lowered in SIMD16 mode. */
      assert(bld.dispatch_width() == 8);

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z,
       * dPdy.z
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length], coordinate);
         coordinate = offset(coordinate, bld, 1);
         length++;

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < grad_components) {
            bld.MOV(sources[length], lod);
            lod = offset(lod, bld, 1);
            length++;

            bld.MOV(sources[length], lod2);
            lod2 = offset(lod2, bld, 1);
            length++;
         }
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TXS:
      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
      length++;
      break;
   case SHADER_OPCODE_TXF:
      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
       * On Gen9 they are u, v, lod, r.
       */
      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
      coordinate = offset(coordinate, bld, 1);
      length++;

      if (devinfo->gen >= 9) {
         if (coord_components >= 2) {
            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
                    coordinate);
            coordinate = offset(coordinate, bld, 1);
         }
         length++;
      }

      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
      length++;

      for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components;
           i++) {
         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
         coordinate = offset(coordinate, bld, 1);
         length++;
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_UMS:
   case SHADER_OPCODE_TXF_MCS:
      if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS) {
         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
                 sample_index);
         length++;
      }

      if (op == SHADER_OPCODE_TXF_CMS) {
         /* Data from the multisample control surface. */
         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
         length++;
      }

      /* There is no offsetting for this message; just copy in the integer
       * texture coordinates.
       */
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
         coordinate = offset(coordinate, bld, 1);
         length++;
      }

      coordinate_done = true;
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      /* gather4_po_c should have been lowered in SIMD16 mode. */
      assert(bld.dispatch_width() == 8 || shadow_c.file == BAD_FILE);

      /* More crazy intermixing */
      for (unsigned i = 0; i < 2; i++) { /* u, v */
         bld.MOV(sources[length], coordinate);
         coordinate = offset(coordinate, bld, 1);
         length++;
      }

      for (unsigned i = 0; i < 2; i++) { /* offu, offv */
         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
                 offset_value);
         offset_value = offset(offset_value, bld, 1);
         length++;
      }

      if (coord_components == 3) { /* r if present */
         bld.MOV(sources[length], coordinate);
         coordinate = offset(coordinate, bld, 1);
         length++;
      }

      coordinate_done = true;
      break;
   default:
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (!coordinate_done) {
      for (unsigned i = 0; i < coord_components; i++) {
         bld.MOV(sources[length], coordinate);
         coordinate = offset(coordinate, bld, 1);
         length++;
      }
   }

   int mlen;
   if (reg_width == 2)
      mlen = length * reg_width - header_size;
   else
      mlen = length * reg_width;
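
   /* Worked example (illustrative): a SIMD16 TXL with a header (say, for a
    * constant texel offset), a shadow comparator, one LOD and two
    * coordinates has length == 5.  With reg_width == 2 each per-channel
    * source spans two registers while the header spans only one, so
    * mlen = 5 * 2 - 1 = 9 registers.
    */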

   const fs_reg src_payload = fs_reg(GRF, bld.shader->alloc.allocate(mlen),
                                     BRW_REGISTER_TYPE_F);
   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);

   /* Generate the SEND. */
   inst->opcode = op;
   inst->src[0] = src_payload;
   inst->src[1] = sampler;
   inst->resize_sources(2);
   inst->base_mrf = -1;
   inst->mlen = mlen;
   inst->header_size = header_size;

   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
}

static void
lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
{
   const brw_device_info *devinfo = bld.shader->devinfo;
   const fs_reg &coordinate = inst->src[0];
   const fs_reg &shadow_c = inst->src[1];
   const fs_reg &lod = inst->src[2];
   const fs_reg &lod2 = inst->src[3];
   const fs_reg &sample_index = inst->src[4];
   const fs_reg &mcs = inst->src[5];
   const fs_reg &sampler = inst->src[6];
   const fs_reg &offset_value = inst->src[7];
   assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
   const unsigned coord_components = inst->src[8].fixed_hw_reg.dw1.ud;
   const unsigned grad_components = inst->src[9].fixed_hw_reg.dw1.ud;

   if (devinfo->gen >= 7) {
      lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, sample_index,
                                      mcs, sampler, offset_value,
                                      coord_components, grad_components);
   } else if (devinfo->gen >= 5) {
      lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, sample_index,
                                      sampler, offset_value,
                                      coord_components, grad_components);
   } else {
      lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
                                      shadow_c, lod, lod2, sampler,
                                      coord_components, grad_components);
   }
}

/**
 * Initialize the header present in some typed and untyped surface
 * messages.
 */
static fs_reg
emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
{
   fs_builder ubld = bld.exec_all().group(8, 0);
   const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
   ubld.MOV(dst, fs_reg(0));
   ubld.MOV(component(dst, 7), sample_mask);
   return dst;
}

static void
lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
                           const fs_reg &sample_mask)
{
   /* Get the logical send arguments. */
   const fs_reg &addr = inst->src[0];
   const fs_reg &src = inst->src[1];
   const fs_reg &surface = inst->src[2];
   const UNUSED fs_reg &dims = inst->src[3];
   const fs_reg &arg = inst->src[4];

   /* Calculate the total number of components of the payload. */
   const unsigned addr_sz = inst->components_read(0);
   const unsigned src_sz = inst->components_read(1);
   const unsigned header_sz = (sample_mask.file == BAD_FILE ? 0 : 1);
   const unsigned sz = header_sz + addr_sz + src_sz;

   /* Allocate space for the payload. */
   fs_reg *const components = new fs_reg[sz];
   const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
   unsigned n = 0;

   /* Construct the payload. */
   if (header_sz)
      components[n++] = emit_surface_header(bld, sample_mask);

   for (unsigned i = 0; i < addr_sz; i++)
      components[n++] = offset(addr, bld, i);

   for (unsigned i = 0; i < src_sz; i++)
      components[n++] = offset(src, bld, i);

   bld.LOAD_PAYLOAD(payload, components, sz, header_sz);

   /* Update the original instruction. */
   inst->opcode = op;
   inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
   inst->header_size = header_sz;
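
   /* Illustrative example: a SIMD16 untyped write with a two-component
    * address and one data component gets mlen = 1 + (2 + 1) * 16 / 8 = 7
    * registers -- one for the header plus two per per-channel component.
    */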

   inst->src[0] = payload;
   inst->src[1] = surface;
   inst->src[2] = arg;
   inst->resize_sources(3);

   delete[] components;
}

bool
fs_visitor::lower_logical_sends()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const fs_builder ibld(this, block, inst);

      switch (inst->opcode) {
      case FS_OPCODE_FB_WRITE_LOGICAL:
         assert(stage == MESA_SHADER_FRAGMENT);
         lower_fb_write_logical_send(ibld, inst,
                                     (const brw_wm_prog_data *)prog_data,
                                     (const brw_wm_prog_key *)key,
                                     payload);
         break;

      case SHADER_OPCODE_TEX_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
         break;

      case SHADER_OPCODE_TXD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
         break;

      case SHADER_OPCODE_TXF_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
         break;

      case SHADER_OPCODE_TXL_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
         break;

      case SHADER_OPCODE_TXS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
         break;

      case FS_OPCODE_TXB_LOGICAL:
         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
         break;

      case SHADER_OPCODE_TXF_CMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
         break;

      case SHADER_OPCODE_TXF_UMS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
         break;

      case SHADER_OPCODE_TXF_MCS_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
         break;

      case SHADER_OPCODE_LOD_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
         break;

      case SHADER_OPCODE_TG4_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
         break;

      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
         lower_surface_logical_send(ibld, inst,
                                    SHADER_OPCODE_UNTYPED_SURFACE_READ,
                                    fs_reg(0xffff));
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
         lower_surface_logical_send(ibld, inst,
                                    SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
                                    ibld.sample_mask_reg());
         break;

      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
         lower_surface_logical_send(ibld, inst,
                                    SHADER_OPCODE_UNTYPED_ATOMIC,
                                    ibld.sample_mask_reg());
         break;

      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
         lower_surface_logical_send(ibld, inst,
                                    SHADER_OPCODE_TYPED_SURFACE_READ,
                                    fs_reg(0xffff));
         break;

      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
         lower_surface_logical_send(ibld, inst,
                                    SHADER_OPCODE_TYPED_SURFACE_WRITE,
                                    ibld.sample_mask_reg());
         break;

      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
         lower_surface_logical_send(ibld, inst,
                                    SHADER_OPCODE_TYPED_ATOMIC,
                                    ibld.sample_mask_reg());
         break;

      default:
         continue;
      }

      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

/**
 * Get the closest native SIMD width supported by the hardware for
 * instruction \p inst.  The instruction will be left untouched by
 * fs_visitor::lower_simd_width() if the returned value is equal to the
 * original execution size.
 */
static unsigned
get_lowered_simd_width(const struct brw_device_info *devinfo,
                       const fs_inst *inst)
{
   switch (inst->opcode) {
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_SEL:
   case BRW_OPCODE_NOT:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_XOR:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_CMP:
   case BRW_OPCODE_CMPN:
   case BRW_OPCODE_CSEL:
   case BRW_OPCODE_F32TO16:
   case BRW_OPCODE_F16TO32:
   case BRW_OPCODE_BFREV:
   case BRW_OPCODE_BFE:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_BFI2:
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_MUL:
   case BRW_OPCODE_AVG:
   case BRW_OPCODE_FRC:
   case BRW_OPCODE_RNDU:
   case BRW_OPCODE_RNDD:
   case BRW_OPCODE_RNDE:
   case BRW_OPCODE_RNDZ:
   case BRW_OPCODE_LZD:
   case BRW_OPCODE_FBH:
   case BRW_OPCODE_FBL:
   case BRW_OPCODE_CBIT:
   case BRW_OPCODE_SAD2:
   case BRW_OPCODE_MAD:
   case BRW_OPCODE_LRP:
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS: {
      /* According to the PRMs:
       *  "A. In Direct Addressing mode, a source cannot span more than 2
       *      adjacent GRF registers.
       *   B. A destination cannot span more than 2 adjacent GRF registers."
       *
       * Look for the source or destination with the largest register
       * region, which is the one that is going to limit the overall
       * execution size of the instruction due to this rule.
       */
      unsigned reg_count = inst->regs_written;

      for (unsigned i = 0; i < inst->sources; i++)
         reg_count = MAX2(reg_count, (unsigned)inst->regs_read(i));

      /* Calculate the maximum execution size of the instruction based on
       * the factor by which it goes over the hardware limit of 2 GRFs.
       */
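      /* For example (illustrative): a SIMD16 instruction whose F sources
       * and destination each span 2 GRFs has reg_count == 2 and keeps its
       * original width, since 16 / DIV_ROUND_UP(2, 2) == 16; if some
       * operand spanned 4 GRFs the result would instead be 16 / 2 == 8.
       */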
      return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
   }
   case SHADER_OPCODE_MULH:
      /* MULH is lowered to the MUL/MACH sequence using the accumulator,
       * which is 8-wide on Gen7+.
       */
      return (devinfo->gen >= 7 ? 8 : inst->exec_size);

   case FS_OPCODE_FB_WRITE_LOGICAL:
      /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
       * here.
       */
      assert(devinfo->gen != 6 || inst->src[3].file == BAD_FILE ||
             inst->exec_size == 8);
      /* Dual-source FB writes are unsupported in SIMD16 mode. */
      return (inst->src[1].file != BAD_FILE ? 8 : inst->exec_size);

   case SHADER_OPCODE_TXD_LOGICAL:
      /* TXD is unsupported in SIMD16 mode. */
      return 8;

   case SHADER_OPCODE_TG4_OFFSET_LOGICAL: {
      /* gather4_po_c is unsupported in SIMD16 mode. */
      const fs_reg &shadow_c = inst->src[1];
      return (shadow_c.file != BAD_FILE ? 8 : inst->exec_size);
   }
   case SHADER_OPCODE_TXL_LOGICAL:
   case FS_OPCODE_TXB_LOGICAL: {
      /* Gen4 doesn't have SIMD8 non-shadow-compare bias/LOD instructions,
       * and Gen4-6 can't support TXL and TXB with shadow comparison in
       * SIMD16 mode because the message exceeds the maximum length of 11.
       */
      const fs_reg &shadow_c = inst->src[1];
      if (devinfo->gen == 4 && shadow_c.file == BAD_FILE)
         return 16;
      else if (devinfo->gen < 7 && shadow_c.file != BAD_FILE)
         return 8;
      else
         return inst->exec_size;
   }
   case SHADER_OPCODE_TXF_LOGICAL:
   case SHADER_OPCODE_TXS_LOGICAL:
      /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
       * messages.  Use SIMD16 instead.
       */
      if (devinfo->gen == 4)
         return 16;
      else
         return inst->exec_size;

   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
      return 8;

   default:
      return inst->exec_size;
   }
}

/**
 * The \p rows array of registers represents a \p num_rows by
 * \p num_columns matrix in row-major order; write it in column-major
 * order into the register passed as destination.  \p stride gives the
 * separation between matrix elements in the input in
 * fs_builder::dispatch_width() units.
 */
static void
emit_transpose(const fs_builder &bld,
               const fs_reg &dst, const fs_reg *rows,
               unsigned num_rows, unsigned num_columns, unsigned stride)
{
   fs_reg *const components = new fs_reg[num_rows * num_columns];

   for (unsigned i = 0; i < num_columns; ++i) {
      for (unsigned j = 0; j < num_rows; ++j)
         components[num_rows * i + j] = offset(rows[j], bld, stride * i);
   }

   bld.LOAD_PAYLOAD(dst, components, num_rows * num_columns, 0);

   delete[] components;
}
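
/* For instance (illustrative): with num_rows = 2, num_columns = 2 and
 * stride = 1, rows r0 = [a, b] and r1 = [c, d] are emitted into dst as the
 * column-major sequence a, c, b, d -- i.e. LOAD_PAYLOAD receives the
 * components { r0+0, r1+0, r0+1, r1+1 }.
 */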

bool
fs_visitor::lower_simd_width()
{
   bool progress = false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      const unsigned lower_width = get_lowered_simd_width(devinfo, inst);

      if (lower_width != inst->exec_size) {
         /* Builder matching the original instruction.  We may also need to
          * emit an instruction of width larger than the original, so set
          * the execution size of the builder to the highest of both for
          * now to be sure that both cases can be handled.
          */
         const fs_builder ibld = bld.at(block, inst)
                                    .exec_all(inst->force_writemask_all)
                                    .group(MAX2(inst->exec_size, lower_width),
                                           inst->force_sechalf);

         /* Split the copies in chunks of the execution width of either the
          * original or the lowered instruction, whichever is lower.
          */
         const unsigned copy_width = MIN2(lower_width, inst->exec_size);
         const unsigned n = inst->exec_size / copy_width;
         const unsigned dst_size = inst->regs_written * REG_SIZE /
            inst->dst.component_size(inst->exec_size);
         fs_reg dsts[4];

         assert(n > 0 && n <= ARRAY_SIZE(dsts) &&
                !inst->writes_accumulator && !inst->mlen);

         for (unsigned i = 0; i < n; i++) {
            /* Emit a copy of the original instruction with the lowered
             * width.  If the EOT flag was set throw it away except for the
             * last instruction to avoid killing the thread prematurely.
             */
            fs_inst split_inst = *inst;
            split_inst.exec_size = lower_width;
            split_inst.eot = inst->eot && i == n - 1;

            /* Select the correct channel enables for the i-th group, then
             * transform the sources and destination and emit the lowered
             * instruction.
             */
            const fs_builder lbld = ibld.group(lower_width, i);

            for (unsigned j = 0; j < inst->sources; j++) {
               if (inst->src[j].file != BAD_FILE &&
                   !is_uniform(inst->src[j])) {
                  /* Get the i-th copy_width-wide chunk of the source. */
                  const fs_reg src = horiz_offset(inst->src[j],
                                                  copy_width * i);
                  const unsigned src_size = inst->components_read(j);

                  /* Use a trivial transposition to copy one every n
                   * copy_width-wide components of the register into a
                   * temporary passed as source to the lowered instruction.
                   */
                  split_inst.src[j] = lbld.vgrf(inst->src[j].type, src_size);
                  emit_transpose(lbld.group(copy_width, 0),
                                 split_inst.src[j], &src, 1, src_size, n);
               }
            }

            if (inst->regs_written) {
               /* Allocate enough space to hold the result of the lowered
                * instruction and fix up the number of registers written.
                */
               split_inst.dst = dsts[i] =
                  lbld.vgrf(inst->dst.type, dst_size);
               split_inst.regs_written =
                  DIV_ROUND_UP(inst->regs_written * lower_width,
                               inst->exec_size);
            }

            lbld.emit(split_inst);
         }

         if (inst->regs_written) {
            /* Distance between useful channels in the temporaries,
             * skipping garbage if the lowered instruction is wider than
             * the original.
             */
            const unsigned m = lower_width / copy_width;

            /* Interleave the components of the result from the lowered
             * instructions.  We need to set exec_all() when copying more
             * than one half per component, because LOAD_PAYLOAD (in terms
             * of which emit_transpose is implemented) can only use the
             * same channel enable signals for all of its non-header
             * sources.
             */
            emit_transpose(ibld.exec_all(inst->exec_size > copy_width)
                               .group(copy_width, 0),
                           inst->dst, dsts, n, dst_size, m);
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
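
/* Illustrative walk-through: lowering a SIMD16 TXD (lower_width == 8)
 * gives copy_width == 8 and n == 2.  Each of the two SIMD8 copies gathers
 * every other 8-wide chunk of each source via emit_transpose, writes into
 * a private temporary, and the two temporaries are then interleaved back
 * into the original SIMD16 destination.
 */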

void
fs_visitor::dump_instructions()
{
   dump_instructions(NULL);
}

void
fs_visitor::dump_instructions(const char *name)
{
   FILE *file = stderr;
   if (name && geteuid() != 0) {
      file = fopen(name, "w");
      if (!file)
         file = stderr;
   }

   if (cfg) {
      calculate_register_pressure();
      int ip = 0, max_pressure = 0;
      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
         max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
         fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
         dump_instruction(inst, file);
         ip++;
      }
      fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
   } else {
      int ip = 0;
      foreach_in_list(backend_instruction, inst, &instructions) {
         fprintf(file, "%4d: ", ip++);
         dump_instruction(inst, file);
      }
   }

   if (file != stderr) {
      fclose(file);
   }
}
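
/* Hypothetical sample of the output produced by the functions above and
 * below (field values invented for illustration): with register-pressure
 * annotations from dump_instructions(), a line might read
 *
 *    {  4}   23: cmp.ge.f0.0(8) null
 *
 * i.e. four registers live, IP 23, a SIMD8 CMP with the GE conditional
 * mod updating flag f0.0 and writing the null register.
 */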

void
fs_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}

void
fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   fs_inst *inst = (fs_inst *)be_inst;

   if (inst->predicate) {
      fprintf(file, "(%cf0.%d) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, "(%d) ", inst->exec_size);

   if (inst->mlen) {
      fprintf(file, "(mlen: %d) ", inst->mlen);
   }

   switch (inst->dst.file) {
   case GRF:
      fprintf(file, "vgrf%d", inst->dst.reg);
      if (alloc.sizes[inst->dst.reg] != inst->regs_written ||
          inst->dst.subreg_offset)
         fprintf(file, "+%d.%d",
                 inst->dst.reg_offset, inst->dst.subreg_offset);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case UNIFORM:
      fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
      break;
   case ATTR:
      fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
      break;
   case HW_REG:
      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         switch (inst->dst.fixed_hw_reg.nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   default:
      fprintf(file, "???");
      break;
   }
   fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(file, "vgrf%d", inst->src[i].reg);
         if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) ||
             inst->src[i].subreg_offset)
            fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         break;
      case MRF:
         fprintf(file, "***m%d***", inst->src[i].reg);
         break;
      case ATTR:
         fprintf(file, "attr%d+%d", inst->src[i].reg, inst->src[i].reg_offset);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
         if (inst->src[i].reladdr) {
            fprintf(file, "+reladdr");
         } else if (inst->src[i].subreg_offset) {
            fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         }
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
            break;
         case BRW_REGISTER_TYPE_W:
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
            break;
         case BRW_REGISTER_TYPE_UW:
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
            break;
         case BRW_REGISTER_TYPE_VF:
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            fprintf(file, "-");
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               fprintf(file, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            default:
               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            }
         } else {
            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
         }
         if (inst->src[i].fixed_hw_reg.subnr)
            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         break;
      default:
         fprintf(file, "???");
         break;
      }
      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, " ");

   if (dispatch_width == 16 && inst->exec_size == 8) {
      if (inst->force_sechalf)
         fprintf(file, "2ndhalf ");
      else
         fprintf(file, "1sthalf ");
   }

   fprintf(file, "\n");
}
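
/* For reference, a typical line printed by the function above might look
 * roughly like the following (hypothetical instruction; the exact fields
 * depend on predication, modifiers and register files):
 *
 *    (+f0.0) add.sat(8) vgrf7:F, vgrf3:F, u1:F
 */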

/**
 * Possibly returns an instruction that set up @param reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the instruction generated
 * only reg -- it might be the size=4 destination of a texture instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           const fs_reg &reg)
{
   if (end == start ||
       end->is_partial_write() ||
       reg.reladdr ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}

void
fs_visitor::setup_payload_gen6()
{
   bool uses_depth =
      (nir->info.inputs_read & (1 << VARYING_SLOT_POS)) != 0;
   unsigned barycentric_interp_modes =
      (stage == MESA_SHADER_FRAGMENT) ?
      ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;

   assert(devinfo->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   payload.num_regs = 2;
   /* R2: only for 32-pixel dispatch. */

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         payload.barycentric_coord_reg[i] = payload.num_regs;
         payload.num_regs += 2;
         if (dispatch_width == 16) {
            payload.num_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      payload.source_depth_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not SIMD8. */
         payload.num_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      payload.source_w_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not SIMD8. */
         payload.num_regs++;
      }
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
      brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
      prog_data->uses_pos_offset = key->compute_pos_offset;
      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg = payload.num_regs;
         payload.num_regs++;
      }
   }

   /* R32: MSAA input coverage mask */
   if (nir->info.system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) {
      assert(devinfo->gen >= 7);
      payload.sample_mask_in_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R33: input coverage mask if not SIMD8. */
         payload.num_regs++;
      }
   }

   /* R34-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (nir->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}
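
/* Worked example (hypothetical shader; the numbers follow directly from the
 * logic above): a SIMD16 fragment shader that enables a single barycentric
 * mode and reads gl_FragCoord (so uses_depth is set) ends up with
 * 2 (R0-1) + 4 (barycentrics) + 2 (depth) + 2 (W) = 10 payload registers.
 */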

void
fs_visitor::setup_vs_payload()
{
   /* R0: thread header, R1: urb handles */
   payload.num_regs = 2;
}

void
fs_visitor::setup_cs_payload()
{
   assert(devinfo->gen >= 7);

   payload.num_regs = 1;

   if (nir->info.system_values_read & SYSTEM_BIT_LOCAL_INVOCATION_ID) {
      const unsigned local_id_dwords =
         brw_cs_prog_local_id_payload_dwords(dispatch_width);
      assert((local_id_dwords & 0x7) == 0);
      const unsigned local_id_regs = local_id_dwords / 8;
      payload.local_invocation_id_reg = payload.num_regs;
      payload.num_regs += local_id_regs;
   }
}
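
/* For example (assuming brw_cs_prog_local_id_payload_dwords() returns one
 * dword per channel for each of the three dimensions, which is not verified
 * here), a SIMD8 compute shader would need 3 * 8 = 24 dwords, i.e. three
 * extra payload registers after the R0 header.
 */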

void
fs_visitor::calculate_register_pressure()
{
   invalidate_live_intervals();
   calculate_live_intervals();

   unsigned num_instructions = 0;
   foreach_block(block, cfg)
      num_instructions += block->instructions.length();

   regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);

   for (unsigned reg = 0; reg < alloc.count; reg++) {
      for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
         regs_live_at_ip[ip] += alloc.sizes[reg];
   }
}
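
/* In other words, regs_live_at_ip[ip] is the sum of the sizes of all virtual
 * GRFs whose live interval covers instruction ip; dump_instructions() above
 * prints this per-IP figure in braces before each instruction.
 */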

void
fs_visitor::optimize()
{
   /* Start by validating the shader we currently have. */
   validate();

   /* bld is the common builder object pointing at the end of the program we
    * used to translate it into i965 IR.  For the optimization and lowering
    * passes coming next, any code added after the end of the program without
    * having explicitly called fs_builder::at() clearly points at a mistake.
    * Ideally optimization passes wouldn't be part of the visitor so they
    * wouldn't have access to bld at all, but they do, so just in case some
    * pass forgets to ask for a location explicitly set it to NULL here to
    * make it trip.  The dispatch width is initialized to a bogus value to
    * make sure that optimizations set the execution controls explicitly to
    * match the code they are manipulating instead of relying on the defaults.
    */
   bld = fs_builder(this, 64);

   assign_constant_locations();
   demote_pull_constants();

   validate();

   split_virtual_grfs();
   validate();

#define OPT(pass, args...) ({                                           \
      pass_num++;                                                       \
      bool this_progress = pass(args);                                  \
                                                                        \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
         char filename[64];                                             \
         snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass,             \
                  stage_abbrev, dispatch_width, nir->info.name, iteration, pass_num); \
                                                                        \
         backend_shader::dump_instructions(filename);                   \
      }                                                                 \
                                                                        \
      validate();                                                       \
                                                                        \
      progress = progress || this_progress;                             \
      this_progress;                                                    \
   })
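
/* Because the statement-expression above evaluates to this_progress, OPT()
 * works both for its side effects and as a condition; e.g. the
 * "if (OPT(lower_load_payload))" block below re-runs a few cleanup passes
 * only when the lowering actually changed something.
 */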

   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s%d-%s-00-start",
               stage_abbrev, dispatch_width, nir->info.name);

      backend_shader::dump_instructions(filename);
   }

   bool progress = false;
   int iteration = 0;
   int pass_num = 0;

   OPT(lower_simd_width);
   OPT(lower_logical_sends);

   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(remove_duplicate_mrf_writes);

      OPT(opt_algebraic);
      OPT(opt_cse);
      OPT(opt_copy_propagate);
      OPT(opt_peephole_predicated_break);
      OPT(opt_cmod_propagation);
      OPT(dead_code_eliminate);
      OPT(opt_peephole_sel);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_register_renaming);
      OPT(opt_redundant_discard_jumps);
      OPT(opt_saturate_propagation);
      OPT(opt_zero_samples);
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(eliminate_find_live_channel);

      OPT(compact_virtual_grfs);
   } while (progress);

   pass_num = 0;

   OPT(opt_sampler_eot);

   if (OPT(lower_load_payload)) {
      split_virtual_grfs();
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
   }

   OPT(opt_combine_constants);
   OPT(lower_integer_multiplication);

   lower_uniform_pull_constant_loads();

   validate();
}

/**
 * Three source instructions must have a GRF/MRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
 */
void
fs_visitor::fixup_3src_null_dest()
{
   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->is_3src() && inst->dst.is_null()) {
         inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
                            inst->dst.type);
      }
   }
}

void
fs_visitor::allocate_registers()
{
   bool allocated_without_spills;

   static const enum instruction_scheduler_mode pre_modes[] = {
      SCHEDULE_PRE,
      SCHEDULE_PRE_NON_LIFO,
      SCHEDULE_PRE_LIFO,
   };

   /* Try each scheduling heuristic to see if it can successfully register
    * allocate without spilling.  They should be ordered by decreasing
    * performance but increasing likelihood of allocating.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
      schedule_instructions(pre_modes[i]);

      if (0) {
         assign_regs_trivial();
         allocated_without_spills = true;
      } else {
         allocated_without_spills = assign_regs(false);
      }
      if (allocated_without_spills)
         break;
   }

   if (!allocated_without_spills) {
      /* We assume that any spilling is worse than just dropping back to
       * SIMD8.  There's probably actually some intermediate point where
       * SIMD16 with a couple of spills is still better.
       */
      if (dispatch_width == 16) {
         fail("Failure to register allocate.  Reduce number of "
              "live scalar values to avoid this.");
      } else {
         compiler->shader_perf_log(log_data,
                                   "%s shader triggered register spilling.  "
                                   "Try reducing the number of live scalar "
                                   "values to improve performance.\n",
                                   stage_name);
      }

      /* Since we're out of heuristics, just go spill registers until we
       * get an allocation.
       */
      while (!assign_regs(true)) {
         if (failed)
            break;
      }
   }

   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
    * so based on the actual physical registers in use.
    */
   insert_gen4_send_dependency_workarounds();

   if (failed)
      return;

   if (!allocated_without_spills)
      schedule_instructions(SCHEDULE_POST);

   if (last_scratch > 0)
      prog_data->total_scratch = brw_get_scratch_size(last_scratch);
}

bool
fs_visitor::run_vs(gl_clip_plane *clip_planes)
{
   assert(stage == MESA_SHADER_VERTEX);

   setup_vs_payload();

   if (shader_time_index >= 0)
      emit_shader_time_begin();

   emit_nir_code();

   if (failed)
      return false;

   compute_clip_distance(clip_planes);

   emit_urb_writes();

   if (shader_time_index >= 0)
      emit_shader_time_end();

   calculate_cfg();

   optimize();

   assign_curb_setup();
   assign_vs_urb_setup();

   fixup_3src_null_dest();
   allocate_registers();

   return !failed;
}

bool
fs_visitor::run_fs(bool do_rep_send)
{
   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;

   assert(stage == MESA_SHADER_FRAGMENT);

   if (devinfo->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      emit_dummy_fs();
   } else if (do_rep_send) {
      assert(dispatch_width == 16);
      emit_repclear_shader();
   } else {
      if (shader_time_index >= 0)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (nir->info.inputs_read > 0) {
         if (devinfo->gen < 6)
            emit_interpolation_setup_gen4();
         else
            emit_interpolation_setup_gen6();
      }

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (wm_prog_data->uses_kill) {
         fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      emit_nir_code();

      if (failed)
         return false;

      if (wm_prog_data->uses_kill)
         bld.emit(FS_OPCODE_PLACEHOLDER_HALT);

      if (wm_key->alpha_test_func)
         emit_alpha_test();

      emit_fb_writes();

      if (shader_time_index >= 0)
         emit_shader_time_end();

      calculate_cfg();

      optimize();

      assign_curb_setup();
      assign_urb_setup();

      fixup_3src_null_dest();
      allocate_registers();

      if (failed)
         return false;
   }

   if (dispatch_width == 8)
      wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
   else
      wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);

   return !failed;
}

bool
fs_visitor::run_cs()
{
   assert(stage == MESA_SHADER_COMPUTE);

   setup_cs_payload();

   if (shader_time_index >= 0)
      emit_shader_time_begin();

   emit_nir_code();

   if (failed)
      return false;

   emit_cs_terminate();

   if (shader_time_index >= 0)
      emit_shader_time_end();

   calculate_cfg();

   optimize();

   assign_curb_setup();

   fixup_3src_null_dest();
   allocate_registers();

   if (failed)
      return false;

   return !failed;
}

const unsigned *
brw_wm_fs_emit(struct brw_context *brw,
               void *mem_ctx,
               const struct brw_wm_prog_key *key,
               struct brw_wm_prog_data *prog_data,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      brw_dump_ir("fragment", prog, &shader->base, &fp->Base);

   int st_index8 = -1, st_index16 = -1;
   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
      st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
      st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw->intelScreen->compiler, brw, mem_ctx, key,
                &prog_data->base, &fp->Base, fp->Base.nir, 8, st_index8);
   if (!v.run_fs(false /* do_rep_send */)) {
      if (prog) {
         prog->LinkStatus = false;
         ralloc_strcat(&prog->InfoLog, v.fail_msg);
      }

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

   cfg_t *simd16_cfg = NULL;
   fs_visitor v2(brw->intelScreen->compiler, brw, mem_ctx, key,
                 &prog_data->base, &fp->Base, fp->Base.nir, 16, st_index16);
   if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
      if (!v.simd16_unsupported) {
         /* Try a SIMD16 compile */
         v2.import_uniforms(&v);
         if (!v2.run_fs(brw->use_rep_send)) {
            perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
         } else {
            simd16_cfg = v2.cfg;
         }
      }
   }

   cfg_t *simd8_cfg;
   int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
   if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
      simd8_cfg = NULL;
      prog_data->no_8 = true;
   } else {
      simd8_cfg = v.cfg;
      prog_data->no_8 = false;
   }

   fs_generator g(brw->intelScreen->compiler, brw,
                  mem_ctx, (void *) key, &prog_data->base,
                  &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      char *name;
      if (prog)
         name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
                                prog->Label ? prog->Label : "unnamed",
                                prog->Name);
      else
         name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);

      g.enable_debug(name);
   }

   if (simd8_cfg)
      g.generate_code(simd8_cfg, 8);
   if (simd16_cfg)
      prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);

   return g.get_assembly(final_assembly_size);
}
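
/* Summary of the strategy above: a SIMD8 compile is always attempted first,
 * a SIMD16 variant is added when it is supported and compiles successfully,
 * and prog_data->no_8 records the case where only the SIMD16 program is
 * kept (e.g. under INTEL_DEBUG=no8).
 */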

fs_reg *
fs_visitor::emit_cs_local_invocation_id_setup()
{
   assert(stage == MESA_SHADER_COMPUTE);

   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));

   struct brw_reg src =
      brw_vec8_grf(payload.local_invocation_id_reg, 0);
   src = retype(src, BRW_REGISTER_TYPE_UD);
   bld.MOV(*reg, src);
   src.nr += dispatch_width / 8;
   bld.MOV(offset(*reg, bld, 1), src);
   src.nr += dispatch_width / 8;
   bld.MOV(offset(*reg, bld, 2), src);

   return reg;
}

fs_reg *
fs_visitor::emit_cs_work_group_id_setup()
{
   assert(stage == MESA_SHADER_COMPUTE);

   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));

   struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
   struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
   struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));

   bld.MOV(*reg, r0_1);
   bld.MOV(offset(*reg, bld, 1), r0_6);
   bld.MOV(offset(*reg, bld, 2), r0_7);

   return reg;
}
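
/* The three MOVs above pull the X, Y and Z workgroup IDs out of the fixed
 * R0 thread header (dwords 1, 6 and 7 respectively, per the hardware
 * payload layout this function assumes) and replicate each scalar across
 * the channels of the uvec3 temporary.
 */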

const unsigned *
brw_cs_emit(struct brw_context *brw,
            void *mem_ctx,
            const struct brw_cs_prog_key *key,
            struct brw_cs_prog_data *prog_data,
            struct gl_compute_program *cp,
            struct gl_shader_program *prog,
            unsigned *final_assembly_size)
{
   struct brw_shader *shader =
      (struct brw_shader *) prog->_LinkedShaders[MESA_SHADER_COMPUTE];

   if (unlikely(INTEL_DEBUG & DEBUG_CS))
      brw_dump_ir("compute", prog, &shader->base, &cp->Base);

   prog_data->local_size[0] = cp->LocalSize[0];
   prog_data->local_size[1] = cp->LocalSize[1];
   prog_data->local_size[2] = cp->LocalSize[2];
   unsigned local_workgroup_size =
      cp->LocalSize[0] * cp->LocalSize[1] * cp->LocalSize[2];

   cfg_t *cfg = NULL;
   const char *fail_msg = NULL;

   int st_index = -1;
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      st_index = brw_get_shader_time_index(brw, prog, &cp->Base, ST_CS);

   /* Now the main event: Visit the shader IR and generate our CS IR for it.
    */
   fs_visitor v8(brw->intelScreen->compiler, brw, mem_ctx, key,
                 &prog_data->base, &cp->Base, cp->Base.nir, 8, st_index);
   if (!v8.run_cs()) {
      fail_msg = v8.fail_msg;
   } else if (local_workgroup_size <= 8 * brw->max_cs_threads) {
      cfg = v8.cfg;
      prog_data->simd_size = 8;
   }

   fs_visitor v16(brw->intelScreen->compiler, brw, mem_ctx, key,
                  &prog_data->base, &cp->Base, cp->Base.nir, 16, st_index);
   if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
       !fail_msg && !v8.simd16_unsupported &&
       local_workgroup_size <= 16 * brw->max_cs_threads) {
      /* Try a SIMD16 compile */
      v16.import_uniforms(&v8);
      if (!v16.run_cs()) {
         perf_debug("SIMD16 shader failed to compile: %s", v16.fail_msg);
         if (!cfg) {
            fail_msg =
               "Couldn't generate SIMD16 program and not "
               "enough threads for SIMD8";
         }
      } else {
         cfg = v16.cfg;
         prog_data->simd_size = 16;
      }
   }

   if (unlikely(cfg == NULL)) {
      assert(fail_msg);
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, fail_msg);
      _mesa_problem(NULL, "Failed to compile compute shader: %s\n",
                    fail_msg);
      return NULL;
   }

   fs_generator g(brw->intelScreen->compiler, brw,
                  mem_ctx, (void*) key, &prog_data->base, &cp->Base,
                  v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
   if (INTEL_DEBUG & DEBUG_CS) {
      char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d",
                                   prog->Label ? prog->Label : "unnamed",
                                   prog->Name);
      g.enable_debug(name);
   }

   g.generate_code(cfg, prog_data->simd_size);

   return g.get_assembly(final_assembly_size);
}