/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

#include <sys/types.h>

#include "util/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "util/register_allocate.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "main/uniforms.h"
#include "brw_fs_live_variables.h"
#include "glsl/glsl_types.h"
#include "program/sampler.h"

void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
              const fs_reg *src, unsigned sources)
{
   memset(this, 0, sizeof(*this));

   this->src = new fs_reg[MAX2(sources, 3)];
   for (unsigned i = 0; i < sources; i++)
      this->src[i] = src[i];

   this->opcode = opcode;
   this->dst = dst;
   this->sources = sources;
   this->exec_size = exec_size;

   assert(dst.file != IMM && dst.file != UNIFORM);

   /* If exec_size == 0, try to guess it from the registers.  Since all
    * manner of things may use hardware registers, we first try to guess
    * based on GRF registers.  If this fails, we will go ahead and take the
    * width from the destination register.
    */
   if (this->exec_size == 0) {
      if (dst.file == GRF) {
         this->exec_size = dst.width;
      } else {
         for (unsigned i = 0; i < sources; ++i) {
            if (src[i].file != GRF && src[i].file != ATTR)
               continue;

            if (this->exec_size <= 1)
               this->exec_size = src[i].width;
            assert(src[i].width == 1 || src[i].width == this->exec_size);
         }
      }

      if (this->exec_size == 0 && dst.file != BAD_FILE)
         this->exec_size = dst.width;
   }
   assert(this->exec_size != 0);

   for (unsigned i = 0; i < sources; ++i) {
      switch (this->src[i].file) {
      case BAD_FILE:
         this->src[i].effective_width = 8;
         break;
      case GRF:
      case HW_REG:
      case ATTR:
         assert(this->src[i].width > 0);
         if (this->src[i].width == 1) {
            this->src[i].effective_width = this->exec_size;
         } else {
            this->src[i].effective_width = this->src[i].width;
         }
         break;
      case IMM:
      case UNIFORM:
         this->src[i].effective_width = this->exec_size;
         break;
      default:
         unreachable("Invalid source register file");
      }
   }
   this->dst.effective_width = this->exec_size;

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case GRF:
   case HW_REG:
   case MRF:
   case ATTR:
      this->regs_written =
         DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
      break;
   case BAD_FILE:
      this->regs_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   default:
      unreachable("Invalid register file");
   }

   this->writes_accumulator = false;
}
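
/* The constructors below all forward to init().  Passing an exec_size of 0
 * asks init() to infer the execution width from the destination and source
 * registers; the explicit-width overloads cover cases where the registers
 * alone are ambiguous (e.g. a null destination).
 */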

fs_inst::fs_inst()
{
   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   init(opcode, exec_size, reg_undef, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
{
   init(opcode, 0, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, exec_size, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, 0, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, exec_size, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, 0, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, exec_size, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, 0, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, 0, dst, src, sources);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, exec_width, dst, src, sources);
}

fs_inst::fs_inst(const fs_inst &that)
{
   memcpy(this, &that, sizeof(that));

   this->src = new fs_reg[MAX2(that.sources, 3)];

   for (unsigned i = 0; i < that.sources; i++)
      this->src[i] = that.src[i];
}

fs_inst::~fs_inst()
{
   delete[] this->src;
}

void
fs_inst::resize_sources(uint8_t num_sources)
{
   if (this->sources != num_sources) {
      fs_reg *src = new fs_reg[MAX2(num_sources, 3)];

      for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
         src[i] = this->src[i];

      delete[] this->src;
      this->src = src;
      this->sources = num_sources;
   }
}
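
/* The ALU1/ALU2/ALU2_ACC/ALU3 macros below stamp out one emit helper per
 * opcode; each simply allocates an fs_inst for BRW_OPCODE_<op> out of
 * mem_ctx (ALU2_ACC additionally marks the instruction as writing the
 * accumulator).  The instruction is returned unemitted so the caller can
 * tweak it before adding it to the instruction list.
 */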

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU2_ACC(op)                                                    \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1, const fs_reg &src2)               \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(SEL)
ALU2(MAC)
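
/* For reference, ALU2(ADD) above expands to roughly:
 *
 *    fs_inst *
 *    fs_visitor::ADD(const fs_reg &dst, const fs_reg &src0,
 *                    const fs_reg &src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */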

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(enum brw_predicate predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
   inst->predicate = predicate;
   return inst;
}

/** Gen6 IF with embedded comparison. */
fs_inst *
fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
               enum brw_conditional_mod condition)
{
   assert(devinfo->gen == 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
                enum brw_conditional_mod condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;
   if (dst.file == HW_REG)
      dst.fixed_hw_reg.type = dst.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
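
/* LOAD_PAYLOAD gathers `sources` registers into one contiguous block for a
 * send message.  The execution size is the widest of the destination and
 * the sources, and regs_written is recomputed as the sum of the sources'
 * effective widths in whole 32-byte registers.
 */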

fs_inst *
fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
{
   uint8_t exec_size = dst.width;
   for (int i = 0; i < sources; ++i) {
      assert(src[i].width % dst.width == 0);
      if (src[i].width > exec_size)
         exec_size = src[i].width;
   }

   fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
                                        dst, src, sources);
   inst->regs_written = 0;
   for (int i = 0; i < sources; ++i) {
      /* The LOAD_PAYLOAD instruction only really makes sense if we are
       * dealing with whole registers.  If this ever changes, we can deal
       * with it later.
       */
      int size = inst->src[i].effective_width * type_sz(src[i].type);
      assert(size % 32 == 0);
      inst->regs_written += (size + 31) / 32;
   }

   return inst;
}
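
/* VARYING_PULL_CONSTANT_LOAD returns (rather than emits) the instruction
 * sequence for a pull-constant load at a non-constant offset: an ADD to
 * form the dword offset, a send that pulls four contiguous components into
 * a temporary, and a MOV that selects the requested component from it.
 */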

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
                                       const fs_reg &surf_index,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = vgrf(glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, fs_reg(const_offset & ~3)));

   int scale = 1;
   if (devinfo->gen == 4 && dst.width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (devinfo->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;

   assert(dst.width % 8 == 0);
   int regs_written = 4 * (dst.width / 8) * scale;
   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
                               dst.type, dst.width);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = regs_written;
   instructions.push_tail(inst);

   if (devinfo->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (devinfo->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
   instructions.push_tail(MOV(dst, result));

   return instructions;
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->exec_size = 8;

   return inst;
}
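
/* fs_inst::equals() is a field-by-field comparison of everything that
 * affects the generated code.  Note that it compares only the first three
 * sources, which is always safe since the source array has at least three
 * entries.
 */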

bool
fs_inst::equals(fs_inst *inst) const
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           exec_size == inst->exec_size &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg) const
{
   return reg.in_range(dst, regs_written);
}

bool
fs_inst::is_send_from_grf() const
{
   switch (opcode) {
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
      return true;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return src[1].file == GRF;
   case FS_OPCODE_FB_WRITE:
      return src[0].file == GRF;
   default:
      if (is_tex())
         return src[0].file == GRF;

      return false;
   }
}
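
/* Source modifiers (negate/abs) are unavailable on gen6 math instructions
 * and on messages whose payload lives in the GRF; everything else defers
 * to the generic backend_instruction rules.
 */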

bool
fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
{
   if (devinfo->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

bool
fs_inst::has_side_effects() const
{
   return this->eot || backend_instruction::has_side_effects();
}
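
/* fs_reg::init() establishes the defaults shared by every fs_reg
 * constructor: zero everything, then give the register the packed stride
 * of 1.  The immediate constructors set width to 1, since a scalar
 * immediate applies to every channel.
 */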

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   stride = 1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->fixed_hw_reg.dw1.f = f;
   this->width = 1;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->fixed_hw_reg.dw1.d = i;
   this->width = 1;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->fixed_hw_reg.dw1.ud = u;
   this->width = 1;
}

/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf[4])
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
}

/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
                               (vf1 << 8) |
                               (vf2 << 16) |
                               (vf3 << 24);
}

/** Fixed brw_reg. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
   this->width = 1 << fixed_hw_reg.width;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           subreg_offset == r.subreg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
           width == r.width &&
           stride == r.stride);
}

fs_reg &
fs_reg::set_smear(unsigned subreg)
{
   assert(file != HW_REG && file != IMM);
   subreg_offset = subreg * type_sz(type);
   stride = 0;
   return *this;
}

bool
fs_reg::is_contiguous() const
{
   return stride == 1;
}
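
/* type_size() measures a GLSL type in scalar (32-bit) components, which is
 * the unit the FS backend allocates virtual registers in; opaque types
 * such as samplers and atomic counters occupy no register space.
 */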

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
   case GLSL_TYPE_DOUBLE:
      unreachable("not reached");
   }

   return 0;
}

/**
 * Create a MOV to read the timestamp register.
 *
 * The caller is responsible for emitting the MOV.  The return value is
 * the destination of the MOV, with extra parameters set.
 */
fs_reg
fs_visitor::get_timestamp(fs_inst **out_mov)
{
   assert(devinfo->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);

   fs_inst *mov = MOV(dst, ts);
   /* We want to read the 3 fields we care about even if it's not enabled in
    * the dispatch.
    */
   mov->force_writemask_all = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.set_smear(0);

   *out_mov = mov;
   return dst;
}
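
/* The emit_shader_time_begin()/emit_shader_time_end() pair implements the
 * shader-time instrumentation (presumably the INTEL_DEBUG=shader_time
 * path): the timestamp register is sampled at the start and again just
 * before the final EOT send, and the guarded difference is accumulated
 * into the shader-time buffer via SHADER_OPCODE_SHADER_TIME_ADD.
 */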

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   fs_inst *mov;
   shader_start_time = get_timestamp(&mov);
   emit(mov);
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   switch (stage) {
   case MESA_SHADER_VERTEX:
      type = ST_VS;
      written_type = ST_VS_WRITTEN;
      reset_type = ST_VS_RESET;
      break;
   case MESA_SHADER_GEOMETRY:
      type = ST_GS;
      written_type = ST_GS_WRITTEN;
      reset_type = ST_GS_RESET;
      break;
   case MESA_SHADER_FRAGMENT:
      if (dispatch_width == 8) {
         type = ST_FS8;
         written_type = ST_FS8_WRITTEN;
         reset_type = ST_FS8_RESET;
      } else {
         assert(dispatch_width == 16);
         type = ST_FS16;
         written_type = ST_FS16_WRITTEN;
         reset_type = ST_FS16_RESET;
      }
      break;
   case MESA_SHADER_COMPUTE:
      type = ST_CS;
      written_type = ST_CS_WRITTEN;
      reset_type = ST_CS_RESET;
      break;
   default:
      unreachable("fs_visitor::emit_shader_time_end missing code");
   }

   /* Insert our code just before the final SEND with EOT. */
   exec_node *end = this->instructions.get_tail();
   assert(end && ((fs_inst *) end)->eot);

   fs_inst *tm_read;
   fs_reg shader_end_time = get_timestamp(&tm_read);
   end->insert_before(tm_read);

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.set_smear(2);
   fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   test->force_writemask_all = true;
   end->insert_before(test);
   end->insert_before(IF(BRW_PREDICATE_NORMAL));

   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
   diff.set_smear(0);
   fs_inst *add = ADD(diff, start, shader_end_time);
   add->force_writemask_all = true;
   end->insert_before(add);

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   add = ADD(diff, diff, fs_reg(-2u));
   add->force_writemask_all = true;
   end->insert_before(add);

   end->insert_before(SHADER_TIME_ADD(type, diff));
   end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
   end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
}

fs_inst *
fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, prog, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = vgrf(glsl_type::uvec2_type);
   else
      payload = vgrf(glsl_type::uint_type);

   return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                               fs_reg(), payload, offset, value);
}

void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (debug_enabled) {
      fprintf(stderr, "%s", msg);
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}

/**
 * Mark this program as impossible to compile in SIMD16 mode.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16 mode, so the compiler can skip
 * the SIMD16 compile altogether.
 *
 * During a SIMD16 compile (if one happens anyway), this just calls fail().
 */
void
fs_visitor::no16(const char *format, ...)
{
   va_list va;

   va_start(va, format);

   if (dispatch_width == 16) {
      vfail(format, va);
   } else {
      simd16_unsupported = true;

      if (brw->perf_debug) {
         if (no16_msg)
            ralloc_vasprintf_append(&no16_msg, format, va);
         else
            no16_msg = ralloc_vasprintf(mem_ctx, format, va);
      }
   }

   va_end(va);
}
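
/* The emit() overloads below construct an instruction out of mem_ctx and
 * hand it to the emit(fs_inst *) overload (defined elsewhere in this
 * file), which appends it to the current instruction list.
 */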

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1, const fs_reg &src2)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
                 fs_reg src[], int sources)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write() const
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           (this->dst.width * type_sz(this->dst.type)) < 32 ||
           !this->dst.is_contiguous());
}
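
/* regs_read() reports how many hardware registers source `arg` occupies.
 * Message-style opcodes that source their payload from the GRF read mlen
 * registers through src 0; for ordinary sources the count follows from
 * width, stride, and type size.
 */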

int
fs_inst::regs_read(int arg) const
{
   if (is_tex() && arg == 0 && src[0].file == GRF) {
      return mlen;
   } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
      return mlen;
   } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
      return mlen;
   } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
      return exec_size / 4;
   }

   switch (src[arg].file) {
   case BAD_FILE:
   case UNIFORM:
   case IMM:
      return 1;
   case GRF:
   case HW_REG:
      if (src[arg].stride == 0) {
         return 1;
      } else {
         int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
         return (size + 31) / 32;
      }
   case MRF:
      unreachable("MRF registers are not allowed as sources");
   default:
      unreachable("Invalid register file");
   }
}
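
/* Flag-register dataflow: any predicated instruction reads the flag; a
 * conditional mod writes it, except on SEL/IF/WHILE where the conditional
 * mod only consumes an existing flag value.
 */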

bool
fs_inst::reads_flag() const
{
   return predicate;
}

bool
fs_inst::writes_flag() const
{
   return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
                               opcode != BRW_OPCODE_IF &&
                               opcode != BRW_OPCODE_WHILE)) ||
          opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
}
|
|
|
|
|
|
|
2010-11-19 15:57:05 +08:00
|
|
|
|
/**
|
|
|
|
|
|
* Returns how many MRFs an FS opcode will write over.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Note that this is not the 0 or 1 implied writes in an actual gen
|
|
|
|
|
|
* instruction -- the FS opcodes often generate MOVs in addition.
|
|
|
|
|
|
*/
|
|
|
|
|
|
int
|
|
|
|
|
|
fs_visitor::implied_mrf_writes(fs_inst *inst)
|
|
|
|
|
|
{
|
|
|
|
|
|
if (inst->mlen == 0)
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
i965/fs: Convert gen7 to using GRFs for texture messages.
Looking at Lightsmark's shaders, the way we used MRFs (or in gen7's
case, GRFs) was bad in a couple of ways. One was that it prevented
compute-to-MRF for the common case of a texcoord that gets used
exactly once, but where the texcoord setup all gets emitted before the
texture calls (such as when it's a bare fragment shader input, which
gets interpolated before processing main()). Another was that it
introduced a bunch of dependencies that constrained scheduling, and
forced waits for texture operations to be done before they are
required. For example, we can now move the compute-to-MRF
interpolation for the second texture send down after the first send.
The downside is that this generally prevents
remove_duplicate_mrf_writes() from doing anything, whereas previously
it avoided work for the case of sampling from the same texcoord twice.
However, I suspect that most of the win that originally justified that
code was in avoiding the WAR stall on the first send, which this patch
also avoids, rather than the small cost of the extra instruction. We
see instruction count regressions in shaders in unigine, yofrankie,
savage2, hon, and gstreamer.
Improves GLB2.7 performance by 0.633628% +/- 0.491809% (n=121/125, avg of
~66fps, outliers below 61 dropped).
Improves openarena performance by 1.01092% +/- 0.66897% (n=425).
No significant difference on Lightsmark (n=44).
v2: Squash in the fix for register unspilling for send-from-GRF, fixing a
segfault in lightsmark.
Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Acked-by: Matt Turner <mattst88@gmail.com>
2013-10-09 17:17:59 -07:00
   if (inst->base_mrf == -1)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 2;
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return 0;
   default:
      unreachable("not reached");
   }
}
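
/* Worked example (illustrative): in a SIMD16 program, dispatch_width is 16,
 * so SHADER_OPCODE_POW reports 2 * 16 / 8 = 4 MRFs written (two operands,
 * two registers each), while SHADER_OPCODE_COS reports 1 * 16 / 8 = 2.
 */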

fs_reg
fs_visitor::vgrf(const glsl_type *const type)
{
   int reg_width = dispatch_width / 8;
   return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
                 brw_type_for_base_type(type), dispatch_width);
}
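
/* Illustrative use: vgrf(glsl_type::vec4_type) in a SIMD16 shader allocates
 * type_size(vec4) * (16 / 8) = 4 * 2 = 8 contiguous virtual GRF slots.
 */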

fs_reg
fs_visitor::vgrf(int num_components)
{
   int reg_width = dispatch_width / 8;
   return fs_reg(GRF, alloc.allocate(num_components * reg_width),
                 BRW_REGISTER_TYPE_F, dispatch_width);
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;

   switch (file) {
   case UNIFORM:
      this->width = 1;
      break;
   default:
      this->width = 8;
   }
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;

   switch (file) {
   case UNIFORM:
      this->width = 1;
      break;
   default:
      this->width = 8;
   }
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
               uint8_t width)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
   this->width = width;
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->push_constant_loc = v->push_constant_loc;
   this->pull_constant_loc = v->pull_constant_loc;
   this->uniforms = v->uniforms;
   this->param_size = v->param_size;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = uniforms;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         stage_prog_data->param[uniforms++] = &storage->storage[i];
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() == uniforms);
   (void)params_before;
}
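
/* Illustrative matching: for ir->name "light", the prefix test above accepts
 * storage names "light", "light.position" and "light[2]", but rejects
 * "lights", since the character after the prefix must be '\0', '.' or '['.
 */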

/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->get_state_slots();
   assert(slots != NULL);

   for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->prog->Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         stage_prog_data->param[uniforms++] =
            &prog->Parameters->ParameterValues[index][swiz];
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
                                         bool origin_upper_left)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
   fs_reg wpos = *reg;
   bool flip = !origin_upper_left ^ key->render_to_fbo;

   /* gl_FragCoord.x */
   if (pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.y */
   if (!flip && pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += key->drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.z */
   if (devinfo->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
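
/* Illustrative flip: when rendering to the window system (render_to_fbo ==
 * false) with the GL default lower-left origin, flip is true, so
 * gl_FragCoord.y becomes (drawable_height - 1) - pixel_y, plus the half-pixel
 * offset when pixel centers are not integer.
 */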

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid, bool is_sample)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (devinfo->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else if (is_sample) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_xy[barycoord_mode], interp);
}
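
/* Summary of the Gen6+ selection above: the qualifier picks PERSPECTIVE
 * (smooth) vs. NONPERSPECTIVE, and the location picks _PIXEL, _CENTROID or
 * _SAMPLE, giving the six BRW_WM_*_BARYCENTRIC modes.
 */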

void
fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
                                       const glsl_type *type,
                                       glsl_interp_qualifier interpolation_mode,
                                       int location, bool mod_centroid,
                                       bool mod_sample)
{
   attr.type = brw_type_for_base_type(type->get_scalar_type());

   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   unsigned int array_elements;

   if (type->is_array()) {
      array_elements = type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", name);
      }
      type = type->fields.array;
   } else {
      array_elements = 1;
   }

   if (interpolation_mode == INTERP_QUALIFIER_NONE) {
      bool is_gl_Color =
         location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
      if (key->flat_shade && is_gl_Color) {
         interpolation_mode = INTERP_QUALIFIER_FLAT;
      } else {
         interpolation_mode = INTERP_QUALIFIER_SMOOTH;
      }
   }

   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (prog_data->urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr = offset(attr, type->vector_elements);
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = attr.type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr = offset(attr, 1);
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);

                  fs_inst *inst;
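
                  /* The two emit_linterp() calls below are predicated with
                   * complementary predicates, so together they write every
                   * channel of attr exactly once.  Since at least one channel
                   * of the dispatch is always live, the +f0 instruction can
                   * go last, which is what makes the no_dd_clear/no_dd_check
                   * scoreboard hints safe when PLN is used.
                   */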
                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      false, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
                  if (devinfo->has_pln)
                     inst->no_dd_clear = true;

                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      mod_centroid && !key->persample_shading,
                                      mod_sample || key->persample_shading);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = false;
                  if (devinfo->has_pln)
                     inst->no_dd_check = true;

               } else {
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               mod_centroid && !key->persample_shading,
                               mod_sample || key->persample_shading);
               }
               if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr = offset(attr, 1);
            }
         }
         location++;
      }
   }
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation()
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));

   if (devinfo->gen >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (~0/true or 0/false).
       *
       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
       * this task in only one instruction:
       *    - a negation source modifier will flip the bit; and
       *    - a W -> D type conversion will sign extend the bit into the high
       *      word of the destination.
       *
       * An ASR 15 fills the low word of the destination.
       */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
      g0.negate = true;

      emit(ASR(*reg, g0, fs_reg(15)));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (1/true or 0/false).
       *
       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
       * the negation source modifier to flip it. Unfortunately the SHR
       * instruction only operates on UD (or D with an abs source modifier)
       * sources without negation.
       *
       * Instead, use ASR (which will give ~0/true or 0/false).
       */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
      g1_6.negate = true;

      emit(ASR(*reg, g1_6, fs_reg(31)));
   }

   return reg;
}

void
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   assert(dst.type == BRW_REGISTER_TYPE_F);

   if (key->compute_pos_offset) {
      /* Convert int_sample_pos to floating point */
      emit(MOV(dst, int_sample_pos));
      /* Scale to the range [0, 1] */
      emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
   } else {
      /* From the ARB_sample_shading specification:
       *   "When rendering to a non-multisample buffer, or if multisample
       *    rasterization is disabled, gl_SamplePosition will always be
       *    (0.5, 0.5)."
       */
      emit(MOV(dst, fs_reg(0.5f)));
   }
}
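
/* Illustrative scaling: sample positions arrive as integers in 1/16ths of a
 * pixel, so an int_sample_pos value of 8 becomes 8 * (1 / 16.0f) = 0.5, the
 * pixel center.
 */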

fs_reg *
fs_visitor::emit_samplepos_setup()
{
   assert(devinfo->gen >= 6);

   this->current_annotation = "compute sample position";
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
   fs_reg pos = *reg;
   fs_reg int_sample_x = vgrf(glsl_type::int_type);
   fs_reg int_sample_y = vgrf(glsl_type::int_type);

   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
    * mode will be enabled.
    *
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
    * R31.1:0         Position Offset X/Y for Slot[3:0]
    * R31.3:2         Position Offset X/Y for Slot[7:4]
    * .....
    *
    * The X, Y sample positions come in as bytes in the thread payload. So,
    * read the positions using vstride=16, width=8, hstride=2.
    */
   struct brw_reg sample_pos_reg =
      stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
                    BRW_REGISTER_TYPE_B), 16, 8, 2);

   if (dispatch_width == 8) {
      emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
   } else {
      emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
      emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
         ->force_sechalf = true;
   }
   /* Compute gl_SamplePosition.x */
   compute_sample_position(pos, int_sample_x);
   pos = offset(pos, 1);
   if (dispatch_width == 8) {
      emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
   } else {
      emit(MOV(half(int_sample_y, 0),
               fs_reg(suboffset(sample_pos_reg, 1))));
      emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
         ->force_sechalf = true;
   }
   /* Compute gl_SamplePosition.y */
   compute_sample_position(pos, int_sample_y);
   return reg;
}

fs_reg *
fs_visitor::emit_sampleid_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   assert(devinfo->gen >= 6);

   this->current_annotation = "compute sample id";
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));

   if (key->compute_sample_id) {
      fs_reg t1 = vgrf(glsl_type::int_type);
      fs_reg t2 = vgrf(glsl_type::int_type);
      t2.type = BRW_REGISTER_TYPE_UW;

      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
       * 8x multisampling, subspan 0 will represent sample N (where N
       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
       * 7. We can find the value of N by looking at R0.0 bits 7:6
       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
       * (since samples are always delivered in pairs). That is, we
       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
       * populating a temporary variable with the sequence (0, 1, 2, 3),
       * and then reading from it using vstride=1, width=4, hstride=0.
       * These computations hold good for 4x multisampling as well.
       *
       * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
       * the first four slots are sample 0 of subspan 0; the next four
       * are sample 1 of subspan 0; the third group is sample 0 of
       * subspan 1, and finally sample 1 of subspan 1.
       */
      fs_inst *inst;
      inst = emit(BRW_OPCODE_AND, t1,
                  fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                  fs_reg(0xc0));
      inst->force_writemask_all = true;
      inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
      inst->force_writemask_all = true;
      /* This works for both SIMD8 and SIMD16 */
      inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
      inst->force_writemask_all = true;
      /* This special instruction takes care of setting vstride=1,
       * width=4, hstride=0 of t2 during an ADD instruction.
       */
      emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
   } else {
      /* As per GL_ARB_sample_shading specification:
       *   "When rendering to a non-multisample buffer, or if multisample
       *    rasterization is disabled, gl_SampleID will always be zero."
       */
      emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
   }

   return reg;
}
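
/* Worked example (illustrative): in 8x MSAA SIMD8, if R0.0 bits 7:6 hold
 * SSPI == 2, then N = 2 * 2 = 4, and adding the (0, 0, 0, 0, 1, 1, 1, 1)
 * sequence yields per-channel sample IDs (4, 4, 4, 4, 5, 5, 5, 5).
 */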

void
fs_visitor::resolve_source_modifiers(fs_reg *src)
{
   if (!src->abs && !src->negate)
      return;

   fs_reg temp = retype(vgrf(1), src->type);
   emit(MOV(temp, *src));
   *src = temp;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math.
    */
   if (devinfo->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = vgrf(glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}
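
/* Illustrative use: on Gen7, fix_math_operand(fs_reg(2.0f)) copies the
 * immediate into a fresh float VGRF via a MOV, since math can't take IMM
 * sources there; a plain GRF source comes back unchanged.
 */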

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      unreachable("not reached: bad math opcode");
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (devinfo->gen == 6 || devinfo->gen == 7)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (devinfo->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   if (devinfo->gen >= 8) {
      inst = emit(opcode, dst, src0, src1);
   } else if (devinfo->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       * ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
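
/* Note the operand swap in the Gen4/5 path above: for INT_QUOTIENT and
 * INT_REMAINDER, src1 (the denominator) is sent as operand 0 and src0 (the
 * numerator) is written to the second MRF, matching the PRM operand layout
 * quoted in the comment.
 */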

void
fs_visitor::emit_discard_jump()
{
   assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);

   /* For performance, after a discard, jump to the end of the
    * shader if all relevant channels have been discarded.
    */
   fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
   discard_jump->flag_subreg = 1;

   discard_jump->predicate = (dispatch_width == 8)
                             ? BRW_PREDICATE_ALIGN1_ANY8H
                             : BRW_PREDICATE_ALIGN1_ANY16H;
   discard_jump->predicate_inverse = true;
}
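
/* Note on the predicate above: ANY8H/ANY16H is true when any channel's flag
 * bit is set, so with predicate_inverse the jump only executes once no
 * channel in the group still has its flag bit set, i.e. all relevant
 * channels have been discarded.
 */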

void
fs_visitor::assign_curb_setup()
{
   if (dispatch_width == 8) {
      prog_data->dispatch_grf_start_reg = payload.num_regs;
   } else {
      if (stage == MESA_SHADER_FRAGMENT) {
         brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
         prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
      } else if (stage == MESA_SHADER_COMPUTE) {
         brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
         prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
      } else {
         unreachable("Unsupported shader type!");
      }
   }

   prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
            int constant_nr;
            if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = push_constant_loc[uniform_nr];
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                *  values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].subreg_offset);
         }
      }
   }
}
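
/* Worked example (illustrative): with payload.num_regs == 2, push constant
 * slot 11 maps to brw_vec1_grf(2 + 11 / 8, 11 % 8), i.e. g3.3.
 */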

void
fs_visitor::calculate_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   memset(prog_data->urb_setup, -1,
          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (devinfo->gen >= 6) {
      if (_mesa_bitcount_64(prog->InputsRead &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               prog_data->urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                             key->input_slots_valid);
         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
             * unused.
             */
            if (varying != BRW_VARYING_SLOT_COUNT &&
                (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               prog_data->urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (key->input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               prog_data->urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS-only attribute, and we did interpolation for this attribute
       * in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   prog_data->num_varying_inputs = urb_next;
}

void
fs_visitor::assign_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;

   int urb_start = payload.num_regs + prog_data->base.curb_read_length;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[1].file == HW_REG);
         inst->src[1].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf =
      urb_start + prog_data->num_varying_inputs * 2;
}

void
fs_visitor::assign_vs_urb_setup()
{
   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
   int grf, count, slot, channel, attr;

   assert(stage == MESA_SHADER_VERTEX);
   count = _mesa_bitcount_64(vs_prog_data->inputs_read);
   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
      count++;

   /* Each attribute is 4 regs. */
   this->first_non_payload_grf =
      payload.num_regs + prog_data->curb_read_length + count * 4;

   unsigned vue_entries =
      MAX2(count, vs_prog_data->base.vue_map.num_slots);

   vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
   vs_prog_data->base.urb_read_length = (count + 1) / 2;

   assert(vs_prog_data->base.urb_read_length <= 15);

   /* Rewrite all ATTR file references to the hw grf that they land in. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == ATTR) {

            if (inst->src[i].reg == VERT_ATTRIB_MAX) {
               slot = count - 1;
            } else {
               /* Attributes come in a contiguous block, ordered by their
                * gl_vert_attrib value.  That means we can compute the slot
                * number for an attribute by masking out the enabled
                * attributes before it and counting the bits.
                */
               attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
               slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
                                        BITFIELD64_MASK(attr));
            }

            channel = inst->src[i].reg_offset & 3;

            grf = payload.num_regs +
               prog_data->curb_read_length +
               slot * 4 + channel;

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg =
               retype(brw_vec8_grf(grf, 0), inst->src[i].type);
         }
      }
   }
}
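
/* Illustrative slot math: if inputs_read has bits {0, 2, 5} set and an
 * instruction reads attribute 5, the slot is
 * _mesa_bitcount_64(inputs_read & BITFIELD64_MASK(5)) == 2, i.e. the third
 * enabled attribute.
 */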
|
|
|
|
|
|
|
2010-10-13 20:17:15 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Split large virtual GRFs into separate components if we can.
|
|
|
|
|
|
*
|
|
|
|
|
|
* This is mostly duplicated with what brw_fs_vector_splitting does,
|
|
|
|
|
|
* but that's really conservative because it's afraid of doing
|
|
|
|
|
|
* splitting that doesn't result in real progress after the rest of
|
|
|
|
|
|
* the optimization phases, which would cause infinite looping in
|
|
|
|
|
|
* optimization. We can do it once here, safely. This also has the
|
|
|
|
|
|
* opportunity to split interpolated values, or maybe even uniforms,
|
|
|
|
|
|
* which we don't have at the IR level.
|
|
|
|
|
|
*
|
|
|
|
|
|
* We want to split, because virtual GRFs are what we register
|
|
|
|
|
|
* allocate and spill (due to contiguousness requirements for some
|
|
|
|
|
|
* instructions), and they're what we naturally generate in the
|
|
|
|
|
|
* codegen process, but most virtual GRFs don't actually need to be
|
|
|
|
|
|
* contiguous sets of GRFs. If we split, we'll end up with reduced
|
|
|
|
|
|
* live intervals and better dead code elimination and coalescing.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::split_virtual_grfs()
|
|
|
|
|
|
{
|
2015-02-10 15:51:34 +02:00
|
|
|
|
int num_vars = this->alloc.count;
|
2010-10-13 20:17:15 -07:00
|
|
|
|
|
2014-08-19 13:57:11 -07:00
|
|
|
|
/* Count the total number of registers */
|
|
|
|
|
|
int reg_count = 0;
|
|
|
|
|
|
int vgrf_to_reg[num_vars];
|
2010-10-13 20:17:15 -07:00
|
|
|
|
for (int i = 0; i < num_vars; i++) {
|
2014-08-19 13:57:11 -07:00
|
|
|
|
vgrf_to_reg[i] = reg_count;
|
2015-02-10 15:51:34 +02:00
|
|
|
|
reg_count += alloc.sizes[i];
|
2014-08-19 13:57:11 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* An array of "split points". For each register slot, this indicates
|
|
|
|
|
|
* if this slot can be separated from the previous slot. Every time an
|
|
|
|
|
|
* instruction uses multiple elements of a register (as a source or
|
|
|
|
|
|
* destination), we mark the used slots as inseparable. Then we go
|
|
|
|
|
|
* through and split the registers into the smallest pieces we can.
|
|
|
|
|
|
*/
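   /* Illustrative sketch (made-up accesses): for a 4-slot VGRF at base b
    * that only sees a 2-slot write at offset 0 and a 2-slot read at
    * offset 2, the passes below clear split_points[b+1] and
    * split_points[b+3] but leave split_points[b+2] set, so the VGRF
    * splits into two independent 2-slot registers.
    */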
|
|
|
|
|
|
bool split_points[reg_count];
|
|
|
|
|
|
memset(split_points, 0, sizeof(split_points));
|
|
|
|
|
|
|
|
|
|
|
|
/* Mark all used registers as fully splittable */
|
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
|
|
|
|
|
if (inst->dst.file == GRF) {
|
|
|
|
|
|
int reg = vgrf_to_reg[inst->dst.reg];
|
2015-02-10 15:51:34 +02:00
|
|
|
|
for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
|
2014-08-19 13:57:11 -07:00
|
|
|
|
split_points[reg + j] = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
|
|
|
|
|
if (inst->src[i].file == GRF) {
|
|
|
|
|
|
int reg = vgrf_to_reg[inst->src[i].reg];
|
2015-02-10 15:51:34 +02:00
|
|
|
|
for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
|
2014-08-19 13:57:11 -07:00
|
|
|
|
split_points[reg + j] = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2014-08-19 13:57:11 -07:00
|
|
|
|
if (inst->dst.file == GRF) {
|
|
|
|
|
|
int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
|
|
|
|
|
|
for (int j = 1; j < inst->regs_written; j++)
|
|
|
|
|
|
split_points[reg + j] = false;
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
2014-08-19 13:57:11 -07:00
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
|
|
|
|
|
if (inst->src[i].file == GRF) {
|
|
|
|
|
|
int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
|
2015-02-06 01:24:17 +02:00
|
|
|
|
for (int j = 1; j < inst->regs_read(i); j++)
|
2014-08-19 13:57:11 -07:00
|
|
|
|
split_points[reg + j] = false;
|
2013-08-28 11:22:01 -07:00
|
|
|
|
}
|
2013-03-19 15:28:11 -07:00
|
|
|
|
}
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-08-19 13:57:11 -07:00
|
|
|
|
int new_virtual_grf[reg_count];
|
|
|
|
|
|
int new_reg_offset[reg_count];
|
|
|
|
|
|
|
|
|
|
|
|
int reg = 0;
|
2010-10-13 20:17:15 -07:00
|
|
|
|
for (int i = 0; i < num_vars; i++) {
|
2014-08-19 13:57:11 -07:00
|
|
|
|
      /* The first slot can never be a split point; assert it as a quick sanity check. */
|
|
|
|
|
|
assert(split_points[reg] == false);
|
|
|
|
|
|
|
|
|
|
|
|
/* j = 0 case */
|
|
|
|
|
|
new_reg_offset[reg] = 0;
|
|
|
|
|
|
reg++;
|
|
|
|
|
|
int offset = 1;
|
|
|
|
|
|
|
|
|
|
|
|
/* j > 0 case */
|
2015-02-10 15:51:34 +02:00
|
|
|
|
for (unsigned j = 1; j < alloc.sizes[i]; j++) {
|
2014-08-19 13:57:11 -07:00
|
|
|
|
/* If this is a split point, reset the offset to 0 and allocate a
|
|
|
|
|
|
          * new virtual GRF covering the previous "offset" registers.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (split_points[reg]) {
|
2014-10-01 10:54:59 -07:00
|
|
|
|
assert(offset <= MAX_VGRF_SIZE);
|
2015-02-10 15:51:34 +02:00
|
|
|
|
int grf = alloc.allocate(offset);
|
2014-08-19 13:57:11 -07:00
|
|
|
|
for (int k = reg - offset; k < reg; k++)
|
|
|
|
|
|
new_virtual_grf[k] = grf;
|
|
|
|
|
|
offset = 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
new_reg_offset[reg] = offset;
|
|
|
|
|
|
offset++;
|
|
|
|
|
|
reg++;
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
2014-08-19 13:57:11 -07:00
|
|
|
|
|
|
|
|
|
|
/* The last one gets the original register number */
|
2014-10-01 10:54:59 -07:00
|
|
|
|
assert(offset <= MAX_VGRF_SIZE);
|
2015-02-10 15:51:34 +02:00
|
|
|
|
alloc.sizes[i] = offset;
|
2014-08-19 13:57:11 -07:00
|
|
|
|
for (int k = reg - offset; k < reg; k++)
|
|
|
|
|
|
new_virtual_grf[k] = i;
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
2014-08-19 13:57:11 -07:00
|
|
|
|
assert(reg == reg_count);
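   /* End-state sketch for the example above: the 4-slot VGRF i with its
    * only split point at slot 2 allocates a fresh 2-slot VGRF for slots
    * 0-1, keeps number i (resized to 2 slots) for slots 2-3, and
    * new_reg_offset maps each slot to offset 0 or 1 within its new VGRF.
    */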
|
2010-10-13 20:17:15 -07:00
|
|
|
|
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2014-08-19 13:57:11 -07:00
|
|
|
|
if (inst->dst.file == GRF) {
|
|
|
|
|
|
reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
|
|
|
|
|
|
inst->dst.reg = new_virtual_grf[reg];
|
|
|
|
|
|
inst->dst.reg_offset = new_reg_offset[reg];
|
2015-02-10 15:51:34 +02:00
|
|
|
|
assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2014-08-19 13:57:11 -07:00
|
|
|
|
if (inst->src[i].file == GRF) {
|
|
|
|
|
|
reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
|
|
|
|
|
|
inst->src[i].reg = new_virtual_grf[reg];
|
|
|
|
|
|
inst->src[i].reg_offset = new_reg_offset[reg];
|
2015-02-10 15:51:34 +02:00
|
|
|
|
assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
|
2014-08-19 13:57:11 -07:00
|
|
|
|
}
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2014-09-01 10:54:00 -07:00
|
|
|
|
invalidate_live_intervals();
|
2010-10-13 20:17:15 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2012-11-01 22:04:50 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Remove unused virtual GRFs and compact the virtual_grf_* arrays.
|
|
|
|
|
|
*
|
|
|
|
|
|
* During code generation, we create tons of temporary variables, many of
|
|
|
|
|
|
* which get immediately killed and are never used again. Yet, in later
|
|
|
|
|
|
* optimization and analysis passes, such as compute_live_intervals, we need
|
|
|
|
|
|
* to loop over all the virtual GRFs. Compacting them can save a lot of
|
|
|
|
|
|
* overhead.
|
|
|
|
|
|
*/
|
2014-09-16 13:14:09 -07:00
|
|
|
|
bool
|
2012-11-01 22:04:50 -07:00
|
|
|
|
fs_visitor::compact_virtual_grfs()
|
|
|
|
|
|
{
|
2014-09-16 13:14:09 -07:00
|
|
|
|
bool progress = false;
|
2015-02-10 15:51:34 +02:00
|
|
|
|
int remap_table[this->alloc.count];
|
2012-11-01 22:04:50 -07:00
|
|
|
|
memset(remap_table, -1, sizeof(remap_table));
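   /* Sketch of the remapping (hypothetical): with four VGRFs of which
    * only 0 and 2 are referenced, remap_table ends up {0, -1, 1, -1} and
    * alloc.count drops to 2; unused entries keep -1 so the delta_xy
    * patching below can detect them.
    */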
|
|
|
|
|
|
|
2014-08-19 16:11:36 -07:00
|
|
|
|
/* Mark which virtual GRFs are used. */
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst(block, const fs_inst, inst, cfg) {
|
2012-11-01 22:04:50 -07:00
|
|
|
|
if (inst->dst.file == GRF)
|
|
|
|
|
|
remap_table[inst->dst.reg] = 0;
|
|
|
|
|
|
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2012-11-01 22:04:50 -07:00
|
|
|
|
if (inst->src[i].file == GRF)
|
|
|
|
|
|
remap_table[inst->src[i].reg] = 0;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Compact the GRF arrays. */
|
|
|
|
|
|
int new_index = 0;
|
2015-02-10 15:51:34 +02:00
|
|
|
|
for (unsigned i = 0; i < this->alloc.count; i++) {
|
2014-09-16 13:14:09 -07:00
|
|
|
|
if (remap_table[i] == -1) {
|
|
|
|
|
|
/* We just found an unused register. This means that we are
|
|
|
|
|
|
* actually going to compact something.
|
|
|
|
|
|
*/
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
} else {
|
2012-11-01 22:04:50 -07:00
|
|
|
|
remap_table[i] = new_index;
|
2015-02-10 15:51:34 +02:00
|
|
|
|
alloc.sizes[new_index] = alloc.sizes[i];
|
2014-09-01 10:54:00 -07:00
|
|
|
|
invalidate_live_intervals();
|
2012-11-01 22:04:50 -07:00
|
|
|
|
++new_index;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-02-10 15:51:34 +02:00
|
|
|
|
this->alloc.count = new_index;
|
2012-11-01 22:04:50 -07:00
|
|
|
|
|
|
|
|
|
|
/* Patch all the instructions to use the newly renumbered registers */
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2012-11-01 22:04:50 -07:00
|
|
|
|
if (inst->dst.file == GRF)
|
|
|
|
|
|
inst->dst.reg = remap_table[inst->dst.reg];
|
|
|
|
|
|
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2012-11-01 22:04:50 -07:00
|
|
|
|
if (inst->src[i].file == GRF)
|
|
|
|
|
|
inst->src[i].reg = remap_table[inst->src[i].reg];
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2014-08-10 19:03:34 -07:00
|
|
|
|
|
2015-04-06 17:44:40 -07:00
|
|
|
|
/* Patch all the references to delta_xy, since they're used in register
|
|
|
|
|
|
* allocation. If they're unused, switch them to BAD_FILE so we don't
|
|
|
|
|
|
* think some random VGRF is delta_xy.
|
2014-08-10 19:03:34 -07:00
|
|
|
|
*/
|
2015-04-06 17:44:40 -07:00
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
|
|
|
|
|
|
if (delta_xy[i].file == GRF) {
|
|
|
|
|
|
if (remap_table[delta_xy[i].reg] != -1) {
|
|
|
|
|
|
delta_xy[i].reg = remap_table[delta_xy[i].reg];
|
2014-09-12 17:45:30 -07:00
|
|
|
|
} else {
|
2015-04-06 17:44:40 -07:00
|
|
|
|
delta_xy[i].file = BAD_FILE;
|
2014-09-12 17:45:30 -07:00
|
|
|
|
}
|
2014-08-10 19:03:34 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2014-09-16 13:14:09 -07:00
|
|
|
|
|
|
|
|
|
|
return progress;
|
2012-11-01 22:04:50 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2012-11-08 16:06:24 -08:00
|
|
|
|
/*
|
|
|
|
|
|
* Implements array access of uniforms by inserting a
|
|
|
|
|
|
* PULL_CONSTANT_LOAD instruction.
|
|
|
|
|
|
*
|
|
|
|
|
|
 * Unlike temporary GRF array access (which we don't support, due to
|
|
|
|
|
|
* the difficulty of doing relative addressing on instruction
|
|
|
|
|
|
* destinations), we could potentially do array access of uniforms
|
|
|
|
|
|
* that were loaded in GRF space as push constants. In real-world
|
|
|
|
|
|
* usage we've seen, though, the arrays being used are always larger
|
|
|
|
|
|
* than we could load as push constants, so just always move all
|
|
|
|
|
|
* uniform array access out to a pull constant buffer.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::move_uniform_array_access_to_pull_constants()
|
|
|
|
|
|
{
|
2014-03-07 16:10:50 -08:00
|
|
|
|
if (dispatch_width != 8)
|
|
|
|
|
|
return;
|
|
|
|
|
|
|
2014-03-07 02:10:14 -08:00
|
|
|
|
pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
|
2014-08-24 21:51:28 -07:00
|
|
|
|
memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
|
2012-11-08 16:06:24 -08:00
|
|
|
|
|
|
|
|
|
|
/* Walk through and find array access of uniforms. Put a copy of that
|
|
|
|
|
|
* uniform in the pull constant buffer.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Note that we don't move constant-indexed accesses to arrays. No
|
|
|
|
|
|
* testing has been done of the performance impact of this choice.
|
|
|
|
|
|
*/
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
2014-03-17 10:39:43 -07:00
|
|
|
|
      for (int i = 0; i < inst->sources; i++) {
|
2012-11-08 16:06:24 -08:00
|
|
|
|
if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
int uniform = inst->src[i].reg;
|
|
|
|
|
|
|
|
|
|
|
|
/* If this array isn't already present in the pull constant buffer,
|
|
|
|
|
|
* add it.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (pull_constant_loc[uniform] == -1) {
|
2014-08-11 12:21:44 +01:00
|
|
|
|
const gl_constant_value **values = &stage_prog_data->param[uniform];
|
2012-11-08 16:06:24 -08:00
|
|
|
|
|
|
|
|
|
|
assert(param_size[uniform]);
|
|
|
|
|
|
|
|
|
|
|
|
for (int j = 0; j < param_size[uniform]; j++) {
|
2014-03-07 15:45:13 -08:00
|
|
|
|
pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
|
|
|
|
|
|
|
2014-02-19 15:14:02 +01:00
|
|
|
|
stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
|
2012-11-08 16:06:24 -08:00
|
|
|
|
values[j];
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2010-10-22 12:57:00 -07:00
|
|
|
|
/**
|
2014-03-11 14:35:27 -07:00
|
|
|
|
* Assign UNIFORM file registers to either push constants or pull constants.
|
2010-10-22 12:57:00 -07:00
|
|
|
|
*
|
|
|
|
|
|
 * We allow a fragment shader to use more uniform components than the
|
|
|
|
|
|
 * GL spec's required minimum for the maximum (64 components). If
|
|
|
|
|
|
 * there are too many of these, they'd fill up all of the register space.
|
|
|
|
|
|
* So, this will push some of them out to the pull constant buffer and
|
|
|
|
|
|
* update the program to load them.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
2014-03-11 14:35:27 -07:00
|
|
|
|
fs_visitor::assign_constant_locations()
|
2010-10-22 12:57:00 -07:00
|
|
|
|
{
|
2014-03-11 14:35:27 -07:00
|
|
|
|
/* Only the first compile (SIMD8 mode) gets to decide on locations. */
|
|
|
|
|
|
if (dispatch_width != 8)
|
2010-10-22 12:57:00 -07:00
|
|
|
|
return;
|
|
|
|
|
|
|
2014-03-11 14:35:27 -07:00
|
|
|
|
/* Find which UNIFORM registers are still in use. */
|
|
|
|
|
|
bool is_live[uniforms];
|
|
|
|
|
|
for (unsigned int i = 0; i < uniforms; i++) {
|
|
|
|
|
|
is_live[i] = false;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2014-03-11 14:35:27 -07:00
|
|
|
|
if (inst->src[i].file != UNIFORM)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
|
|
|
|
|
|
if (constant_nr >= 0 && constant_nr < (int) uniforms)
|
|
|
|
|
|
is_live[constant_nr] = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Only allow 16 registers (128 uniform components) as push constants.
|
|
|
|
|
|
*
|
|
|
|
|
|
* Just demote the end of the list. We could probably do better
|
2010-10-22 12:57:00 -07:00
|
|
|
|
* here, demoting things that are rarely used in the program first.
|
2014-05-19 08:51:12 -07:00
|
|
|
|
*
|
|
|
|
|
|
* If changing this value, note the limitation about total_regs in
|
|
|
|
|
|
* brw_curbe.c.
|
2010-10-22 12:57:00 -07:00
|
|
|
|
*/
|
2014-03-11 14:35:27 -07:00
|
|
|
|
unsigned int max_push_components = 16 * 8;
|
|
|
|
|
|
unsigned int num_push_constants = 0;
|
2012-11-08 16:06:24 -08:00
|
|
|
|
|
2014-03-11 14:35:27 -07:00
|
|
|
|
push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
|
|
|
|
|
|
|
2014-02-19 15:27:01 +01:00
|
|
|
|
for (unsigned int i = 0; i < uniforms; i++) {
|
2014-03-11 14:35:27 -07:00
|
|
|
|
if (!is_live[i] || pull_constant_loc[i] != -1) {
|
|
|
|
|
|
/* This UNIFORM register is either dead, or has already been demoted
|
|
|
|
|
|
* to a pull const. Mark it as no longer living in the param[] array.
|
|
|
|
|
|
*/
|
|
|
|
|
|
push_constant_loc[i] = -1;
|
|
|
|
|
|
continue;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (num_push_constants < max_push_components) {
|
|
|
|
|
|
         /* Retain as a push constant. Record the location in the param[]
|
|
|
|
|
|
* array.
|
|
|
|
|
|
*/
|
|
|
|
|
|
push_constant_loc[i] = num_push_constants++;
|
2012-11-08 16:06:24 -08:00
|
|
|
|
} else {
|
2014-03-11 14:35:27 -07:00
|
|
|
|
/* Demote to a pull constant. */
|
|
|
|
|
|
push_constant_loc[i] = -1;
|
|
|
|
|
|
|
2014-03-11 22:24:39 -07:00
|
|
|
|
int pull_index = stage_prog_data->nr_pull_params++;
|
|
|
|
|
|
stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
|
|
|
|
|
|
pull_constant_loc[i] = pull_index;
|
2012-11-08 16:06:24 -08:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2014-03-11 14:35:27 -07:00
|
|
|
|
|
|
|
|
|
|
stage_prog_data->nr_params = num_push_constants;
|
|
|
|
|
|
|
|
|
|
|
|
/* Up until now, the param[] array has been indexed by reg + reg_offset
|
|
|
|
|
|
* of UNIFORM registers. Condense it to only contain the uniforms we
|
|
|
|
|
|
* chose to upload as push constants.
|
|
|
|
|
|
*/
|
|
|
|
|
|
for (unsigned int i = 0; i < uniforms; i++) {
|
|
|
|
|
|
int remapped = push_constant_loc[i];
|
|
|
|
|
|
|
|
|
|
|
|
if (remapped == -1)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
2014-03-24 11:16:38 -07:00
|
|
|
|
assert(remapped <= (int)i);
|
2014-03-11 14:35:27 -07:00
|
|
|
|
stage_prog_data->param[remapped] = stage_prog_data->param[i];
|
|
|
|
|
|
}
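   /* e.g. (hypothetical): with push_constant_loc = {0, -1, 1}, param[2]
    * slides down into param[1], the old param[1] is dropped, and
    * nr_params was set to 2 above.
    */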
|
2014-03-10 13:14:03 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
|
* Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
|
|
|
|
|
|
* or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
2014-03-11 22:24:39 -07:00
|
|
|
|
fs_visitor::demote_pull_constants()
|
2014-03-10 13:14:03 -07:00
|
|
|
|
{
|
2014-07-12 21:18:39 -07:00
|
|
|
|
foreach_block_and_inst (block, fs_inst, inst, cfg) {
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
2010-10-22 12:57:00 -07:00
|
|
|
|
if (inst->src[i].file != UNIFORM)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
2015-03-10 11:36:43 +01:00
|
|
|
|
int pull_index;
|
|
|
|
|
|
unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
|
|
|
|
|
|
if (location >= uniforms) /* Out of bounds access */
|
|
|
|
|
|
pull_index = -1;
|
|
|
|
|
|
else
|
|
|
|
|
|
pull_index = pull_constant_loc[location];
|
|
|
|
|
|
|
2012-11-08 16:06:24 -08:00
|
|
|
|
if (pull_index == -1)
|
2010-10-22 12:57:00 -07:00
|
|
|
|
continue;
|
|
|
|
|
|
|
2014-03-10 13:14:03 -07:00
|
|
|
|
         /* Set up the annotation tracking for newly generated instructions. */
|
|
|
|
|
|
base_ir = inst->ir;
|
|
|
|
|
|
current_annotation = inst->annotation;
|
|
|
|
|
|
|
|
|
|
|
|
fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
|
2014-05-16 02:21:51 -07:00
|
|
|
|
fs_reg dst = vgrf(glsl_type::float_type);
|
2012-11-08 16:06:24 -08:00
|
|
|
|
|
2014-03-10 13:14:03 -07:00
|
|
|
|
/* Generate a pull load into dst. */
|
|
|
|
|
|
if (inst->src[i].reladdr) {
|
|
|
|
|
|
exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
|
|
|
|
|
|
surf_index,
|
|
|
|
|
|
*inst->src[i].reladdr,
|
|
|
|
|
|
pull_index);
|
2014-07-12 21:18:39 -07:00
|
|
|
|
inst->insert_before(block, &list);
|
2014-03-10 13:14:03 -07:00
|
|
|
|
inst->src[i].reladdr = NULL;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
|
|
|
|
|
|
fs_inst *pull =
|
2014-08-14 13:56:24 -07:00
|
|
|
|
new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
|
2014-03-10 13:14:03 -07:00
|
|
|
|
dst, surf_index, offset);
|
2014-07-12 21:18:39 -07:00
|
|
|
|
inst->insert_before(block, pull);
|
2014-03-10 13:14:03 -07:00
|
|
|
|
inst->src[i].set_smear(pull_index & 3);
|
|
|
|
|
|
}
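         /* e.g. (sketch): pull_index = 6 loads the 16-byte-aligned block at
          * byte offset (6 * 4) & ~15 = 16 and then smears component
          * 6 & 3 = 2 of the loaded vec4 across the register.
          */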
|
2010-10-22 12:57:00 -07:00
|
|
|
|
|
2014-03-10 13:14:03 -07:00
|
|
|
|
/* Rewrite the instruction to use the temporary VGRF. */
|
|
|
|
|
|
inst->src[i].file = GRF;
|
|
|
|
|
|
inst->src[i].reg = dst.reg;
|
|
|
|
|
|
inst->src[i].reg_offset = 0;
|
2014-08-18 14:27:55 -07:00
|
|
|
|
inst->src[i].width = dispatch_width;
|
2010-10-22 12:57:00 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
2014-09-01 10:54:00 -07:00
|
|
|
|
invalidate_live_intervals();
|
2010-10-22 12:57:00 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2011-07-22 16:45:15 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::opt_algebraic()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2011-07-22 16:45:15 -07:00
|
|
|
|
switch (inst->opcode) {
|
2014-12-21 06:56:54 -08:00
|
|
|
|
case BRW_OPCODE_MOV:
|
|
|
|
|
|
if (inst->src[0].file != IMM)
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->saturate) {
|
|
|
|
|
|
if (inst->dst.type != inst->src[0].type)
|
|
|
|
|
|
assert(!"unimplemented: saturate mixed types");
|
|
|
|
|
|
|
|
|
|
|
|
if (brw_saturate_immediate(inst->dst.type,
|
|
|
|
|
|
&inst->src[0].fixed_hw_reg)) {
|
|
|
|
|
|
inst->saturate = false;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
2011-07-22 16:45:15 -07:00
|
|
|
|
case BRW_OPCODE_MUL:
|
|
|
|
|
|
if (inst->src[1].file != IMM)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
/* a * 1.0 = a */
|
2012-11-17 15:10:53 -08:00
|
|
|
|
if (inst->src[1].is_one()) {
|
2011-07-22 16:45:15 -07:00
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-02-04 18:08:30 -08:00
|
|
|
|
/* a * -1.0 = -a */
|
|
|
|
|
|
if (inst->src[1].is_negative_one()) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[0].negate = !inst->src[0].negate;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2012-09-20 11:06:07 +02:00
|
|
|
|
/* a * 0.0 = 0.0 */
|
2012-11-17 15:10:53 -08:00
|
|
|
|
if (inst->src[1].is_zero()) {
|
2012-09-20 11:06:07 +02:00
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
2012-11-17 15:10:53 -08:00
|
|
|
|
inst->src[0] = inst->src[1];
|
2012-09-20 11:06:07 +02:00
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-11-09 17:27:52 -08:00
|
|
|
|
if (inst->src[0].file == IMM) {
|
|
|
|
|
|
assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2011-05-03 10:55:50 -07:00
|
|
|
|
break;
|
2012-09-20 11:06:07 +02:00
|
|
|
|
case BRW_OPCODE_ADD:
|
|
|
|
|
|
if (inst->src[1].file != IMM)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
/* a + 0.0 = a */
|
2012-11-17 15:10:53 -08:00
|
|
|
|
if (inst->src[1].is_zero()) {
|
2012-09-20 11:06:07 +02:00
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2014-11-09 17:27:52 -08:00
|
|
|
|
|
|
|
|
|
|
if (inst->src[0].file == IMM) {
|
|
|
|
|
|
assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2012-09-20 11:06:07 +02:00
|
|
|
|
break;
|
2013-10-27 19:34:48 -07:00
|
|
|
|
case BRW_OPCODE_OR:
|
|
|
|
|
|
if (inst->src[0].equals(inst->src[1])) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
2014-01-09 20:57:36 -08:00
|
|
|
|
case BRW_OPCODE_LRP:
|
|
|
|
|
|
if (inst->src[1].equals(inst->src[2])) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[0] = inst->src[1];
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
inst->src[2] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
2015-01-05 13:51:03 -08:00
|
|
|
|
case BRW_OPCODE_CMP:
|
|
|
|
|
|
if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
|
|
|
|
|
|
inst->src[0].abs &&
|
|
|
|
|
|
inst->src[0].negate &&
|
|
|
|
|
|
inst->src[1].is_zero()) {
|
|
|
|
|
|
inst->src[0].abs = false;
|
|
|
|
|
|
inst->src[0].negate = false;
|
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_Z;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
2013-10-27 20:03:48 -07:00
|
|
|
|
case BRW_OPCODE_SEL:
|
2014-04-18 10:01:41 -07:00
|
|
|
|
if (inst->src[0].equals(inst->src[1])) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
inst->predicate = BRW_PREDICATE_NONE;
|
|
|
|
|
|
inst->predicate_inverse = false;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
} else if (inst->saturate && inst->src[1].file == IMM) {
|
2013-10-27 20:03:48 -07:00
|
|
|
|
switch (inst->conditional_mod) {
|
|
|
|
|
|
case BRW_CONDITIONAL_LE:
|
|
|
|
|
|
case BRW_CONDITIONAL_L:
|
|
|
|
|
|
switch (inst->src[1].type) {
|
|
|
|
|
|
case BRW_REGISTER_TYPE_F:
|
2014-06-29 15:13:24 -07:00
|
|
|
|
if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
|
2013-10-27 20:03:48 -07:00
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
2015-02-10 21:36:26 -08:00
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NONE;
|
2013-10-27 20:03:48 -07:00
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
2013-10-27 21:26:36 -07:00
|
|
|
|
case BRW_CONDITIONAL_GE:
|
|
|
|
|
|
case BRW_CONDITIONAL_G:
|
|
|
|
|
|
switch (inst->src[1].type) {
|
|
|
|
|
|
case BRW_REGISTER_TYPE_F:
|
2014-06-29 15:13:24 -07:00
|
|
|
|
if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
|
2013-10-27 21:26:36 -07:00
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
inst->conditional_mod = BRW_CONDITIONAL_NONE;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2013-10-27 20:03:48 -07:00
|
|
|
|
default:
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
2014-11-09 17:27:52 -08:00
|
|
|
|
case BRW_OPCODE_MAD:
|
|
|
|
|
|
if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MOV;
|
|
|
|
|
|
inst->src[1] = reg_undef;
|
|
|
|
|
|
inst->src[2] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
} else if (inst->src[0].is_zero()) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_MUL;
|
|
|
|
|
|
inst->src[0] = inst->src[2];
|
|
|
|
|
|
inst->src[2] = reg_undef;
|
2015-03-16 10:08:08 +02:00
|
|
|
|
progress = true;
|
2014-11-09 17:27:52 -08:00
|
|
|
|
} else if (inst->src[1].is_one()) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_ADD;
|
|
|
|
|
|
inst->src[1] = inst->src[2];
|
|
|
|
|
|
inst->src[2] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
} else if (inst->src[2].is_one()) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_ADD;
|
|
|
|
|
|
inst->src[2] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
} else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
|
|
|
|
|
|
inst->opcode = BRW_OPCODE_ADD;
|
|
|
|
|
|
inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
|
|
|
|
|
|
inst->src[2] = reg_undef;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
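         /* e.g. (sketch): since MAD computes src0 + src1 * src2, a MAD of
          * (x, 2.0f, 3.0f) with both factors immediate folds to ADD x, 6.0f.
          */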
|
|
|
|
|
|
break;
|
2014-09-27 10:34:56 -07:00
|
|
|
|
case SHADER_OPCODE_RCP: {
|
|
|
|
|
|
fs_inst *prev = (fs_inst *)inst->prev;
|
|
|
|
|
|
         if (!prev->is_head_sentinel() && prev->opcode == SHADER_OPCODE_SQRT) {
|
|
|
|
|
|
if (inst->src[0].equals(prev->dst)) {
|
|
|
|
|
|
inst->opcode = SHADER_OPCODE_RSQ;
|
|
|
|
|
|
inst->src[0] = prev->src[0];
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2011-05-03 10:55:50 -07:00
|
|
|
|
default:
|
2011-07-22 16:45:15 -07:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-03-16 10:08:08 +02:00
|
|
|
|
      /* Two-source instructions can only take an immediate in src[1], so if
       * an optimization above left one in src[0] of a commutative operation,
       * swap the operands. */
|
|
|
|
|
|
if (progress && inst->is_commutative()) {
|
|
|
|
|
|
if (inst->src[0].file == IMM) {
|
|
|
|
|
|
fs_reg tmp = inst->src[1];
|
|
|
|
|
|
inst->src[1] = inst->src[0];
|
|
|
|
|
|
inst->src[0] = tmp;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
2011-07-22 16:45:15 -07:00
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-04-23 16:56:53 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Optimize sample messages that have constant zero values for the trailing
|
|
|
|
|
|
* texture coordinates. We can just reduce the message length for these
|
|
|
|
|
|
* instructions instead of reserving a register for it. Trailing parameters
|
|
|
|
|
|
* that aren't sent default to zero anyway. This will cause the dead code
|
|
|
|
|
|
* eliminator to remove the MOV instruction that would otherwise be emitted to
|
|
|
|
|
|
* set up the zero value.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::opt_zero_samples()
|
|
|
|
|
|
{
|
|
|
|
|
|
   /* Gen4 infers the texturing opcode based on the message length, so we can't
|
|
|
|
|
|
* change it.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (devinfo->gen < 5)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
|
|
|
|
|
if (!inst->is_tex())
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
fs_inst *load_payload = (fs_inst *) inst->prev;
|
|
|
|
|
|
|
|
|
|
|
|
if (load_payload->is_head_sentinel() ||
|
|
|
|
|
|
load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
|
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
/* We don't want to remove the message header. Removing all of the
|
|
|
|
|
|
       * parameters is also avoided, because that seems to cause a GPU hang, and I
|
|
|
|
|
|
* can't find any documentation indicating that this is expected.
|
|
|
|
|
|
*/
|
|
|
|
|
|
while (inst->mlen > inst->header_present + dispatch_width / 8 &&
|
|
|
|
|
|
load_payload->src[(inst->mlen - inst->header_present) /
|
|
|
|
|
|
(dispatch_width / 8) +
|
|
|
|
|
|
inst->header_present - 1].is_zero()) {
|
|
|
|
|
|
inst->mlen -= dispatch_width / 8;
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
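      /* Index sketch (hypothetical SIMD8 message): with dispatch_width = 8,
       * header_present = 1 and mlen = 5, each parameter is one register, so
       * the trailing parameter is load_payload->src[(5 - 1) / 1 + 1 - 1] =
       * src[4]; if that is_zero(), mlen drops to 4 and src[3] is tested next.
       */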
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
|
invalidate_live_intervals();
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2015-02-08 13:59:57 -08:00
|
|
|
|
/**
|
|
|
|
|
|
* Optimize sample messages which are followed by the final RT write.
|
|
|
|
|
|
*
|
|
|
|
|
|
 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
|
|
|
|
|
|
* results sent directly to the framebuffer, bypassing the EU. Recognize the
|
|
|
|
|
|
* final texturing results copied to the framebuffer write payload and modify
|
|
|
|
|
|
* them to write to the framebuffer directly.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::opt_sampler_eot()
|
|
|
|
|
|
{
|
|
|
|
|
|
brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
|
|
|
|
|
|
|
2015-04-28 14:20:06 +01:00
|
|
|
|
if (stage != MESA_SHADER_FRAGMENT)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
2015-04-15 18:00:05 -07:00
|
|
|
|
if (devinfo->gen < 9 && !devinfo->is_cherryview)
|
2015-02-08 13:59:57 -08:00
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
/* FINISHME: It should be possible to implement this optimization when there
|
|
|
|
|
|
* are multiple drawbuffers.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (key->nr_color_regions != 1)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
/* Look for a texturing instruction immediately before the final FB_WRITE. */
|
|
|
|
|
|
fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
|
|
|
|
|
|
assert(fb_write->eot);
|
|
|
|
|
|
assert(fb_write->opcode == FS_OPCODE_FB_WRITE);
|
|
|
|
|
|
|
|
|
|
|
|
fs_inst *tex_inst = (fs_inst *) fb_write->prev;
|
|
|
|
|
|
|
|
|
|
|
|
/* There wasn't one; nothing to do. */
|
|
|
|
|
|
if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
/* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
|
|
|
|
|
|
* It's very likely to be the previous instruction.
|
|
|
|
|
|
*/
|
|
|
|
|
|
fs_inst *load_payload = (fs_inst *) tex_inst->prev;
|
|
|
|
|
|
if (load_payload->is_head_sentinel() ||
|
|
|
|
|
|
load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
assert(!tex_inst->eot); /* We can't get here twice */
|
|
|
|
|
|
assert((tex_inst->offset & (0xff << 24)) == 0);
|
|
|
|
|
|
|
|
|
|
|
|
tex_inst->offset |= fb_write->target << 24;
|
|
|
|
|
|
tex_inst->eot = true;
|
|
|
|
|
|
fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
|
|
|
|
|
|
|
|
|
|
|
|
/* If a header is present, marking the eot is sufficient. Otherwise, we need
|
|
|
|
|
|
* to create a new LOAD_PAYLOAD command with the same sources and a space
|
|
|
|
|
|
* saved for the header. Using a new destination register not only makes sure
|
|
|
|
|
|
* we have enough space, but it will make sure the dead code eliminator kills
|
|
|
|
|
|
* the instruction that this will replace.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (tex_inst->header_present)
|
|
|
|
|
|
return true;
|
|
|
|
|
|
|
|
|
|
|
|
fs_reg send_header = vgrf(load_payload->sources + 1);
|
|
|
|
|
|
fs_reg *new_sources =
|
|
|
|
|
|
ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
|
|
|
|
|
|
|
|
|
|
|
|
new_sources[0] = fs_reg();
|
|
|
|
|
|
for (int i = 0; i < load_payload->sources; i++)
|
|
|
|
|
|
new_sources[i+1] = load_payload->src[i];
|
|
|
|
|
|
|
|
|
|
|
|
/* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
|
|
|
|
|
|
* requires a lot of information about the sources to appropriately figure
|
|
|
|
|
|
    * out how many registers need to be used. Given this stage in our
|
|
|
|
|
|
* optimization, we may not have the appropriate GRFs required by
|
|
|
|
|
|
* LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
|
|
|
|
|
|
* manually emit the instruction.
|
|
|
|
|
|
*/
|
|
|
|
|
|
fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
|
|
|
|
|
|
load_payload->exec_size,
|
|
|
|
|
|
send_header,
|
|
|
|
|
|
new_sources,
|
|
|
|
|
|
load_payload->sources + 1);
|
|
|
|
|
|
|
|
|
|
|
|
new_load_payload->regs_written = load_payload->regs_written + 1;
|
|
|
|
|
|
tex_inst->mlen++;
|
|
|
|
|
|
tex_inst->header_present = true;
|
|
|
|
|
|
tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
|
|
|
|
|
|
tex_inst->src[0] = send_header;
|
|
|
|
|
|
tex_inst->dst = reg_null_ud;
|
|
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-04-14 15:01:37 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::opt_register_renaming()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
int depth = 0;
|
|
|
|
|
|
|
2015-02-10 15:51:34 +02:00
|
|
|
|
int remap[alloc.count];
|
|
|
|
|
|
memset(remap, -1, sizeof(int) * alloc.count);
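   /* remap[r] == -1 means VGRF r has not been fully defined yet;
    * remap[r] == r means it has one complete top-level definition, so a
    * second complete overwrite below renames it to a fresh VGRF.
    */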
|
2014-04-14 15:01:37 -07:00
|
|
|
|
|
2014-09-01 13:35:04 -07:00
|
|
|
|
foreach_block_and_inst(block, fs_inst, inst, cfg) {
|
2014-04-14 15:01:37 -07:00
|
|
|
|
if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
|
|
|
|
|
|
depth++;
|
|
|
|
|
|
} else if (inst->opcode == BRW_OPCODE_ENDIF ||
|
|
|
|
|
|
inst->opcode == BRW_OPCODE_WHILE) {
|
|
|
|
|
|
depth--;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Rewrite instruction sources. */
|
|
|
|
|
|
for (int i = 0; i < inst->sources; i++) {
|
|
|
|
|
|
if (inst->src[i].file == GRF &&
|
|
|
|
|
|
remap[inst->src[i].reg] != -1 &&
|
|
|
|
|
|
remap[inst->src[i].reg] != inst->src[i].reg) {
|
|
|
|
|
|
inst->src[i].reg = remap[inst->src[i].reg];
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
const int dst = inst->dst.reg;
|
|
|
|
|
|
|
|
|
|
|
|
if (depth == 0 &&
|
|
|
|
|
|
inst->dst.file == GRF &&
|
2015-02-10 15:51:34 +02:00
|
|
|
|
alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
|
2014-04-14 15:01:37 -07:00
|
|
|
|
!inst->is_partial_write()) {
|
|
|
|
|
|
if (remap[dst] == -1) {
|
|
|
|
|
|
remap[dst] = dst;
|
|
|
|
|
|
} else {
|
2015-02-10 15:51:34 +02:00
|
|
|
|
remap[dst] = alloc.allocate(inst->dst.width / 8);
|
2014-04-14 15:01:37 -07:00
|
|
|
|
inst->dst.reg = remap[dst];
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
} else if (inst->dst.file == GRF &&
|
|
|
|
|
|
remap[dst] != -1 &&
|
|
|
|
|
|
remap[dst] != dst) {
|
|
|
|
|
|
inst->dst.reg = remap[dst];
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress) {
|
2014-09-01 10:54:00 -07:00
|
|
|
|
invalidate_live_intervals();
|
2014-04-14 15:01:37 -07:00
|
|
|
|
|
2015-04-06 17:44:40 -07:00
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
|
|
|
|
|
|
if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
|
|
|
|
|
|
delta_xy[i].reg = remap[delta_xy[i].reg];
|
2014-04-14 15:01:37 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2014-07-05 22:10:41 -07:00
|
|
|
|
/**
|
|
|
|
|
|
* Remove redundant or useless discard jumps.
|
|
|
|
|
|
*
|
|
|
|
|
|
* For example, we can eliminate jumps in the following sequence:
|
|
|
|
|
|
*
|
|
|
|
|
|
* discard-jump (redundant with the next jump)
|
|
|
|
|
|
* discard-jump (useless; jumps to the next instruction)
|
|
|
|
|
|
* placeholder-halt
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::opt_redundant_discard_jumps()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
|
|
|
|
|
|
|
|
|
|
|
bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
|
|
|
|
|
|
|
|
|
|
|
|
fs_inst *placeholder_halt = NULL;
|
|
|
|
|
|
foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
|
|
|
|
|
|
if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
|
|
|
|
|
|
placeholder_halt = inst;
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (!placeholder_halt)
|
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
|
|
/* Delete any HALTs immediately before the placeholder halt. */
|
|
|
|
|
|
for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
|
|
|
|
|
|
!prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
|
|
|
|
|
|
prev = (fs_inst *) placeholder_halt->prev) {
|
|
|
|
|
|
prev->remove(last_bblock);
|
|
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (progress)
|
|
|
|
|
|
invalidate_live_intervals();
|
|
|
|
|
|
|
|
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2010-10-08 14:00:14 -07:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::compute_to_mrf()
|
|
|
|
|
|
{
|
|
|
|
|
|
bool progress = false;
|
2014-07-15 12:56:37 -07:00
|
|
|
|
int next_ip = 0;
|
2010-10-08 14:00:14 -07:00
|
|
|
|
|
2014-10-29 14:21:14 -07:00
|
|
|
|
/* No MRFs on Gen >= 7. */
|
2015-04-15 18:00:05 -07:00
|
|
|
|
if (devinfo->gen >= 7)
|
2014-10-29 14:21:14 -07:00
|
|
|
|
return false;
|
|
|
|
|
|
|
2011-01-12 10:10:01 -08:00
|
|
|
|
calculate_live_intervals();
|
|
|
|
|
|
|
2014-09-03 23:52:26 -07:00
|
|
|
|
foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
|
2010-10-08 14:00:14 -07:00
|
|
|
|
int ip = next_ip;
|
|
|
|
|
|
next_ip++;
|
|
|
|
|
|
|
|
|
|
|
|
if (inst->opcode != BRW_OPCODE_MOV ||
|
2013-04-11 09:54:41 -07:00
|
|
|
|
inst->is_partial_write() ||
|
2010-10-08 14:00:14 -07:00
|
|
|
|
inst->dst.file != MRF || inst->src[0].file != GRF ||
|
|
|
|
|
|
inst->dst.type != inst->src[0].type ||
|
2013-12-08 04:57:08 +01:00
|
|
|
|
inst->src[0].abs || inst->src[0].negate ||
|
2014-01-15 22:21:30 +01:00
|
|
|
|
!inst->src[0].is_contiguous() ||
|
2014-07-15 12:56:37 -07:00
|
|
|
|
inst->src[0].subreg_offset)
|
2010-10-08 14:00:14 -07:00
|
|
|
|
continue;
|
|
|
|
|
|
|
2011-03-28 16:54:39 -07:00
|
|
|
|
/* Work out which hardware MRF registers are written by this
|
|
|
|
|
|
* instruction.
|
|
|
|
|
|
*/
|
2011-05-15 09:36:19 -07:00
|
|
|
|
int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
|
2011-03-28 16:54:39 -07:00
|
|
|
|
int mrf_high;
|
2011-05-15 09:36:19 -07:00
|
|
|
|
if (inst->dst.reg & BRW_MRF_COMPR4) {
|
2011-03-28 16:54:39 -07:00
|
|
|
|
mrf_high = mrf_low + 4;
|
2014-08-16 11:34:56 -07:00
|
|
|
|
} else if (inst->exec_size == 16) {
|
2011-03-28 16:54:39 -07:00
|
|
|
|
mrf_high = mrf_low + 1;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
mrf_high = mrf_low;
|
|
|
|
|
|
}
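      /* Range sketch: a SIMD16 MOV to m3 covers m3..m4 (mrf_low = 3,
       * mrf_high = 4), while a BRW_MRF_COMPR4 write to m3 lands in m3 and
       * m7, which is why mrf_high jumps to mrf_low + 4 in that case.
       */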
|
|
|
|
|
|
|
2010-10-08 14:00:14 -07:00
|
|
|
|
/* Can't compute-to-MRF this GRF if someone else was going to
|
|
|
|
|
|
* read it later.
|
|
|
|
|
|
*/
|
2013-04-30 15:00:40 -07:00
|
|
|
|
if (this->virtual_grf_end[inst->src[0].reg] > ip)
|
2010-10-08 14:00:14 -07:00
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
|
|
      /* Found a move of a GRF to an MRF. Let's see if we can
|
|
|
|
|
|
       * rewrite the instruction that produced this GRF to write into the MRF.
|
|
|
|
|
|
*/
|
2014-09-02 10:08:24 -07:00
|
|
|
|
foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
|
2010-10-08 14:00:14 -07:00
|
|
|
|
if (scan_inst->dst.file == GRF &&
|
|
|
|
|
|
scan_inst->dst.reg == inst->src[0].reg) {
|
|
|
|
|
|
            /* Found the last instruction that wrote the register we want to
|
|
|
|
|
|
             * turn into a compute-to-MRF.
|
|
|
|
|
|
*/
|
|
|
|
|
|
|
2012-06-04 08:59:00 -07:00
|
|
|
|
/* If this one instruction didn't populate all the
|
|
|
|
|
|
* channels, bail. We might be able to rewrite everything
|
2011-03-28 16:54:39 -07:00
|
|
|
|
* that writes that reg, but it would require smarter
|
|
|
|
|
|
* tracking to delay the rewriting until complete success.
|
2010-10-08 14:00:14 -07:00
|
|
|
|
*/
|
2012-06-04 08:59:00 -07:00
|
|
|
|
if (scan_inst->is_partial_write())
|
2010-10-08 14:00:14 -07:00
|
|
|
|
break;
|
|
|
|
|
|
|
2014-07-15 12:56:37 -07:00
|
|
|
|
            /* Instructions returning more than one register would need us to
|
|
|
|
|
|
* understand coalescing out more than one MOV at a time.
|
|
|
|
|
|
*/
|
2014-08-18 14:27:55 -07:00
|
|
|
|
if (scan_inst->regs_written > scan_inst->dst.width / 8)
|
2014-07-15 12:56:37 -07:00
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
            /* SEND instructions can't have an MRF as a destination. */
|
|
|
|
|
|
if (scan_inst->mlen)
|
2010-10-08 14:00:14 -07:00
|
|
|
|
break;
|
|
|
|
|
|
|
2015-04-15 18:00:05 -07:00
|
|
|
|
if (devinfo->gen == 6) {
|
2010-10-11 13:38:38 -07:00
|
|
|
|
               /* gen6 math instructions must have a GRF destination, so no
|
|
|
|
|
|
                * compute-to-MRF for them.
|
|
|
|
|
|
*/
|
2011-01-18 22:48:11 -08:00
|
|
|
|
if (scan_inst->is_math()) {
|
2010-10-11 13:38:38 -07:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2010-10-08 14:00:14 -07:00
|
|
|
|
if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
|
|
|
|
|
|
/* Found the creator of our MRF's source value. */
|
2010-11-18 15:03:50 +08:00
|
|
|
|
scan_inst->dst.file = MRF;
|
2011-05-15 09:36:19 -07:00
|
|
|
|
scan_inst->dst.reg = inst->dst.reg;
|
2010-11-18 15:03:50 +08:00
|
|
|
|
scan_inst->saturate |= inst->saturate;
|
2014-09-03 23:52:26 -07:00
|
|
|
|
inst->remove(block);
|
2010-11-18 15:03:50 +08:00
|
|
|
|
progress = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2013-02-05 15:36:18 -08:00
|
|
|
|
/* We don't handle control flow here. Most computation of
|
2010-11-18 15:03:50 +08:00
|
|
|
|
          * values that end up in MRFs happens shortly before the MRF
|
|
|
|
|
|
* write anyway.
|
|
|
|
|
|
*/
|
2014-09-01 15:01:23 -07:00
|
|
|
|
if (block->start() == scan_inst)
|
2010-11-18 15:03:50 +08:00
|
|
|
|
break;
|
|
|
|
|
|
|
|
|
|
|
|
/* You can't read from an MRF, so if someone else reads our
|
|
|
|
|
|
* MRF's source GRF that we wanted to rewrite, that stops us.
|
|
|
|
|
|
*/
|
|
|
|
|
|
bool interfered = false;
|
2014-03-17 10:39:43 -07:00
|
|
|
|
for (int i = 0; i < scan_inst->sources; i++) {
|
2010-11-18 15:03:50 +08:00
|
|
|
|
if (scan_inst->src[i].file == GRF &&
|
|
|
|
|
|
scan_inst->src[i].reg == inst->src[0].reg &&
|
|
|
|
|
|
scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
|
|
|
|
|
|
interfered = true;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
if (interfered)
|
|
|
|
|
|
break;
|
|
|
|
|
|
|
2011-03-28 16:54:39 -07:00
|
|
|
|
if (scan_inst->dst.file == MRF) {
|
|
|
|
|
|
/* If somebody else writes our MRF here, we can't
|
2010-11-18 15:03:50 +08:00
|
|
|
|
* compute-to-MRF before that.
|
|
|
|
|
|
*/
|
2011-05-15 09:36:19 -07:00
|
|
|
|
int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
|
2011-03-28 16:54:39 -07:00
|
|
|
|
int scan_mrf_high;
|
|
|
|
|
|
|
2011-05-15 09:36:19 -07:00
|
|
|
|
if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
|
2011-03-28 16:54:39 -07:00
|
|
|
|
scan_mrf_high = scan_mrf_low + 4;
|
2014-08-16 11:34:56 -07:00
|
|
|
|
} else if (scan_inst->exec_size == 16) {
|
2011-03-28 16:54:39 -07:00
|
|
|
|
scan_mrf_high = scan_mrf_low + 1;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
scan_mrf_high = scan_mrf_low;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (mrf_low == scan_mrf_low ||
|
|
|
|
|
|
mrf_low == scan_mrf_high ||
|
|
|
|
|
|
mrf_high == scan_mrf_low ||
|
|
|
|
|
|
mrf_high == scan_mrf_high) {
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
2010-11-18 15:03:50 +08:00
|
|
|
|
}
|
|
|
|
|
|
|
2013-10-09 17:17:59 -07:00
|
|
|
|
if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
|
2010-11-18 15:03:50 +08:00
|
|
|
|
/* Found a SEND instruction, which means that there are
|
|
|
|
|
|
* live values in MRFs from base_mrf to base_mrf +
|
|
|
|
|
|
* scan_inst->mlen - 1. Don't go pushing our MRF write up
|
|
|
|
|
|
* above it.
|
|
|
|
|
|
*/
|
2011-03-28 16:54:39 -07:00
|
|
|
|
if (mrf_low >= scan_inst->base_mrf &&
|
|
|
|
|
|
mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
if (mrf_high >= scan_inst->base_mrf &&
|
|
|
|
|
|
mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
|
2010-10-08 14:00:14 -07:00
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2012-06-05 13:13:33 -07:00
|
|
|
|
if (progress)
|
2014-09-01 10:54:00 -07:00
|
|
|
|
invalidate_live_intervals();
|
2012-06-05 13:13:33 -07:00
|
|
|
|
|
2010-10-08 14:00:14 -07:00
|
|
|
|
return progress;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
/**
 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
 * instructions to FS_OPCODE_REP_FB_WRITE.
 */
void
fs_visitor::emit_repclear_shader()
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   int base_mrf = 1;
   int color_mrf = base_mrf + 2;

   fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
                           fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
   mov->force_writemask_all = true;

   fs_inst *write;
   if (key->nr_color_regions == 1) {
      write = emit(FS_OPCODE_REP_FB_WRITE);
      write->saturate = key->clamp_fragment_color;
      write->base_mrf = color_mrf;
      write->target = 0;
      write->header_present = false;
      write->mlen = 1;
   } else {
      assume(key->nr_color_regions > 0);
      for (int i = 0; i < key->nr_color_regions; ++i) {
         write = emit(FS_OPCODE_REP_FB_WRITE);
         write->saturate = key->clamp_fragment_color;
         write->base_mrf = base_mrf;
         write->target = i;
         write->header_present = true;
         write->mlen = 3;
      }
   }
   write->eot = true;

   calculate_cfg();

   assign_constant_locations();
   assign_curb_setup();

   /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
   assert(mov->src[0].file == HW_REG);
   mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
}

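/* Message shapes emitted above, as I read the code (not taken from the
 * hardware docs): with a single render target the REP_FB_WRITE is headerless
 * and its one-register payload is just the replicated clear color (mlen 1);
 * with multiple render targets each write sends header + color starting at
 * base_mrf (mlen 3), retargeted per RT, and only the last write carries EOT.
 */
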
/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->is_control_flow()) {
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove(block);
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }
      if (inst->mlen > 0 && inst->base_mrf != -1) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->is_partial_write()) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

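/* The redundancy this pass removes looks like (illustrative pseudo-IR, not a
 * dump of real output):
 *
 *    mov m4, vgrf2      <- recorded in last_mrf_move[4]
 *    ...                <- nothing rewrites m4 or vgrf2 in between
 *    mov m4, vgrf2      <- equals() the recorded move, deleted
 */
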
static void
clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
{
   /* Clear the flag for registers that actually got read (as expected). */
   for (int i = 0; i < inst->sources; i++) {
      int grf;
      if (inst->src[i].file == GRF) {
         grf = inst->src[i].reg;
      } else if (inst->src[i].file == HW_REG &&
                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         grf = inst->src[i].fixed_hw_reg.nr;
      } else {
         continue;
      }

      if (grf >= first_grf &&
          grf < first_grf + grf_len) {
         deps[grf - first_grf] = false;
         if (inst->exec_size == 16)
            deps[grf - first_grf + 1] = false;
      }
   }
}

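/* An exec_size == 16 source spans two hardware registers, so a SIMD16 read
 * clears the dependency flag for both grf and grf + 1 above.
 */
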
/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 *      check for post destination dependencies on this instruction, software
 *      must ensure that there is no destination hazard for the case of ‘write
 *      followed by a posted write’ shown in the following example.
 *
 *      1. mov r3 0
 *      2. send r3.xy <rest of send instruction>
 *      3. mov r2 r3
 *
 *      Due to no post-destination dependency check on the ‘send’, the above
 *      code sequence could have two instructions (1 and 2) in flight at the
 *      same time that both consider ‘r3’ as the target of their final writes.
 */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
                                                        fs_inst *inst)
{
   int write_len = inst->regs_written;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (block->start() == scan_inst) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         return;
      }

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == GRF) {
         for (int i = 0; i < scan_inst->regs_written; i++) {
            int reg = scan_inst->dst.reg + i;

            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               inst->insert_before(block, DEP_RESOLVE_MOV(reg));
               needs_dep[reg - first_write_grf] = false;
               if (scan_inst->exec_size == 16)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Errata: A destination register from a send can not be
 *      used as a destination register until after it has been sourced by an
 *      instruction with a different destination register.
 */
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
{
   int write_len = inst->regs_written;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (block->end() == scan_inst) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(block,
                                        DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == GRF &&
          scan_inst->dst.reg >= first_write_grf &&
          scan_inst->dst.reg < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
         scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

void
fs_visitor::insert_gen4_send_dependency_workarounds()
{
   if (devinfo->gen != 4 || devinfo->is_g4x)
      return;

   bool progress = false;

   /* Note that we're done with register allocation, so GRF fs_regs always
    * have a .reg_offset of 0.
    */

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->mlen != 0 && inst->dst.file == GRF) {
         insert_gen4_pre_send_dependency_workarounds(block, inst);
         insert_gen4_post_send_dependency_workarounds(block, inst);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();
}

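/* Both workaround walks resolve a remaining hazard by inserting
 * DEP_RESOLVE_MOV(reg): a dummy MOV touching the register, which forces the
 * hardware to order the conflicting accesses.  Per the comments above, the
 * assumption is that one extra MOV is cheaper than leaving a dependency on a
 * SEND unresolved.
 */
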
/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
void
fs_visitor::lower_uniform_pull_constant_loads()
{
   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      if (devinfo->gen >= 7) {
         /* The offset arg before was a vec4-aligned byte offset.  We need to
          * turn it into a dword offset.
          */
         fs_reg const_offset_reg = inst->src[1];
         assert(const_offset_reg.file == IMM &&
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
         const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
         fs_reg payload = fs_reg(GRF, alloc.allocate(1));

         /* We have to use a message header on Skylake to get SIMD4x2 mode.
          * Reserve space for the register.
          */
         if (devinfo->gen >= 9) {
            payload.reg_offset++;
            alloc.sizes[payload.reg] = 2;
         }

         /* This is actually going to be a MOV, but since only the first dword
          * is accessed, we have a special opcode to do just that one.  Note
          * that this needs to be an operation that will be considered a def
          * by live variable analysis, or register allocation will explode.
          */
         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
                                               8, payload, const_offset_reg);
         setup->force_writemask_all = true;

         setup->ir = inst->ir;
         setup->annotation = inst->annotation;
         inst->insert_before(block, setup);

         /* Similarly, this will only populate the first 4 channels of the
          * result register (since we only use smear values from 0-3), but we
          * don't tell the optimizer.
          */
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;

         invalidate_live_intervals();
      } else {
         /* Before register allocation, we didn't tell the scheduler about the
          * MRF we use.  We know it's safe to use this MRF because nothing
          * else does except for register spill/unspill, which generates and
          * uses its MRF within a single IR instruction.
          */
         inst->base_mrf = 14;
         inst->mlen = 1;
      }
   }
}

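/* Illustrative gen7+ rewrite performed above (schematic, not a dump of real
 * IR):
 *
 *    before:  vgrf4 = UNIFORM_PULL_CONSTANT_LOAD surf, imm_byte_offset
 *    after:   payload = SET_SIMD4X2_OFFSET imm_dword_offset   [WE_all]
 *             vgrf4   = UNIFORM_PULL_CONSTANT_LOAD_GEN7 surf, payload
 */
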
bool
fs_visitor::lower_load_payload()
{
   bool progress = false;

   int vgrf_to_reg[alloc.count];
   int reg_count = 0;
   for (unsigned i = 0; i < alloc.count; ++i) {
      vgrf_to_reg[i] = reg_count;
      reg_count += alloc.sizes[i];
   }

   struct {
      bool written:1; /* Whether this register has ever been written */
      bool force_writemask_all:1;
      bool force_sechalf:1;
   } metadata[reg_count];
   memset(metadata, 0, sizeof(metadata));

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         const int dst_reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
         bool force_sechalf = inst->force_sechalf &&
                              !inst->force_writemask_all;
         bool toggle_sechalf = inst->dst.width == 16 &&
                               type_sz(inst->dst.type) == 4 &&
                               !inst->force_writemask_all;
         for (int i = 0; i < inst->regs_written; ++i) {
            metadata[dst_reg + i].written = true;
            metadata[dst_reg + i].force_sechalf = force_sechalf;
            metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
            force_sechalf = (toggle_sechalf != force_sechalf);
         }
      }

      if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
         assert(inst->dst.file == MRF || inst->dst.file == GRF);
         fs_reg dst = inst->dst;

         for (int i = 0; i < inst->sources; i++) {
            dst.width = inst->src[i].effective_width;
            dst.type = inst->src[i].type;

            if (inst->src[i].file == BAD_FILE) {
               /* Do nothing but otherwise increment as normal */
            } else if (dst.file == MRF &&
                       dst.width == 8 &&
                       devinfo->has_compr4 &&
                       i + 4 < inst->sources &&
                       inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
               fs_reg compr4_dst = dst;
               compr4_dst.reg += BRW_MRF_COMPR4;
               compr4_dst.width = 16;
               fs_reg compr4_src = inst->src[i];
               compr4_src.width = 16;
               fs_inst *mov = MOV(compr4_dst, compr4_src);
               mov->force_writemask_all = true;
               inst->insert_before(block, mov);
               /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
               inst->src[i + 4].file = BAD_FILE;
            } else {
               fs_inst *mov = MOV(dst, inst->src[i]);
               if (inst->src[i].file == GRF) {
                  int src_reg = vgrf_to_reg[inst->src[i].reg] +
                                inst->src[i].reg_offset;
                  mov->force_sechalf = metadata[src_reg].force_sechalf;
                  mov->force_writemask_all = metadata[src_reg].force_writemask_all;
               } else {
                  /* We don't have any useful metadata for immediates or
                   * uniforms.  Assume that any of the channels of the
                   * destination may be used.
                   */
                  assert(inst->src[i].file == IMM ||
                         inst->src[i].file == UNIFORM);
                  mov->force_writemask_all = true;
               }

               if (dst.file == GRF) {
                  const int dst_reg = vgrf_to_reg[dst.reg] + dst.reg_offset;
                  const bool force_writemask = mov->force_writemask_all;
                  metadata[dst_reg].force_writemask_all = force_writemask;
                  metadata[dst_reg].force_sechalf = mov->force_sechalf;
                  if (dst.width * type_sz(dst.type) > 32) {
                     assert(!mov->force_sechalf);
                     metadata[dst_reg + 1].force_writemask_all = force_writemask;
                     metadata[dst_reg + 1].force_sechalf = !force_writemask;
                  }
               }

               inst->insert_before(block, mov);
            }

            dst = offset(dst, 1);
         }

         inst->remove(block);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

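/* In short: each LOAD_PAYLOAD above is expanded into one MOV per source,
 * walking the destination one register at a time, with a COMPR4 SIMD16 MOV
 * substituted when two MRF sources are the matching halves of one register
 * pair.  The metadata[] side table only exists to propagate
 * force_writemask_all / force_sechalf from whatever wrote each source.
 */
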
void
fs_visitor::dump_instructions()
{
   dump_instructions(NULL);
}

void
fs_visitor::dump_instructions(const char *name)
{
   FILE *file = stderr;
   if (name && geteuid() != 0) {
      file = fopen(name, "w");
      if (!file)
         file = stderr;
   }

   if (cfg) {
      calculate_register_pressure();
      int ip = 0, max_pressure = 0;
      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
         max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
         fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
         dump_instruction(inst, file);
         ip++;
      }
      fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
   } else {
      int ip = 0;
      foreach_in_list(backend_instruction, inst, &instructions) {
         fprintf(file, "%4d: ", ip++);
         dump_instruction(inst, file);
      }
   }

   if (file != stderr) {
      fclose(file);
   }
}

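/* The geteuid() != 0 guard above means instruction dumps are never written
 * to a file when running as root; presumably a precaution against creating
 * root-owned debug files in the working directory.
 */
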
void
fs_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}

void
fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   fs_inst *inst = (fs_inst *)be_inst;

   if (inst->predicate) {
      fprintf(file, "(%cf0.%d) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, "(%d) ", inst->exec_size);

   switch (inst->dst.file) {
   case GRF:
      fprintf(file, "vgrf%d", inst->dst.reg);
      if (inst->dst.width != dispatch_width)
         fprintf(file, "@%d", inst->dst.width);
      if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
          inst->dst.subreg_offset)
         fprintf(file, "+%d.%d",
                 inst->dst.reg_offset, inst->dst.subreg_offset);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case UNIFORM:
      fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
      break;
   case ATTR:
      fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
      break;
   case HW_REG:
      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         switch (inst->dst.fixed_hw_reg.nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   default:
      fprintf(file, "???");
      break;
   }
   fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(file, "vgrf%d", inst->src[i].reg);
         if (inst->src[i].width != dispatch_width)
            fprintf(file, "@%d", inst->src[i].width);
         if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
             inst->src[i].subreg_offset)
            fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         break;
      case MRF:
         fprintf(file, "***m%d***", inst->src[i].reg);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
         if (inst->src[i].reladdr) {
            fprintf(file, "+reladdr");
         } else if (inst->src[i].subreg_offset) {
            fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         }
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
            break;
         case BRW_REGISTER_TYPE_W:
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
            break;
         case BRW_REGISTER_TYPE_UW:
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
            break;
         case BRW_REGISTER_TYPE_VF:
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            fprintf(file, "-");
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               fprintf(file, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            default:
               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            }
         } else {
            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
         }
         if (inst->src[i].fixed_hw_reg.subnr)
            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         break;
      default:
         fprintf(file, "???");
         break;
      }
      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, " ");

   if (dispatch_width == 16 && inst->exec_size == 8) {
      if (inst->force_sechalf)
         fprintf(file, "2ndhalf ");
      else
         fprintf(file, "1sthalf ");
   }

   fprintf(file, "\n");
}

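/* Hand-written example of the format produced above (illustrative only):
 *
 *    (+f0.0) add.sat(8) vgrf7:F, vgrf3:F, u1:F 1sthalf
 */
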
/**
 * Possibly returns an instruction that set up @param reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the instruction generated
 * only reg -- it might be the size=4 destination of a texture instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           const fs_reg &reg)
{
   if (end == start ||
       end->is_partial_write() ||
       reg.reladdr ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}

void
fs_visitor::setup_payload_gen6()
{
   bool uses_depth =
      (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
   unsigned barycentric_interp_modes =
      (stage == MESA_SHADER_FRAGMENT) ?
      ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;

   assert(devinfo->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   payload.num_regs = 2;
   /* R2: only for 32-pixel dispatch.*/

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         payload.barycentric_coord_reg[i] = payload.num_regs;
         payload.num_regs += 2;
         if (dispatch_width == 16) {
            payload.num_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      payload.source_depth_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not SIMD8. */
         payload.num_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      payload.source_w_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not SIMD8. */
         payload.num_regs++;
      }
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
      brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
      prog_data->uses_pos_offset = key->compute_pos_offset;
      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg = payload.num_regs;
         payload.num_regs++;
      }
   }

   /* R32: MSAA input coverage mask */
   if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
      assert(devinfo->gen >= 7);
      payload.sample_mask_in_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R33: input coverage mask if not SIMD8. */
         payload.num_regs++;
      }
   }

   /* R34-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

void
fs_visitor::setup_vs_payload()
{
   /* R0: thread header, R1: urb handles */
   payload.num_regs = 2;
}

void
fs_visitor::setup_cs_payload()
{
   assert(brw->gen >= 7);

   payload.num_regs = 1;
}

void
fs_visitor::assign_binding_table_offsets()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   uint32_t next_binding_table_offset = 0;

   /* If there are no color regions, we still perform an FB write to a null
    * renderbuffer, which we place at surface index 0.
    */
   prog_data->binding_table.render_target_start = next_binding_table_offset;
   next_binding_table_offset += MAX2(key->nr_color_regions, 1);

   assign_common_binding_table_offsets(next_binding_table_offset);
}

void
fs_visitor::calculate_register_pressure()
{
   invalidate_live_intervals();
   calculate_live_intervals();

   unsigned num_instructions = 0;
   foreach_block(block, cfg)
      num_instructions += block->instructions.length();

   regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);

   for (unsigned reg = 0; reg < alloc.count; reg++) {
      for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
         regs_live_at_ip[ip] += alloc.sizes[reg];
   }
}

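/* After this runs, regs_live_at_ip[ip] holds the total size (in registers)
 * of every virtual GRF whose live interval covers instruction position ip;
 * dump_instructions() prints it as the {nnn} column.
 */
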
void
fs_visitor::optimize()
{
   split_virtual_grfs();

   move_uniform_array_access_to_pull_constants();
   assign_constant_locations();
   demote_pull_constants();

#define OPT(pass, args...) ({                                          \
      pass_num++;                                                      \
      bool this_progress = pass(args);                                 \
                                                                       \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
         char filename[64];                                            \
         snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,          \
                  stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
                                                                       \
         backend_visitor::dump_instructions(filename);                 \
      }                                                                \
                                                                       \
      progress = progress || this_progress;                            \
      this_progress;                                                   \
   })

   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s%d-%04d-00-start",
               stage_abbrev, dispatch_width,
               shader_prog ? shader_prog->Name : 0);

      backend_visitor::dump_instructions(filename);
   }

   bool progress;
   int iteration = 0;
   int pass_num = 0;
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(remove_duplicate_mrf_writes);

      OPT(opt_algebraic);
      OPT(opt_cse);
      OPT(opt_copy_propagate);
      OPT(opt_peephole_predicated_break);
      OPT(opt_cmod_propagation);
      OPT(dead_code_eliminate);
      OPT(opt_peephole_sel);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_register_renaming);
      OPT(opt_redundant_discard_jumps);
      OPT(opt_saturate_propagation);
      OPT(opt_zero_samples);
      OPT(register_coalesce);
      OPT(compute_to_mrf);

      OPT(compact_virtual_grfs);
   } while (progress);

   pass_num = 0;

   OPT(opt_sampler_eot);

   if (OPT(lower_load_payload)) {
      split_virtual_grfs();
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
   }

   OPT(opt_combine_constants);

   lower_uniform_pull_constant_loads();
}

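/* OPT() is a GNU statement expression, so a pass can be run both for its
 * side effects and as a condition, as in the lower_load_payload block above.
 * With INTEL_DEBUG=optimizer, every pass that reports progress also dumps
 * the IR to a file named from the stage, dispatch width, iteration and pass
 * number, which makes it straightforward to bisect a miscompile to a single
 * pass.
 */
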
/**
 * Three-source instructions must have a GRF/MRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
 */
void
fs_visitor::fixup_3src_null_dest()
{
   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->is_3src() && inst->dst.is_null()) {
         inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
                            inst->dst.type);
      }
   }
}

void
fs_visitor::allocate_registers()
{
   bool allocated_without_spills;

   static const enum instruction_scheduler_mode pre_modes[] = {
      SCHEDULE_PRE,
      SCHEDULE_PRE_NON_LIFO,
      SCHEDULE_PRE_LIFO,
   };

   /* Try each scheduling heuristic to see if it can successfully register
    * allocate without spilling.  They should be ordered by decreasing
    * performance but increasing likelihood of allocating.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
      schedule_instructions(pre_modes[i]);

      if (0) {
         assign_regs_trivial();
         allocated_without_spills = true;
      } else {
         allocated_without_spills = assign_regs(false);
      }
      if (allocated_without_spills)
         break;
   }

   if (!allocated_without_spills) {
      /* We assume that any spilling is worse than just dropping back to
       * SIMD8.  There's probably actually some intermediate point where
       * SIMD16 with a couple of spills is still better.
       */
      if (dispatch_width == 16) {
         fail("Failure to register allocate.  Reduce number of "
              "live scalar values to avoid this.");
      } else {
         perf_debug("%s shader triggered register spilling.  "
                    "Try reducing the number of live scalar values to "
                    "improve performance.\n", stage_name);
      }

      /* Since we're out of heuristics, just go spill registers until we
       * get an allocation.
       */
      while (!assign_regs(true)) {
         if (failed)
            break;
      }
   }

   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
    * so based on the actual physical registers in use.
    */
   insert_gen4_send_dependency_workarounds();

   if (failed)
      return;

   if (!allocated_without_spills)
      schedule_instructions(SCHEDULE_POST);

   if (last_scratch > 0)
      prog_data->total_scratch = brw_get_scratch_size(last_scratch);
}

bool
fs_visitor::run_vs()
{
   assert(stage == MESA_SHADER_VERTEX);

   assign_common_binding_table_offsets(0);
   setup_vs_payload();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_begin();

   if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
      emit_nir_code();
   } else {
      foreach_in_list(ir_instruction, ir, shader->base.ir) {
         base_ir = ir;
         this->result = reg_undef;
         ir->accept(this);
      }
      base_ir = NULL;
   }

   if (failed)
      return false;

   emit_urb_writes();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_end();

   calculate_cfg();

   optimize();

   assign_curb_setup();
   assign_vs_urb_setup();

   fixup_3src_null_dest();
   allocate_registers();

   return !failed;
}

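/* run_vs() above and run_fs()/run_cs() below share the same backbone: emit
 * IR (NIR or GLSL IR), calculate_cfg(), optimize(), stage-specific payload
 * and URB/CURB setup, fixup_3src_null_dest(), then allocate_registers().
 * Only the emit step and the payload bookkeeping differ per stage.
 */
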
2011-03-11 19:19:01 -08:00
|
|
|
|
bool
|
2014-10-27 23:36:31 -07:00
|
|
|
|
fs_visitor::run_fs()
|
2010-08-26 12:12:00 -07:00
|
|
|
|
{
|
2014-10-27 23:36:31 -07:00
|
|
|
|
brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
|
|
|
|
|
|
brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
|
|
|
|
|
|
|
|
|
|
|
|
assert(stage == MESA_SHADER_FRAGMENT);
|
|
|
|
|
|
|
2014-08-28 17:34:29 -07:00
|
|
|
|
sanity_param_count = prog->Parameters->NumParameters;
|
2010-08-26 12:12:00 -07:00
|
|
|
|
|
2013-10-02 14:07:40 -07:00
|
|
|
|
assign_binding_table_offsets();
|
|
|
|
|
|
|
2015-04-15 18:00:05 -07:00
|
|
|
|
if (devinfo->gen >= 6)
|
2012-11-13 19:36:18 -08:00
|
|
|
|
setup_payload_gen6();
|
|
|
|
|
|
else
|
2012-11-19 14:59:14 -08:00
|
|
|
|
setup_payload_gen4();
|
2010-08-26 12:12:00 -07:00
|
|
|
|
|
2010-08-15 18:58:58 -07:00
|
|
|
|
if (0) {
|
2011-03-11 19:19:01 -08:00
|
|
|
|
emit_dummy_fs();
|
2014-09-26 14:47:03 -07:00
|
|
|
|
} else if (brw->use_rep_send && dispatch_width == 16) {
|
|
|
|
|
|
emit_repclear_shader();
|
2010-08-15 18:58:58 -07:00
|
|
|
|
} else {
|
2012-11-27 14:10:52 -08:00
|
|
|
|
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
|
|
|
|
|
|
emit_shader_time_begin();
|
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
calculate_urb_setup();
|
2014-08-28 17:34:29 -07:00
|
|
|
|
if (prog->InputsRead > 0) {
|
2015-04-15 18:00:05 -07:00
|
|
|
|
if (devinfo->gen < 6)
|
2013-10-19 21:27:37 -07:00
|
|
|
|
emit_interpolation_setup_gen4();
|
|
|
|
|
|
else
|
|
|
|
|
|
emit_interpolation_setup_gen6();
|
|
|
|
|
|
}
|
2010-08-16 21:53:02 -07:00
|
|
|
|
|
2012-12-06 12:15:13 -08:00
|
|
|
|
/* We handle discards by keeping track of the still-live pixels in f0.1.
|
|
|
|
|
|
* Initialize it with the dispatched pixels.
|
|
|
|
|
|
*/
|
2014-10-27 23:36:31 -07:00
|
|
|
|
if (wm_prog_data->uses_kill) {
|
2012-12-06 12:15:13 -08:00
|
|
|
|
fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
|
|
|
|
|
|
discard_init->flag_subreg = 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2010-08-15 18:58:58 -07:00
|
|
|
|
/* Generate FS IR for main(). (the visitor only descends into
|
|
|
|
|
|
* functions called "main").
|
|
|
|
|
|
*/
|
2015-04-02 17:02:43 -07:00
|
|
|
|
if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
|
2015-01-30 01:16:49 -08:00
|
|
|
|
emit_nir_code();
|
|
|
|
|
|
} else if (shader) {
|
|
|
|
|
|
foreach_in_list(ir_instruction, ir, shader->base.ir) {
|
|
|
|
|
|
base_ir = ir;
|
|
|
|
|
|
this->result = reg_undef;
|
|
|
|
|
|
ir->accept(this);
|
2012-08-27 14:35:01 -07:00
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
emit_fragment_program_code();
|
2010-08-26 14:42:06 -07:00
|
|
|
|
}
|
2012-11-29 16:51:13 -08:00
|
|
|
|
base_ir = NULL;
|
2011-06-10 16:00:03 -07:00
|
|
|
|
if (failed)
|
|
|
|
|
|
return false;
|
2010-08-15 18:58:58 -07:00
|
|
|
|
|
2015-04-10 10:04:55 -07:00
|
|
|
|
if (wm_prog_data->uses_kill)
|
|
|
|
|
|
emit(FS_OPCODE_PLACEHOLDER_HALT);
|
2013-03-27 23:19:39 -07:00
|
|
|
|
|
2014-10-27 23:36:31 -07:00
|
|
|
|
if (wm_key->alpha_test_func)
|
2013-10-27 12:32:03 +13:00
|
|
|
|
emit_alpha_test();
|
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
emit_fb_writes();
|
2010-10-13 20:17:15 -07:00
|
|
|
|
|
2015-02-26 22:55:54 -08:00
|
|
|
|
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
|
|
|
|
|
|
emit_shader_time_end();
|
|
|
|
|
|
|
2015-02-13 10:34:39 -08:00
|
|
|
|
calculate_cfg();
|
|
|
|
|
|
|
2014-11-13 16:28:18 -08:00
|
|
|
|
optimize();
|
2013-02-15 19:26:48 -08:00
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
assign_curb_setup();
|
|
|
|
|
|
assign_urb_setup();
|
2011-01-18 22:03:34 -08:00
|
|
|
|
|
2014-12-29 20:33:12 -08:00
|
|
|
|
fixup_3src_null_dest();
|
2014-11-13 16:28:19 -08:00
|
|
|
|
allocate_registers();
|
2013-02-05 15:46:22 -08:00
|
|
|
|
|
2014-11-13 16:28:17 -08:00
|
|
|
|
if (failed)
|
|
|
|
|
|
return false;
|
2014-05-13 20:51:32 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-10-27 23:36:31 -07:00
|
|
|
|
if (dispatch_width == 8)
|
|
|
|
|
|
wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
|
|
|
|
|
|
else
|
|
|
|
|
|
wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == prog->Parameters->NumParameters);

   return !failed;
}
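
/* The compute shader analogue of run_fs(): the same backend pipeline, but
 * fed by NIR only, with the CS thread payload and an explicit terminate
 * message in place of framebuffer writes.
 */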
bool
fs_visitor::run_cs()
{
   assert(stage == MESA_SHADER_COMPUTE);
   assert(shader);

   sanity_param_count = prog->Parameters->NumParameters;

   assign_common_binding_table_offsets(0);

   setup_cs_payload();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_begin();

   emit_nir_code();

   if (failed)
      return false;

   emit_cs_terminate();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_end();

   calculate_cfg();

   optimize();

   assign_curb_setup();

   fixup_3src_null_dest();
   allocate_registers();

   if (failed)
      return false;

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == prog->Parameters->NumParameters);

   return !failed;
}
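
/* Top-level entry point for fragment shader compilation: run the visitor at
 * SIMD8, optionally try a SIMD16 variant, and hand the resulting CFGs to the
 * generator to produce native code.
 */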
const unsigned *
brw_wm_fs_emit(struct brw_context *brw,
               void *mem_ctx,
               const struct brw_wm_prog_key *key,
               struct brw_wm_prog_data *prog_data,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   bool start_busy = false;
   double start_time = 0;

   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      brw_dump_ir("fragment", prog, &shader->base, &fp->Base);

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
   if (!v.run_fs()) {
      if (prog) {
         prog->LinkStatus = false;
         ralloc_strcat(&prog->InfoLog, v.fail_msg);
      }

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }
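
   /* If the SIMD8 compile noted via no16() that SIMD16 can never work, skip
    * the SIMD16 attempt entirely rather than waste compile time, and just
    * issue a performance warning.
    */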
   cfg_t *simd16_cfg = NULL;
   fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
   if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
      if (!v.simd16_unsupported) {
         /* Try a SIMD16 compile */
         v2.import_uniforms(&v);
         if (!v2.run_fs()) {
            perf_debug("SIMD16 shader failed to compile, falling back to "
                       "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
         } else {
            simd16_cfg = v2.cfg;
         }
      } else {
         perf_debug("SIMD16 shader unsupported, falling back to "
                    "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
      }
   }
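
   /* Decide which variants to ship: drop the SIMD8 program only when a
    * SIMD16 program exists and SIMD8 is either disabled for debugging or
    * unprofitable (Gen4 prefers SIMD16 alone when it is available).
    */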
   cfg_t *simd8_cfg;
   int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
   if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
      simd8_cfg = NULL;
      prog_data->no_8 = true;
   } else {
      simd8_cfg = v.cfg;
      prog_data->no_8 = false;
   }

   fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
                  &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      char *name;
      if (prog)
         name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
                                prog->Label ? prog->Label : "unnamed",
                                prog->Name);
      else
         name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);

      g.enable_debug(name);
   }
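
   /* Generate native code for both variants into a single program store;
    * the SIMD16 program's starting offset is recorded in prog_offset_16 so
    * the state upload code can locate it.
    */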
   if (simd8_cfg)
      g.generate_code(simd8_cfg, 8);
   if (simd16_cfg)
      prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);

   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return g.get_assembly(final_assembly_size);
}
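
/* Precompile the fragment program at link time with a guessed program key so
 * that a plausible binary exists before the first draw; a state-based
 * recompile happens at draw time if any of the guesses turn out wrong.
 */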
extern "C" bool
brw_fs_precompile(struct gl_context *ctx,
                  struct gl_shader_program *shader_prog,
                  struct gl_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_wm_prog_key key;

   struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));
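
   /* Pre-Gen6, depth/stencil state affects the WM program through the "iz
    * lookup" bits; assume the common case of depth test and write enabled.
    */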
   if (brw->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
                                         BRW_FS_VARYING_INPUT_MASK) > 16)
      key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;

   brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);

   if (fp->Base.InputsRead & VARYING_BIT_POS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
         ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
           BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));

   if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
                          key.nr_color_regions > 1;
   }

   key.program_string_id = bfp->id;
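
   /* brw_codegen_wm_prog() updates the context's idea of the current WM
    * program; save and restore it, since no drawing is actually happening.
    */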
   uint32_t old_prog_offset = brw->wm.base.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);

   brw->wm.base.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}
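
/* Guess sampler swizzles for the precompile key.  Hardware without shader
 * channel select (pre-Haswell) bakes texture swizzles into the program
 * binary, so the guess matters there.
 */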
void
brw_setup_tex_for_precompile(struct brw_context *brw,
                             struct brw_sampler_prog_key_data *tex,
                             struct gl_program *prog)
{
   const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
   unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         tex->swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         tex->swizzles[i] = SWIZZLE_XYZW;
      }
   }
}