/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {
#include <sys/types.h>

#include "main/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "brw_dead_control_flow.h"
#include "main/uniforms.h"
#include "brw_fs_live_variables.h"
#include "glsl/glsl_types.h"

void
fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
{
   memset(this, 0, sizeof(*this));

   this->opcode = opcode;
   this->dst = dst;
   this->src = src;
   this->sources = sources;

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   this->regs_written = 1;

   this->writes_accumulator = false;
}
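
/* Note: every fs_inst constructor below funnels through init(), so a new
 * instruction is always fully zeroed before the common fields are filled
 * in; regs_written defaults to 1 and writes_accumulator to false.
 */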

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
{
   fs_reg *src = ralloc_array(this, fs_reg, 3);
   init(opcode, dst, src, 0);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
{
   fs_reg *src = ralloc_array(this, fs_reg, 3);
   src[0] = src0;
   init(opcode, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1)
{
   fs_reg *src = ralloc_array(this, fs_reg, 3);
   src[0] = src0;
   src[1] = src1;
   init(opcode, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1, const fs_reg &src2)
{
   fs_reg *src = ralloc_array(this, fs_reg, 3);
   src[0] = src0;
   src[1] = src1;
   src[2] = src2;
   init(opcode, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
{
   init(opcode, dst, src, sources);
}

fs_inst::fs_inst(const fs_inst &that)
{
   memcpy(this, &that, sizeof(that));

   this->src = ralloc_array(this, fs_reg, that.sources);

   for (int i = 0; i < that.sources; i++)
      this->src[i] = that.src[i];
}

void
fs_inst::resize_sources(uint8_t num_sources)
{
   if (this->sources != num_sources) {
      this->src = reralloc(this, this->src, fs_reg, num_sources);
      this->sources = num_sources;
   }
}
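
/* A sketch of typical construction and emission (the emit() helpers are
 * defined later in this file; dst/src0/src1 are assumed fs_regs from
 * surrounding code):
 *
 *    fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    emit(inst);
 *
 * The copy constructor above deliberately deep-copies the ralloc'ed src
 * array so the copy can be edited without aliasing the original.
 */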

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU2_ACC(op)                                                    \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(SEL)
ALU2(MAC)
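
/* Each ALU* invocation above expands to an fs_visitor helper that only
 * allocates the instruction; nothing reaches the instruction stream until
 * it is passed to emit().  For example (a sketch):
 *
 *    emit(MUL(dst, a, b));        // from ALU2(MUL)
 *    emit(MACH(dst, a, b));       // from ALU2_ACC(MACH); also marks the
 *                                 // accumulator as written
 */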

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6 IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(brw->gen == 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
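
/* A minimal CMP usage sketch (condition codes come from the usual
 * BRW_CONDITIONAL_* set; x is an assumed fs_reg from surrounding code):
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *
 * which leaves the packed per-channel result in the flag register for a
 * following predicated instruction.
 */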

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
                                       const fs_reg &surf_index,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (brw->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   if (brw->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}
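
/* The instructions are returned in an exec_list rather than emitted
 * directly so the caller can splice the load wherever it is needed --
 * e.g. through an emit() overload that consumes an exec_list (assumed to
 * exist elsewhere in the visitor) or by inserting before a specific
 * instruction.
 */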

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst) const
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg) const
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written);
}

bool
fs_inst::is_send_from_grf() const
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF) ||
           (is_tex() && src[0].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (brw->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   if (!inst->can_do_source_mods())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   stride = 1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           subreg_offset == r.subreg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           stride == r.stride &&
           imm.u == r.imm.u);
}

fs_reg &
fs_reg::apply_stride(unsigned stride)
{
   assert((this->stride * stride) <= 4 &&
          (is_power_of_two(stride) || stride == 0) &&
          file != HW_REG && file != IMM);
   this->stride *= stride;
   return *this;
}

fs_reg &
fs_reg::set_smear(unsigned subreg)
{
   assert(file != HW_REG && file != IMM);
   subreg_offset = subreg * type_sz(type);
   stride = 0;
   return *this;
}
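
/* set_smear() pins a register read to a single channel (stride 0), which
 * is how emit_shader_time_end() below samples just field 2 of the
 * timestamp:
 *
 *    fs_reg reset = shader_end_time;
 *    reset.set_smear(2);
 */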

bool
fs_reg::is_contiguous() const
{
   return stride == 1;
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

bool
fs_reg::is_null() const
{
   return file == HW_REG &&
          fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          fixed_hw_reg.nr == BRW_ARF_NULL;
}

bool
fs_reg::is_valid_3src() const
{
   return file == GRF || file == UNIFORM;
}

bool
fs_reg::is_accumulator() const
{
   return file == HW_REG &&
          fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
          fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}
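
/* type_size() counts scalar slots, so for example (a sketch): float -> 1,
 * vec4 -> 4, mat3 -> 9, and vec2[4] -> 8.  Samplers and atomic counters
 * occupy no register space at all.
 */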

fs_reg
fs_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.set_smear(0);

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.set_smear(2);
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                             fs_reg(), payload, offset, value));
}

void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}

/**
 * Mark this program as impossible to compile in SIMD16 mode.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16 mode, so the compiler can skip
 * the SIMD16 compile altogether.
 *
 * During a SIMD16 compile (if one happens anyway), this just calls fail().
 */
void
fs_visitor::no16(const char *format, ...)
{
   va_list va;

   va_start(va, format);

   if (dispatch_width == 16) {
      vfail(format, va);
   } else {
      simd16_unsupported = true;

      if (brw->perf_debug) {
         if (no16_msg)
            ralloc_vasprintf_append(&no16_msg, format, va);
         else
            no16_msg = ralloc_vasprintf(mem_ctx, format, va);
      }
   }

   va_end(va);
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src[], int sources)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
}
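
/* Every overload above boils down to emit(new(mem_ctx) fs_inst(...)); the
 * single-argument emit(fs_inst *), defined elsewhere in the visitor, is
 * what actually appends the instruction to the stream.  So, e.g. (sketch):
 *
 *    emit(BRW_OPCODE_MOV, dst, src);
 *
 * allocates and emits a MOV in one step.
 */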

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write() const
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           this->force_uncompressed ||
           this->force_sechalf || !this->dst.is_contiguous());
}
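
/* The BRW_OPCODE_SEL exception above reflects that a predicated SEL always
 * writes the full destination -- the predicate merely chooses between the
 * two sources per channel -- so it still counts as a complete write.
 */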

int
fs_inst::regs_read(fs_visitor *v, int arg) const
{
   if (is_tex() && arg == 0 && src[0].file == GRF) {
      if (v->dispatch_width == 16)
         return (mlen + 1) / 2;
      else
         return mlen;
   }
   return 1;
}

bool
fs_inst::reads_flag() const
{
   return predicate;
}

bool
fs_inst::writes_flag() const
{
   return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
          opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   if (inst->base_mrf == -1)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return 2;
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
      return 0;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}
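
/* virtual_grf_alloc() returns an index usable as fs_reg::reg; e.g.
 * VARYING_PULL_CONSTANT_LOAD above allocates its vec4 result with:
 *
 *    fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
 */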

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->push_constant_loc = v->push_constant_loc;
   this->pull_constant_loc = v->pull_constant_loc;
   this->uniforms = v->uniforms;
   this->param_size = v->param_size;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = uniforms;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         stage_prog_data->param[uniforms++] = &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() == uniforms);
   (void)params_before;
}
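
/* Prefix-matching example (a sketch): for "uniform vec4 a[2]", the storage
 * entries named "a[0]" and "a[1]" both match ir->name "a" through the '['
 * case above, so all eight float pointers land in stage_prog_data->param
 * in type-walk order.
 */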
|
|
|
|
|
|
|
2010-09-28 16:23:04 -07:00
|
|
|
|
|
|
|
|
|
|
/* Our support for builtin uniforms is even scarier than non-builtin.
|
|
|
|
|
|
* It sits on top of the PROG_STATE_VAR parameters that are
|
|
|
|
|
|
* automatically updated from GL context state.
|
|
|
|
|
|
*/
|
|
|
|
|
|
void
|
|
|
|
|
|
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
|
|
|
|
|
|
{
|
2011-01-25 10:41:20 -08:00
|
|
|
|
const ir_state_slot *const slots = ir->state_slots;
|
|
|
|
|
|
assert(ir->state_slots != NULL);
|
2010-09-28 16:23:04 -07:00
|
|
|
|
|
2011-03-24 18:31:05 -07:00
|
|
|
|
for (unsigned int i = 0; i < ir->num_state_slots; i++) {
|
|
|
|
|
|
/* This state reference has already been setup by ir_to_mesa, but we'll
|
|
|
|
|
|
* get the same index back here.
|
|
|
|
|
|
*/
|
|
|
|
|
|
int index = _mesa_add_state_reference(this->fp->Base.Parameters,
|
|
|
|
|
|
(gl_state_index *)slots[i].tokens);
|
|
|
|
|
|
|
|
|
|
|
|
/* Add each of the unique swizzles of the element as a parameter.
|
|
|
|
|
|
* This'll end up matching the expected layout of the
|
|
|
|
|
|
* array/matrix/structure we're trying to fill in.
|
|
|
|
|
|
*/
|
|
|
|
|
|
int last_swiz = -1;
|
|
|
|
|
|
for (unsigned int j = 0; j < 4; j++) {
|
|
|
|
|
|
int swiz = GET_SWZ(slots[i].swizzle, j);
|
|
|
|
|
|
if (swiz == last_swiz)
|
|
|
|
|
|
break;
|
|
|
|
|
|
last_swiz = swiz;
|
|
|
|
|
|
|
2014-02-19 15:27:01 +01:00
|
|
|
|
stage_prog_data->param[uniforms++] =
|
2012-11-20 16:26:22 -08:00
|
|
|
|
&fp->Base.Parameters->ParameterValues[index][swiz].f;
|
2010-09-28 16:23:04 -07:00
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2010-10-06 11:13:22 -07:00
|
|
|
|
fs_reg *
|
2010-09-28 13:29:45 -07:00
|
|
|
|
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
|
|
|
|
|
|
{
|
|
|
|
|
|
fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
|
|
|
|
|
|
fs_reg wpos = *reg;
|
2014-05-13 21:06:00 -07:00
|
|
|
|
bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
|
2010-09-28 13:29:45 -07:00
|
|
|
|
|
|
|
|
|
|
/* gl_FragCoord.x */
|
2013-12-12 13:51:01 +02:00
|
|
|
|
if (ir->data.pixel_center_integer) {
|
2012-11-09 12:01:05 -08:00
|
|
|
|
emit(MOV(wpos, this->pixel_x));
|
2010-09-28 13:29:45 -07:00
|
|
|
|
} else {
|
2012-11-09 12:01:05 -08:00
|
|
|
|
emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
|
2010-09-28 13:29:45 -07:00
|
|
|
|
}
|
|
|
|
|
|
wpos.reg_offset++;
|
|
|
|
|
|
|
|
|
|
|
|
/* gl_FragCoord.y */
|
2013-12-12 13:51:01 +02:00
|
|
|
|
if (!flip && ir->data.pixel_center_integer) {
|
2012-11-09 12:01:05 -08:00
|
|
|
|
emit(MOV(wpos, this->pixel_y));
|
2010-09-28 13:29:45 -07:00
|
|
|
|
} else {
|
|
|
|
|
|
fs_reg pixel_y = this->pixel_y;
|
2013-12-12 13:51:01 +02:00
|
|
|
|
float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
|
2010-09-28 13:29:45 -07:00
|
|
|
|
|
2010-11-13 14:00:58 -08:00
|
|
|
|
if (flip) {
|
2010-09-28 13:29:45 -07:00
|
|
|
|
pixel_y.negate = true;
|
2014-05-13 21:06:00 -07:00
|
|
|
|
offset += key->drawable_height - 1.0;
|
2010-09-28 13:29:45 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2012-11-09 12:01:05 -08:00
|
|
|
|
emit(ADD(wpos, pixel_y, fs_reg(offset)));
|
2010-09-28 13:29:45 -07:00
|
|
|
|
}
|
|
|
|
|
|
wpos.reg_offset++;
|
|
|
|
|
|
|
|
|
|
|
|
/* gl_FragCoord.z */
|
2013-07-06 00:36:46 -07:00
|
|
|
|
if (brw->gen >= 6) {
|
2014-05-13 21:52:51 -07:00
|
|
|
|
emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
|
2010-12-13 13:37:54 -08:00
|
|
|
|
} else {
|
2011-10-21 17:20:32 -07:00
|
|
|
|
emit(FS_OPCODE_LINTERP, wpos,
|
|
|
|
|
|
this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
|
|
|
|
|
|
this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
|
2013-02-23 09:00:58 -08:00
|
|
|
|
interp_reg(VARYING_SLOT_POS, 2));
|
2010-12-13 13:37:54 -08:00
|
|
|
|
}
|
2010-09-28 13:29:45 -07:00
|
|
|
|
wpos.reg_offset++;
|
|
|
|
|
|
|
|
|
|
|
|
/* gl_FragCoord.w: Already set up in emit_interpolation */
|
2011-03-13 00:23:40 -08:00
|
|
|
|
emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
|
2010-09-28 13:29:45 -07:00
|
|
|
|
|
2010-10-06 11:13:22 -07:00
|
|
|
|
return reg;
|
2010-09-28 13:29:45 -07:00
|
|
|
|
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid, bool is_sample)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (brw->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else if (is_sample) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(key->flat_shade);

   int location = ir->data.location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (prog_data->urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               emit_linterp(attr, fs_reg(interp), interpolation_mode,
                            ir->data.centroid && !key->persample_shading,
                            ir->data.sample || key->persample_shading);
               if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode,
                                               false, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
               if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (brw->gen >= 6) {
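      /* Bit 15 of g0.0 is zero for front-facing primitives, so the ASR
       * brings that bit down to bit 0, the NOT inverts it, and the AND
       * with 1 leaves a clean 0/1 "is front facing" value.
       */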
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

void
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
{
   assert(dst.type == BRW_REGISTER_TYPE_F);

   if (key->compute_pos_offset) {
      /* Convert int_sample_pos to floating point */
      emit(MOV(dst, int_sample_pos));
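      /* The payload gives the offsets in 1/16ths of a pixel, so the multiply
       * by 1/16 below converts them to the [0, 1] range the GL expects.
       */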
      /* Scale to the range [0, 1] */
      emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
   }
   else {
      /* From ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SamplePosition will always be
       *  (0.5, 0.5)."
       */
      emit(MOV(dst, fs_reg(0.5f)));
   }
}

fs_reg *
fs_visitor::emit_samplepos_setup(ir_variable *ir)
{
   assert(brw->gen >= 6);
   assert(ir->type == glsl_type::vec2_type);

   this->current_annotation = "compute sample position";
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg pos = *reg;
   fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
   fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);

   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
    * mode will be enabled.
    *
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
    * R31.1:0         Position Offset X/Y for Slot[3:0]
    * R31.3:2         Position Offset X/Y for Slot[7:4]
    * .....
    *
    * The X, Y sample positions come in as bytes in the thread payload. So,
    * read the positions using vstride=16, width=8, hstride=2.
    */
   struct brw_reg sample_pos_reg =
      stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
                    BRW_REGISTER_TYPE_B), 16, 8, 2);

   emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
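   /* In SIMD16, the X offsets for the second eight channels live 16 bytes
    * further into the payload, and the MOV filling the upper half of the
    * destination must be marked as executing on the second half of the
    * dispatch.
    */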
   if (dispatch_width == 16) {
      fs_inst *inst = emit(MOV(half(int_sample_x, 1),
                               fs_reg(suboffset(sample_pos_reg, 16))));
      inst->force_sechalf = true;
   }
   /* Compute gl_SamplePosition.x */
   compute_sample_position(pos, int_sample_x);
   pos.reg_offset++;
   emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
   if (dispatch_width == 16) {
      fs_inst *inst = emit(MOV(half(int_sample_y, 1),
                               fs_reg(suboffset(sample_pos_reg, 17))));
      inst->force_sechalf = true;
   }
   /* Compute gl_SamplePosition.y */
   compute_sample_position(pos, int_sample_y);
   return reg;
}

fs_reg *
fs_visitor::emit_sampleid_setup(ir_variable *ir)
{
   assert(brw->gen >= 6);

   this->current_annotation = "compute sample id";
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   if (key->compute_sample_id) {
      fs_reg t1 = fs_reg(this, glsl_type::int_type);
      fs_reg t2 = fs_reg(this, glsl_type::int_type);
      t2.type = BRW_REGISTER_TYPE_UW;

      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
       * 8x multisampling, subspan 0 will represent sample N (where N
       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
       * 7. We can find the value of N by looking at R0.0 bits 7:6
       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
       * (since samples are always delivered in pairs). That is, we
       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
       * populating a temporary variable with the sequence (0, 1, 2, 3),
       * and then reading from it using vstride=1, width=4, hstride=0.
       * These computations hold good for 4x multisampling as well.
       */
      emit(BRW_OPCODE_AND, t1,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
           fs_reg(0xc0));
      emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
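      /* Worked example: with SSPI bits 7:6 reading 0b10, t1 is now 4, and
       * adding the (0, 0, 0, 0, 1, 1, 1, 1) sequence below yields the
       * per-channel sample IDs 4, 4, 4, 4, 5, 5, 5, 5 for a SIMD8 dispatch.
       */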
      /* This works for both SIMD8 and SIMD16 */
      emit(MOV(t2, brw_imm_v(0x3210)));
      /* This special instruction takes care of setting vstride=1,
       * width=4, hstride=0 of t2 during an ADD instruction.
       */
      emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
   } else {
      /* As per GL_ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SampleID will always be zero."
       */
      emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (brw->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (brw->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (brw->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (brw->gen >= 7)
         no16("SIMD16 INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (brw->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;
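      /* For POW the operands keep their order; for INT DIV the references
       * swap src0/src1 so that the denominator ends up in Operand0, as the
       * PRM text above requires.
       */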

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   if (dispatch_width == 8) {
      prog_data->first_curbe_grf = payload.num_regs;
   } else {
      prog_data->first_curbe_grf_16 = payload.num_regs;
   }

   prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
            int constant_nr;
            if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = push_constant_loc[uniform_nr];
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                *  values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }
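            /* Each CURB register holds eight 32-bit components, so
             * constant_nr / 8 selects the GRF after the payload and
             * constant_nr % 8 the component within it.
             */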

            struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].subreg_offset);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      prog_data->urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (brw->gen >= 6) {
      if (_mesa_bitcount_64(fp->Base.InputsRead &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               prog_data->urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(brw, &prev_stage_vue_map,
                             key->input_slots_valid);
         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
             * unused.
             */
            if (varying != BRW_VARYING_SLOT_COUNT &&
                (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               prog_data->urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (key->input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               prog_data->urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   prog_data->num_varying_inputs = urb_next;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = payload.num_regs + prog_data->curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf =
      urb_start + prog_data->num_varying_inputs * 2;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 1 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         for (int i = 0; i < inst->sources; i++) {
            if (inst->src[i].file == GRF) {
               split_grf[inst->src[i].reg] = false;
            }
         }
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
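         /* reg_offset 0 keeps the original VGRF number; new_virtual_grf[i]
          * serves reg_offset 1, and the loop below allocates contiguous
          * VGRFs for offsets 2 and up (the assert checks that contiguity).
          */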
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   invalidate_live_intervals();
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
      return;

   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         invalidate_live_intervals();
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   if (dispatch_width != 8)
      return;

   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);

   for (unsigned int i = 0; i < uniforms; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &stage_prog_data->param[uniform];

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;

               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }
}

/**
 * Assign UNIFORM file registers to either push constants or pull constants.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::assign_constant_locations()
{
   /* Only the first compile (SIMD8 mode) gets to decide on locations. */
   if (dispatch_width != 8)
      return;

   /* Find which UNIFORM registers are still in use. */
   bool is_live[uniforms];
   for (unsigned int i = 0; i < uniforms; i++) {
      is_live[i] = false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
         if (constant_nr >= 0 && constant_nr < (int) uniforms)
            is_live[constant_nr] = true;
      }
   }

   /* Only allow 16 registers (128 uniform components) as push constants.
    *
    * Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int max_push_components = 16 * 8;
   unsigned int num_push_constants = 0;

   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);

   for (unsigned int i = 0; i < uniforms; i++) {
      if (!is_live[i] || pull_constant_loc[i] != -1) {
         /* This UNIFORM register is either dead, or has already been demoted
          * to a pull const.  Mark it as no longer living in the param[] array.
          */
         push_constant_loc[i] = -1;
         continue;
      }

      if (num_push_constants < max_push_components) {
         /* Retain as a push constant.  Record the location in the params[]
          * array.
          */
         push_constant_loc[i] = num_push_constants++;
      } else {
         /* Demote to a pull constant. */
         push_constant_loc[i] = -1;

         int pull_index = stage_prog_data->nr_pull_params++;
         stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
         pull_constant_loc[i] = pull_index;
      }
   }

   stage_prog_data->nr_params = num_push_constants;

   /* Up until now, the param[] array has been indexed by reg + reg_offset
    * of UNIFORM registers.  Condense it to only contain the uniforms we
    * chose to upload as push constants.
    */
   for (unsigned int i = 0; i < uniforms; i++) {
      int remapped = push_constant_loc[i];

      if (remapped == -1)
         continue;

      assert(remapped <= (int)i);
      stage_prog_data->param[remapped] = stage_prog_data->param[i];
   }
}

/**
 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
 */
void
fs_visitor::demote_pull_constants()
{
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
         fs_reg dst = fs_reg(this, glsl_type::float_type);

         /* Generate a pull load into dst. */
         if (inst->src[i].reladdr) {
            exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
                                                        surf_index,
                                                        *inst->src[i].reladdr,
                                                        pull_index);
            inst->insert_before(&list);
            inst->src[i].reladdr = NULL;
         } else {
            fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
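            /* The pull load fetches an aligned 16-byte vec4, so round the
             * byte offset down here and use set_smear() below to pick out
             * the component this uniform actually lives in.
             */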
            fs_inst *pull =
               new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                    dst, surf_index, offset);
            inst->insert_before(pull);
            inst->src[i].set_smear(pull_index & 3);
         }

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
      }
   }
   invalidate_live_intervals();
}
|
|
|
|
|
|
|
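/* Worked example of the non-reladdr path above (a sketch; the index value
 * is hypothetical): for pull_index == 6, the message offset is
 * (6 * 4) & ~15 == 16, i.e. the 16-byte-aligned vec4 slot holding the
 * constant, and set_smear(6 & 3) == set_smear(2) then picks component .z
 * out of the four dwords returned by the block read.
 */
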
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_OR:
         if (inst->src[0].equals(inst->src[1])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_LRP:
         if (inst->src[1].equals(inst->src[2])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            inst->src[2] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_SEL:
         if (inst->saturate && inst->src[1].file == IMM) {
            switch (inst->conditional_mod) {
            case BRW_CONDITIONAL_LE:
            case BRW_CONDITIONAL_L:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].imm.f >= 1.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
               break;
            case BRW_CONDITIONAL_GE:
            case BRW_CONDITIONAL_G:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].imm.f <= 0.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
            default:
               break;
            }
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

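/* An illustrative before/after for the MUL cases above (virtual register
 * numbers are made up):
 *
 *    mul vgrf3:F, vgrf1:F, 1.0f   ->   mov vgrf3:F, vgrf1:F
 *    mul vgrf3:F, vgrf1:F, 0.0f   ->   mov vgrf3:F, 0.0f
 *
 * Copy propagation and dead code elimination can then usually eliminate
 * the resulting MOV entirely.
 */
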
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate ||
          !inst->src[0].is_contiguous() ||
          inst->src[0].subreg_offset)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         mrf_high = mrf_low + 4;
      } else if (dispatch_width == 16 &&
                 (!inst->force_uncompressed && !inst->force_sechalf)) {
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_end[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      fs_inst *scan_inst;
      for (scan_inst = (fs_inst *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (fs_inst *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* If this one instruction didn't populate all the
             * channels, bail.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->is_partial_write())
               break;

            /* Things returning more than one register would need us to
             * understand coalescing out more than one MOV at a time.
             */
            if (scan_inst->regs_written > 1)
               break;

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (brw->gen == 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value. */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove();
               progress = true;
            }
            break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < scan_inst->sources; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (dispatch_width == 16 &&
                       (!scan_inst->force_uncompressed &&
                        !scan_inst->force_sechalf)) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

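/* Illustrative before/after for the pass above (register numbers are
 * hypothetical):
 *
 *    add vgrf8:F, vgrf6:F, vgrf7:F          add m4:F, vgrf6:F, vgrf7:F
 *    mov m4:F, vgrf8:F                ->    (MOV removed)
 *
 * i.e. the instruction that produced the GRF value is retargeted to write
 * the message register directly, provided nothing else reads vgrf8 later.
 */
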
/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->is_control_flow()) {
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->is_partial_write()) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

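/* Sketch of the pattern this removes (hypothetical registers): when the
 * same texcoord is sampled twice, the second copy of its MRF setup
 *
 *    mov m2:F, vgrf5:F
 *    send ...
 *    mov m2:F, vgrf5:F   <- dropped; m2 still holds vgrf5
 *    send ...
 *
 * is redundant, as long as neither m2 nor vgrf5 was written in between.
 */
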
static void
clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
                        int first_grf, int grf_len)
{
   bool inst_simd16 = (dispatch_width > 8 &&
                       !inst->force_uncompressed &&
                       !inst->force_sechalf);

   /* Clear the flag for registers that actually got read (as expected). */
   for (int i = 0; i < inst->sources; i++) {
      int grf;
      if (inst->src[i].file == GRF) {
         grf = inst->src[i].reg;
      } else if (inst->src[i].file == HW_REG &&
                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         grf = inst->src[i].fixed_hw_reg.nr;
      } else {
         continue;
      }

      if (grf >= first_grf &&
          grf < first_grf + grf_len) {
         deps[grf - first_grf] = false;
         if (inst_simd16)
            deps[grf - first_grf + 1] = false;
      }
   }
}

/**
 * Implements this workaround for the original 965:
 *
 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 * check for post destination dependencies on this instruction, software
 * must ensure that there is no destination hazard for the case of ‘write
 * followed by a posted write’ shown in the following example.
 *
 * 1. mov r3 0
 * 2. send r3.xy <rest of send instruction>
 * 3. mov r2 r3
 *
 * Due to no post-destination dependency check on the ‘send’, the above
 * code sequence could have two instructions (1 and 2) in flight at the
 * same time that both consider ‘r3’ as the target of their final writes.
 */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
{
   int reg_size = dispatch_width / 8;
   int write_len = inst->regs_written * reg_size;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   clear_deps_for_inst_src(inst, dispatch_width,
                           needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->prev;
        !scan_inst->is_head_sentinel();
        scan_inst = (fs_inst *)scan_inst->prev) {

      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         return;
      }

      bool scan_inst_simd16 = (dispatch_width > 8 &&
                               !scan_inst->force_uncompressed &&
                               !scan_inst->force_sechalf);

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == GRF) {
         for (int i = 0; i < scan_inst->regs_written; i++) {
            int reg = scan_inst->dst.reg + i * reg_size;

            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               inst->insert_before(DEP_RESOLVE_MOV(reg));
               needs_dep[reg - first_write_grf] = false;
               if (scan_inst_simd16)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

/**
 * Implements this workaround for the original 965:
 *
 * "[DevBW, DevCL] Errata: A destination register from a send can not be
 * used as a destination register until after it has been sourced by an
 * instruction with a different destination register.
 */
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
{
   int write_len = inst->regs_written * dispatch_width / 8;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->next;
        !scan_inst->is_tail_sentinel();
        scan_inst = (fs_inst *)scan_inst->next) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == GRF &&
          scan_inst->dst.reg >= first_write_grf &&
          scan_inst->dst.reg < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
         scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }

   /* If we hit the end of the program, resolve all remaining dependencies out
    * of paranoia.
    */
   fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
   assert(last_inst->eot);
   for (int i = 0; i < write_len; i++) {
      if (needs_dep[i])
         last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
   }
}

void
fs_visitor::insert_gen4_send_dependency_workarounds()
{
   if (brw->gen != 4 || brw->is_g4x)
      return;

   bool progress = false;

   /* Note that we're done with register allocation, so GRF fs_regs always
    * have a .reg_offset of 0.
    */

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->mlen != 0 && inst->dst.file == GRF) {
         insert_gen4_pre_send_dependency_workarounds(inst);
         insert_gen4_post_send_dependency_workarounds(inst);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();
}

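/* Sketch of the intent (register numbers hypothetical): if a SEND writes
 * g4-g7 and a later instruction wants to overwrite g5 before anything has
 * read it, the walks above insert a DEP_RESOLVE_MOV touching g5 between
 * the two, so the hardware sees a real dependency on the send's
 * destination before the register is reused.
 */
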
/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
void
fs_visitor::lower_uniform_pull_constant_loads()
{
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      if (brw->gen >= 7) {
         /* The offset arg before was a vec4-aligned byte offset.  We need to
          * turn it into a dword offset.
          */
         fs_reg const_offset_reg = inst->src[1];
         assert(const_offset_reg.file == IMM &&
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
         const_offset_reg.imm.u /= 4;
         fs_reg payload = fs_reg(this, glsl_type::uint_type);

         /* This is actually going to be a MOV, but since only the first dword
          * is accessed, we have a special opcode to do just that one.  Note
          * that this needs to be an operation that will be considered a def
          * by live variable analysis, or register allocation will explode.
          */
         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
                                               payload, const_offset_reg);
         setup->force_writemask_all = true;

         setup->ir = inst->ir;
         setup->annotation = inst->annotation;
         inst->insert_before(setup);

         /* Similarly, this will only populate the first 4 channels of the
          * result register (since we only use smear values from 0-3), but we
          * don't tell the optimizer.
          */
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;

         invalidate_live_intervals();
      } else {
         /* Before register allocation, we didn't tell the scheduler about the
          * MRF we use.  We know it's safe to use this MRF because nothing
          * else does except for register spill/unspill, which generates and
          * uses its MRF within a single IR instruction.
          */
         inst->base_mrf = 14;
         inst->mlen = 1;
      }
   }
}

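/* Worked example of the gen7 offset conversion above (the constant's
 * location is hypothetical): a constant living at vec4-aligned byte
 * offset 48 arrives as imm.u == 48 and is rewritten to the dword offset
 * 48 / 4 == 12 that the gen7 pull constant load message expects.
 */
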
void
fs_visitor::dump_instructions()
{
   dump_instructions(NULL);
}

void
fs_visitor::dump_instructions(const char *name)
{
   calculate_register_pressure();
   FILE *file = stderr;
   if (name && geteuid() != 0) {
      file = fopen(name, "w");
      if (!file)
         file = stderr;
   }

   int ip = 0, max_pressure = 0;
   foreach_list(node, &this->instructions) {
      backend_instruction *inst = (backend_instruction *)node;
      max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
      fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
      dump_instruction(inst, file);
      ++ip;
   }
   fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);

   if (file != stderr) {
      fclose(file);
   }
}

void
fs_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}

void
fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   fs_inst *inst = (fs_inst *)be_inst;

   if (inst->predicate) {
      fprintf(file, "(%cf0.%d) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
          (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                            inst->opcode != BRW_OPCODE_IF &&
                            inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, " ");

   switch (inst->dst.file) {
   case GRF:
      fprintf(file, "vgrf%d", inst->dst.reg);
      if (virtual_grf_sizes[inst->dst.reg] != 1 ||
          inst->dst.subreg_offset)
         fprintf(file, "+%d.%d",
                 inst->dst.reg_offset, inst->dst.subreg_offset);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case UNIFORM:
      fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
      break;
   case HW_REG:
      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         switch (inst->dst.fixed_hw_reg.nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   default:
      fprintf(file, "???");
      break;
   }
   fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));

   for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(file, "vgrf%d", inst->src[i].reg);
         if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
             inst->src[i].subreg_offset)
            fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         break;
      case MRF:
         fprintf(file, "***m%d***", inst->src[i].reg);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
         if (inst->src[i].reladdr) {
            fprintf(file, "+reladdr");
         } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
                    inst->src[i].subreg_offset) {
            fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         }
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%ff", inst->src[i].imm.f);
            break;
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dd", inst->src[i].imm.i);
            break;
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uu", inst->src[i].imm.u);
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            fprintf(file, "-");
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               fprintf(file, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            default:
               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            }
         } else {
            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
         }
         if (inst->src[i].fixed_hw_reg.subnr)
            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         break;
      default:
         fprintf(file, "???");
         break;
      }
      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, " ");

   if (inst->force_uncompressed)
      fprintf(file, "1sthalf ");

   if (inst->force_sechalf)
      fprintf(file, "2ndhalf ");

   fprintf(file, "\n");
}

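/* Example of what one dumped line looks like, derived from the fprintf
 * calls above (instruction and register numbers are made up):
 *
 *    { 12}   34: (+f0.0) add.sat vgrf7:F, vgrf3:F, -u2:F
 *
 * i.e. 12 registers live at ip 34, a predicated saturating ADD writing
 * vgrf7 from vgrf3 and negated uniform u2, all of type float.
 */
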
/**
 * Possibly returns an instruction that set up @param reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the instruction generated
 * only reg -- it might be the size=4 destination of a texture instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           const fs_reg &reg)
{
   if (end == start ||
       end->is_partial_write() ||
       reg.reladdr ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}

void
fs_visitor::setup_payload_gen6()
{
   bool uses_depth =
      (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
   unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;

   assert(brw->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   payload.num_regs = 2;
   /* R2: only for 32-pixel dispatch. */

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         payload.barycentric_coord_reg[i] = payload.num_regs;
         payload.num_regs += 2;
         if (dispatch_width == 16) {
            payload.num_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      payload.source_depth_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not SIMD8. */
         payload.num_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      payload.source_w_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not SIMD8. */
         payload.num_regs++;
      }
   }

   prog_data->uses_pos_offset = key->compute_pos_offset;
   /* R31: MSAA position offsets. */
   if (prog_data->uses_pos_offset) {
      payload.sample_pos_reg = payload.num_regs;
      payload.num_regs++;
   }

   /* R32: MSAA input coverage mask */
   if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
      assert(brw->gen >= 7);
      payload.sample_mask_in_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R33: input coverage mask if not SIMD8. */
         payload.num_regs++;
      }
   }

   /* R34-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

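/* Worked example of the payload bookkeeping above, for a SIMD16 shader
 * using one barycentric mode plus source depth (a sketch; the exact mix
 * depends on the shader): R0-1 are fixed (num_regs = 2), 4 registers of
 * barycentrics (num_regs = 6), 2 of interpolated depth (num_regs = 8) and
 * 2 of interpolated W (num_regs = 10), so the first GRF available to the
 * shader's own values is g10.
 */
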
void
fs_visitor::assign_binding_table_offsets()
{
   uint32_t next_binding_table_offset = 0;

   /* If there are no color regions, we still perform an FB write to a null
    * renderbuffer, which we place at surface index 0.
    */
   prog_data->binding_table.render_target_start = next_binding_table_offset;
   next_binding_table_offset += MAX2(key->nr_color_regions, 1);

   assign_common_binding_table_offsets(next_binding_table_offset);
}

void
fs_visitor::calculate_register_pressure()
{
   invalidate_live_intervals();
   calculate_live_intervals();

   int num_instructions = 0;
   foreach_list(node, &this->instructions) {
      ++num_instructions;
   }

   regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);

   for (int reg = 0; reg < virtual_grf_count; reg++) {
      for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
         regs_live_at_ip[ip] += virtual_grf_sizes[reg];
   }
}

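/* Tiny worked example of the accumulation above (hypothetical intervals):
 * if vgrf0 (size 2) is live over ips 0-3 and vgrf1 (size 1) over ips 2-5,
 * regs_live_at_ip ends up as {2, 2, 3, 3, 1, 1} -- pressure peaks at 3
 * registers where the two live ranges overlap.
 */
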
/**
 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
 *
 * The needs_unlit_centroid_workaround ends up producing one of these per
 * channel of centroid input, so it's good to clean them up.
 *
 * An assumption here is that nothing ever modifies the dispatched pixels
 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
 * dictates that anyway.
 */
void
fs_visitor::opt_drop_redundant_mov_to_flags()
{
   bool flag_mov_found[2] = {false};

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->is_control_flow()) {
         memset(flag_mov_found, 0, sizeof(flag_mov_found));
      } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
         if (!flag_mov_found[inst->flag_subreg])
            flag_mov_found[inst->flag_subreg] = true;
         else
            inst->remove();
      } else if (inst->writes_flag()) {
         flag_mov_found[inst->flag_subreg] = false;
      }
   }
}

2011-03-11 19:19:01 -08:00
|
|
|
|
bool
|
|
|
|
|
|
fs_visitor::run()
|
2010-08-26 12:12:00 -07:00
|
|
|
|
{
|
2012-11-21 13:11:32 -08:00
|
|
|
|
sanity_param_count = fp->Base.Parameters->NumParameters;
|
2013-11-19 13:07:12 -08:00
|
|
|
|
bool allocated_without_spills;
|
2010-08-26 12:12:00 -07:00
|
|
|
|
|
2013-10-02 14:07:40 -07:00
|
|
|
|
assign_binding_table_offsets();
|
|
|
|
|
|
|
2013-07-06 00:36:46 -07:00
|
|
|
|
if (brw->gen >= 6)
|
2012-11-13 19:36:18 -08:00
|
|
|
|
setup_payload_gen6();
|
|
|
|
|
|
else
|
2012-11-19 14:59:14 -08:00
|
|
|
|
setup_payload_gen4();
|
2010-08-26 12:12:00 -07:00
|
|
|
|
|
2010-08-15 18:58:58 -07:00
|
|
|
|
if (0) {
|
2011-03-11 19:19:01 -08:00
|
|
|
|
emit_dummy_fs();
|
2010-08-15 18:58:58 -07:00
|
|
|
|
} else {
|
2012-11-27 14:10:52 -08:00
|
|
|
|
if (INTEL_DEBUG & DEBUG_SHADER_TIME)
|
|
|
|
|
|
emit_shader_time_begin();
|
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
calculate_urb_setup();
|
2013-10-19 21:27:37 -07:00
|
|
|
|
if (fp->Base.InputsRead > 0) {
|
|
|
|
|
|
if (brw->gen < 6)
|
|
|
|
|
|
emit_interpolation_setup_gen4();
|
|
|
|
|
|
else
|
|
|
|
|
|
emit_interpolation_setup_gen6();
|
|
|
|
|
|
}
|
2010-08-16 21:53:02 -07:00
|
|
|
|
|
2012-12-06 12:15:13 -08:00
|
|
|
|
/* We handle discards by keeping track of the still-live pixels in f0.1.
|
|
|
|
|
|
* Initialize it with the dispatched pixels.
|
|
|
|
|
|
*/
|
2014-05-13 21:06:00 -07:00
|
|
|
|
if (fp->UsesKill || key->alpha_test_func) {
|
2012-12-06 12:15:13 -08:00
|
|
|
|
fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
|
|
|
|
|
|
discard_init->flag_subreg = 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2010-08-15 18:58:58 -07:00
|
|
|
|
/* Generate FS IR for main(). (the visitor only descends into
|
|
|
|
|
|
* functions called "main").
|
|
|
|
|
|
*/
|
2012-08-27 14:35:01 -07:00
|
|
|
|
if (shader) {
|
2013-11-28 00:48:57 -08:00
|
|
|
|
foreach_list(node, &*shader->base.ir) {
|
2012-08-27 14:35:01 -07:00
|
|
|
|
ir_instruction *ir = (ir_instruction *)node;
|
|
|
|
|
|
base_ir = ir;
|
|
|
|
|
|
this->result = reg_undef;
|
|
|
|
|
|
ir->accept(this);
|
|
|
|
|
|
}
|
|
|
|
|
|
} else {
|
|
|
|
|
|
emit_fragment_program_code();
|
2010-08-26 14:42:06 -07:00
|
|
|
|
}
|
2012-11-29 16:51:13 -08:00
|
|
|
|
base_ir = NULL;
|
2011-06-10 16:00:03 -07:00
|
|
|
|
if (failed)
|
|
|
|
|
|
return false;
|
2010-08-15 18:58:58 -07:00
|
|
|
|
|
2013-03-27 23:19:39 -07:00
|
|
|
|
emit(FS_OPCODE_PLACEHOLDER_HALT);
|
|
|
|
|
|
|
2014-05-13 21:06:00 -07:00
|
|
|
|
if (key->alpha_test_func)
|
2013-10-27 12:32:03 +13:00
|
|
|
|
emit_alpha_test();
|
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
emit_fb_writes();
|
2010-10-13 20:17:15 -07:00
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
split_virtual_grfs();
|
2010-10-13 20:17:15 -07:00
|
|
|
|
|
2012-11-08 16:06:24 -08:00
|
|
|
|
move_uniform_array_access_to_pull_constants();
|
2014-03-11 14:35:27 -07:00
|
|
|
|
assign_constant_locations();
|
2014-03-11 22:24:39 -07:00
|
|
|
|
demote_pull_constants();
|
2010-10-03 15:15:18 -07:00
|
|
|
|
|
2014-02-14 15:29:01 -08:00
|
|
|
|
opt_drop_redundant_mov_to_flags();
|
|
|
|
|
|
|
2014-04-07 10:25:50 -07:00
|
|
|
|
#define OPT(pass, args...) do { \
|
|
|
|
|
|
pass_num++; \
|
|
|
|
|
|
bool this_progress = pass(args); \
|
|
|
|
|
|
\
|
|
|
|
|
|
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
|
|
|
|
|
|
char filename[64]; \
|
|
|
|
|
|
snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
|
|
|
|
|
|
dispatch_width, shader_prog->Name, iteration, pass_num); \
|
|
|
|
|
|
\
|
|
|
|
|
|
backend_visitor::dump_instructions(filename); \
|
|
|
|
|
|
} \
|
|
|
|
|
|
\
|
|
|
|
|
|
progress = progress || this_progress; \
|
|
|
|
|
|
} while (false)
|
|
|
|
|
|
|
|
|
|
|
|
if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
|
|
|
|
|
|
char filename[64];
|
|
|
|
|
|
snprintf(filename, 64, "fs%d-%04d-00-start",
|
|
|
|
|
|
dispatch_width, shader_prog->Name);
|
|
|
|
|
|
|
|
|
|
|
|
backend_visitor::dump_instructions(filename);
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2010-10-03 15:15:18 -07:00
|
|
|
|
bool progress;
|
2014-04-07 10:25:50 -07:00
|
|
|
|
int iteration = 0;
|
2010-10-03 15:15:18 -07:00
|
|
|
|
do {
|
|
|
|
|
|
progress = false;
|
2014-04-07 10:25:50 -07:00
|
|
|
|
iteration++;
|
|
|
|
|
|
int pass_num = 0;
|
2010-11-19 15:57:05 +08:00
|
|
|
|
|
2012-11-01 22:04:50 -07:00
|
|
|
|
compact_virtual_grfs();
|
|
|
|
|
|
|
2014-04-07 10:25:50 -07:00
|
|
|
|
OPT(remove_duplicate_mrf_writes);
|
|
|
|
|
|
|
|
|
|
|
|
OPT(opt_algebraic);
|
|
|
|
|
|
OPT(opt_cse);
|
|
|
|
|
|
OPT(opt_copy_propagate);
|
|
|
|
|
|
OPT(opt_peephole_predicated_break);
|
|
|
|
|
|
OPT(dead_code_eliminate);
|
|
|
|
|
|
OPT(opt_peephole_sel);
|
|
|
|
|
|
OPT(dead_control_flow_eliminate, this);
|
|
|
|
|
|
OPT(opt_saturate_propagation);
|
|
|
|
|
|
OPT(register_coalesce);
|
|
|
|
|
|
OPT(compute_to_mrf);
|
2010-10-03 15:15:18 -07:00
|
|
|
|
} while (progress);
|
|
|
|
|
|
|
2013-02-15 19:26:48 -08:00
|
|
|
|
lower_uniform_pull_constant_loads();
|
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
assign_curb_setup();
|
|
|
|
|
|
assign_urb_setup();
|
2011-01-18 22:03:34 -08:00
|
|
|
|
|
2013-11-19 13:07:12 -08:00
|
|
|
|
static enum instruction_scheduler_mode pre_modes[] = {
|
|
|
|
|
|
SCHEDULE_PRE,
|
|
|
|
|
|
SCHEDULE_PRE_NON_LIFO,
|
|
|
|
|
|
SCHEDULE_PRE_LIFO,
|
|
|
|
|
|
};
|
2013-11-06 17:43:25 -08:00
|
|
|
|
|
2013-11-19 13:07:12 -08:00
|
|
|
|
/* Try each scheduling heuristic to see if it can successfully register
|
|
|
|
|
|
* allocate without spilling. They should be ordered by decreasing
|
|
|
|
|
|
* performance but increasing likelihood of allocating.
|
|
|
|
|
|
*/
|
|
|
|
|
|
for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
|
|
|
|
|
|
schedule_instructions(pre_modes[i]);
|
|
|
|
|
|
|
|
|
|
|
|
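/* Debug switch: flip to 1 to force trivial (sequential) register
 * allocation instead of the graph-coloring allocator.
 */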
if (0) {
|
|
|
|
|
|
assign_regs_trivial();
|
|
|
|
|
|
allocated_without_spills = true;
|
|
|
|
|
|
} else {
|
|
|
|
|
|
allocated_without_spills = assign_regs(false);
|
|
|
|
|
|
}
|
|
|
|
|
|
if (allocated_without_spills)
|
|
|
|
|
|
break;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
if (!allocated_without_spills) {
|
|
|
|
|
|
/* We assume that any spilling is worse than just dropping back to
|
|
|
|
|
|
* SIMD8. In practice there is probably some intermediate point where
|
|
|
|
|
|
* SIMD16 with a few spills would still be faster.
|
|
|
|
|
|
*/
|
|
|
|
|
|
if (dispatch_width == 16) {
|
|
|
|
|
|
fail("Failure to register allocate. Reduce number of "
|
|
|
|
|
|
"live scalar values to avoid this.");
|
2014-05-13 20:41:27 -07:00
|
|
|
|
} else {
|
|
|
|
|
|
perf_debug("Fragment shader triggered register spilling. "
|
|
|
|
|
|
"Try reducing the number of live scalar values to "
|
|
|
|
|
|
"improve performance.\n");
|
2013-11-19 13:07:12 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/* Since we're out of heuristics, just go spill registers until we
|
|
|
|
|
|
* get an allocation.
|
|
|
|
|
|
*/
|
|
|
|
|
|
while (!assign_regs(true)) {
|
|
|
|
|
|
if (failed)
|
|
|
|
|
|
break;
|
2013-11-06 17:38:23 -08:00
|
|
|
|
}
|
2010-10-19 09:25:51 -07:00
|
|
|
|
}
|
2010-08-15 18:58:58 -07:00
|
|
|
|
}
|
2011-03-11 19:19:01 -08:00
|
|
|
|
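/* Every push of forced-uncompressed (SIMD8) execution must have been
 * matched by a pop by the time code generation finishes.
 */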
assert(force_uncompressed_stack == 0);
|
2010-08-26 12:12:00 -07:00
|
|
|
|
|
2013-02-05 15:46:22 -08:00
|
|
|
|
/* This must come after all optimization and register allocation, since
|
|
|
|
|
|
* it inserts dead code that happens to have side effects, and it does
|
|
|
|
|
|
* so based on the actual physical registers in use.
|
|
|
|
|
|
*/
|
|
|
|
|
|
insert_gen4_send_dependency_workarounds();
|
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
if (failed)
|
2011-03-14 10:29:12 -07:00
|
|
|
|
return false;
|
|
|
|
|
|
|
2013-11-19 13:07:12 -08:00
|
|
|
|
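/* If we had to spill, run a post-register-allocation scheduling pass
 * over the final code to win back some of the lost latency hiding.
 */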
if (!allocated_without_spills)
|
|
|
|
|
|
schedule_instructions(SCHEDULE_POST);
|
2012-12-03 17:58:03 -08:00
|
|
|
|
|
2014-05-13 21:00:35 -07:00
|
|
|
|
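/* Convert the high-water mark of scratch usage into the per-thread
 * scratch allocation, rounded up to a size the hardware supports.
 */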
if (last_scratch > 0) {
|
2014-05-14 00:17:03 -07:00
|
|
|
|
prog_data->total_scratch = brw_get_scratch_size(last_scratch);
|
2014-05-13 20:51:32 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-02-19 15:27:01 +01:00
|
|
|
|
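/* Record the GRF footprint of the SIMD8 and SIMD16 programs in the
 * register-block units that pixel shader state setup reports to the
 * hardware.
 */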
if (dispatch_width == 8)
|
2014-05-14 00:17:03 -07:00
|
|
|
|
prog_data->reg_blocks = brw_register_blocks(grf_used);
|
2014-02-19 15:27:01 +01:00
|
|
|
|
else
|
2014-05-14 00:17:03 -07:00
|
|
|
|
prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
|
2011-03-23 12:50:53 -07:00
|
|
|
|
|
2012-11-21 13:11:32 -08:00
|
|
|
|
/* If any state parameters were appended, then ParameterValues could have
|
|
|
|
|
|
* been realloced, in which case the driver uniform storage set up by
|
|
|
|
|
|
* _mesa_associate_uniform_storage() would point to freed memory. Make
|
|
|
|
|
|
* sure that didn't happen.
|
|
|
|
|
|
*/
|
|
|
|
|
|
assert(sanity_param_count == fp->Base.Parameters->NumParameters);
|
|
|
|
|
|
|
2011-03-11 19:19:01 -08:00
|
|
|
|
return !failed;
|
|
|
|
|
|
}
|
2010-08-26 12:12:00 -07:00
|
|
|
|
|
2012-11-20 16:21:27 -08:00
|
|
|
|
const unsigned *
|
2014-05-14 01:21:02 -07:00
|
|
|
|
brw_wm_fs_emit(struct brw_context *brw,
|
|
|
|
|
|
void *mem_ctx,
|
2014-05-14 00:41:41 -07:00
|
|
|
|
const struct brw_wm_prog_key *key,
|
|
|
|
|
|
struct brw_wm_prog_data *prog_data,
|
2012-11-20 14:41:21 -08:00
|
|
|
|
struct gl_fragment_program *fp,
|
2012-11-20 16:21:27 -08:00
|
|
|
|
struct gl_shader_program *prog,
|
|
|
|
|
|
unsigned *final_assembly_size)
|
2011-03-11 19:19:01 -08:00
|
|
|
|
{
|
2012-08-07 10:05:38 -07:00
|
|
|
|
bool start_busy = false;
|
2014-02-20 14:54:29 -08:00
|
|
|
|
double start_time = 0;
|
2011-03-11 19:19:01 -08:00
|
|
|
|
|
2013-07-03 14:41:58 -07:00
|
|
|
|
if (unlikely(brw->perf_debug)) {
|
2013-07-03 14:21:19 -07:00
|
|
|
|
start_busy = (brw->batch.last_bo &&
|
|
|
|
|
|
drm_intel_bo_busy(brw->batch.last_bo));
|
2012-08-07 10:05:38 -07:00
|
|
|
|
start_time = get_time();
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2012-08-27 14:35:01 -07:00
|
|
|
|
struct brw_shader *shader = NULL;
|
|
|
|
|
|
if (prog)
|
|
|
|
|
|
shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
|
2011-03-11 19:19:01 -08:00
|
|
|
|
|
2014-02-20 18:23:52 -08:00
|
|
|
|
if (unlikely(INTEL_DEBUG & DEBUG_WM))
|
|
|
|
|
|
brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
|
2011-03-11 19:19:01 -08:00
|
|
|
|
|
|
|
|
|
|
/* Now the main event: Visit the shader IR and generate our FS IR for it.
|
|
|
|
|
|
*/
|
2014-05-14 00:41:41 -07:00
|
|
|
|
fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
|
2011-03-11 19:19:01 -08:00
|
|
|
|
if (!v.run()) {
|
2013-04-11 09:55:42 -07:00
|
|
|
|
if (prog) {
|
|
|
|
|
|
prog->LinkStatus = false;
|
|
|
|
|
|
ralloc_strcat(&prog->InfoLog, v.fail_msg);
|
|
|
|
|
|
}
|
2011-05-16 15:10:26 -07:00
|
|
|
|
|
2012-02-09 10:23:45 -08:00
|
|
|
|
_mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
|
2013-04-11 09:55:42 -07:00
|
|
|
|
v.fail_msg);
|
2012-02-09 10:23:45 -08:00
|
|
|
|
|
2012-11-20 16:21:27 -08:00
|
|
|
|
return NULL;
|
2011-03-11 19:19:01 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2012-11-09 01:05:47 -08:00
|
|
|
|
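/* On gen5+, also try a SIMD16 compile. The SIMD8 visitor flags cases
 * that can never work in SIMD16 (simd16_unsupported), letting us skip
 * a compile that is certain to fail; when the compile does succeed,
 * its instruction list is handed to the generator alongside the SIMD8
 * program.
 */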
exec_list *simd16_instructions = NULL;
|
2014-05-14 00:41:41 -07:00
|
|
|
|
fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
|
2013-08-03 17:31:53 -07:00
|
|
|
|
if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
|
2014-03-07 00:49:45 -08:00
|
|
|
|
if (!v.simd16_unsupported) {
|
|
|
|
|
|
/* Try a SIMD16 compile */
|
|
|
|
|
|
v2.import_uniforms(&v);
|
|
|
|
|
|
if (!v2.run()) {
|
|
|
|
|
|
perf_debug("SIMD16 shader failed to compile, falling back to "
|
|
|
|
|
|
"SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
simd16_instructions = &v2.instructions;
|
|
|
|
|
|
}
|
2012-11-09 01:05:47 -08:00
|
|
|
|
} else {
|
2014-03-07 00:49:45 -08:00
|
|
|
|
perf_debug("SIMD16 shader unsupported, falling back to "
|
|
|
|
|
|
"SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
|
2012-07-12 12:48:58 -07:00
|
|
|
|
}
|
2011-03-11 19:19:01 -08:00
|
|
|
|
}
|
|
|
|
|
|
|
2012-12-06 22:38:26 -08:00
|
|
|
|
const unsigned *assembly = NULL;
|
|
|
|
|
|
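/* Broadwell has its own generator; either path turns the visitor's
 * instruction list into native code.
 */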
if (brw->gen >= 8) {
|
2014-05-14 00:41:41 -07:00
|
|
|
|
gen8_fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src);
|
2012-12-06 22:38:26 -08:00
|
|
|
|
assembly = g.generate_assembly(&v.instructions, simd16_instructions,
|
|
|
|
|
|
final_assembly_size);
|
|
|
|
|
|
} else {
|
2014-05-14 15:05:09 -07:00
|
|
|
|
fs_generator g(brw, mem_ctx, key, prog_data, prog, fp, v.do_dual_src,
|
2014-06-05 15:03:06 +02:00
|
|
|
|
v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
|
2012-12-06 22:38:26 -08:00
|
|
|
|
assembly = g.generate_assembly(&v.instructions, simd16_instructions,
|
|
|
|
|
|
final_assembly_size);
|
|
|
|
|
|
}
|
2012-11-30 12:55:50 -08:00
|
|
|
|
|
2013-07-03 14:41:58 -07:00
|
|
|
|
if (unlikely(brw->perf_debug) && shader) {
|
2012-07-12 13:19:53 -07:00
|
|
|
|
if (shader->compiled_once)
|
2014-05-14 00:41:41 -07:00
|
|
|
|
brw_wm_debug_recompile(brw, prog, key);
|
2012-07-12 13:19:53 -07:00
|
|
|
|
shader->compiled_once = true;
|
2012-08-07 10:05:38 -07:00
|
|
|
|
|
2013-07-03 14:21:19 -07:00
|
|
|
|
if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
|
2012-08-07 10:05:38 -07:00
|
|
|
|
perf_debug("FS compile took %.03f ms and stalled the GPU\n",
|
2012-08-13 17:49:06 -07:00
|
|
|
|
(get_time() - start_time) * 1000);
|
2012-08-07 10:05:38 -07:00
|
|
|
|
}
|
2012-07-12 13:19:53 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2012-12-06 22:38:26 -08:00
|
|
|
|
return assembly;
|
2010-08-26 12:12:00 -07:00
|
|
|
|
}
|
2011-05-16 15:10:26 -07:00
|
|
|
|
|
|
|
|
|
|
bool
|
|
|
|
|
|
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
|
|
|
|
|
|
{
|
|
|
|
|
|
struct brw_context *brw = brw_context(ctx);
|
|
|
|
|
|
struct brw_wm_prog_key key;
|
|
|
|
|
|
|
2011-08-20 15:00:36 -07:00
|
|
|
|
if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
|
2011-05-16 15:10:26 -07:00
|
|
|
|
return true;
|
|
|
|
|
|
|
2011-08-20 15:00:36 -07:00
|
|
|
|
struct gl_fragment_program *fp = (struct gl_fragment_program *)
|
|
|
|
|
|
prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
|
|
|
|
|
|
struct brw_fragment_program *bfp = brw_fragment_program(fp);
|
2012-06-20 13:40:45 -07:00
|
|
|
|
bool program_uses_dfdy = fp->UsesDFdy;
|
2011-08-20 15:00:36 -07:00
|
|
|
|
|
2011-05-16 15:10:26 -07:00
|
|
|
|
memset(&key, 0, sizeof(key));
|
|
|
|
|
|
|
2013-07-06 00:36:46 -07:00
|
|
|
|
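/* Pre-gen6 hardware bakes depth/stencil state into the WM program via
 * the iz_lookup bits; guess the most common configuration.
 */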
if (brw->gen < 6) {
|
2012-08-13 23:59:51 -07:00
|
|
|
|
if (fp->UsesKill)
|
|
|
|
|
|
key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
|
2011-05-16 15:10:26 -07:00
|
|
|
|
|
2012-08-13 23:59:51 -07:00
|
|
|
|
if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
|
|
|
|
|
|
key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
|
2011-05-16 15:10:26 -07:00
|
|
|
|
|
2012-08-13 23:59:51 -07:00
|
|
|
|
/* Just assume depth testing. */
|
|
|
|
|
|
key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
|
|
|
|
|
|
key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
|
|
|
|
|
|
}
|
2011-05-16 15:10:26 -07:00
|
|
|
|
|
2013-09-03 12:15:53 -07:00
|
|
|
|
if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
|
|
|
|
|
|
BRW_FS_VARYING_INPUT_MASK) > 16)
|
2013-09-03 11:55:17 -07:00
|
|
|
|
key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
|
2011-05-16 15:10:26 -07:00
|
|
|
|
|
2013-08-14 18:55:15 -07:00
|
|
|
|
unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
|
|
|
|
|
|
for (unsigned i = 0; i < sampler_count; i++) {
|
2012-08-26 00:34:03 -07:00
|
|
|
|
if (fp->Base.ShadowSamplers & (1 << i)) {
|
|
|
|
|
|
/* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
|
|
|
|
|
|
key.tex.swizzles[i] =
|
|
|
|
|
|
MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
|
|
|
|
|
|
} else {
|
|
|
|
|
|
/* Color sampler: assume no swizzling. */
|
|
|
|
|
|
key.tex.swizzles[i] = SWIZZLE_XYZW;
|
|
|
|
|
|
}
|
2011-05-16 15:10:26 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2013-02-23 09:00:58 -08:00
|
|
|
|
if (fp->Base.InputsRead & VARYING_BIT_POS) {
|
2011-05-16 15:10:26 -07:00
|
|
|
|
key.drawable_height = ctx->DrawBuffer->Height;
|
2012-06-20 13:40:45 -07:00
|
|
|
|
}
|
|
|
|
|
|
|
2014-01-26 11:03:33 +13:00
|
|
|
|
key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
|
|
|
|
|
|
~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
|
|
|
|
|
|
BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
|
2011-05-16 15:10:26 -07:00
|
|
|
|
|
2014-01-26 11:04:42 +13:00
|
|
|
|
if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
|
|
|
|
|
|
key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
|
|
|
|
|
|
key.nr_color_regions > 1;
|
|
|
|
|
|
}
|
|
|
|
|
|
|
2013-09-12 13:00:52 +08:00
|
|
|
|
/* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
|
|
|
|
|
|
* quality of the derivatives is likely to be determined by the driconf
|
|
|
|
|
|
* option.
|
|
|
|
|
|
*/
|
|
|
|
|
|
key.high_quality_derivatives = brw->disable_derivative_optimization;
|
|
|
|
|
|
|
2011-05-16 15:10:26 -07:00
|
|
|
|
key.program_string_id = bfp->id;
|
|
|
|
|
|
|
2013-09-01 17:31:54 -07:00
|
|
|
|
uint32_t old_prog_offset = brw->wm.base.prog_offset;
|
2011-05-16 15:10:26 -07:00
|
|
|
|
struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
|
|
|
|
|
|
|
|
|
|
|
|
bool success = do_wm_prog(brw, prog, bfp, &key);
|
|
|
|
|
|
|
2013-09-01 17:31:54 -07:00
|
|
|
|
brw->wm.base.prog_offset = old_prog_offset;
|
2011-05-16 15:10:26 -07:00
|
|
|
|
brw->wm.prog_data = old_prog_data;
|
|
|
|
|
|
|
|
|
|
|
|
return success;
|
|
|
|
|
|
}
|