mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 08:50:13 +01:00
intel/brw: Remove vec4 backend
It still exists as part of ELK for older gfx versions. Reviewed-by: Kenneth Graunke <kenneth@whitecape.org> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27691>
This commit is contained in:
parent
7c23b90537
commit
a641aa294e
39 changed files with 0 additions and 17138 deletions
|
|
@ -87,8 +87,6 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
|
|||
brw_init_isa_info(&compiler->isa, devinfo);
|
||||
|
||||
brw_fs_alloc_reg_sets(compiler);
|
||||
if (devinfo->ver < 8)
|
||||
brw_vec4_alloc_reg_set(compiler);
|
||||
|
||||
compiler->precise_trig = debug_get_bool_option("INTEL_PRECISE_TRIG", false);
|
||||
|
||||
|
|
|
|||
|
|
@ -57,16 +57,6 @@ struct brw_compiler {
|
|||
|
||||
struct brw_isa_info isa;
|
||||
|
||||
struct {
|
||||
struct ra_regs *regs;
|
||||
|
||||
/**
|
||||
* Array of the ra classes for the unaligned contiguous register
|
||||
* block sizes used.
|
||||
*/
|
||||
struct ra_class **classes;
|
||||
} vec4_reg_set;
|
||||
|
||||
struct {
|
||||
struct ra_regs *regs;
|
||||
|
||||
|
|
|
|||
|
|
@ -33,7 +33,6 @@
|
|||
#include "brw_fs_builder.h"
|
||||
#include "brw_fs_live_variables.h"
|
||||
#include "brw_nir.h"
|
||||
#include "brw_vec4_gs_visitor.h"
|
||||
#include "brw_cfg.h"
|
||||
#include "brw_dead_control_flow.h"
|
||||
#include "brw_private.h"
|
||||
|
|
|
|||
|
|
@ -23,7 +23,6 @@
|
|||
|
||||
#include "brw_eu.h"
|
||||
#include "brw_fs.h"
|
||||
#include "brw_vec4.h"
|
||||
#include "brw_cfg.h"
|
||||
|
||||
using namespace brw;
|
||||
|
|
@ -152,29 +151,6 @@ namespace {
|
|||
rcount = inst->opcode == BRW_OPCODE_DPAS ? inst->rcount : 0;
|
||||
}
|
||||
|
||||
instruction_info(const struct brw_isa_info *isa,
|
||||
const vec4_instruction *inst) :
|
||||
isa(isa), devinfo(isa->devinfo), op(inst->opcode),
|
||||
td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
|
||||
tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
|
||||
desc(inst->desc), sfid(inst->sfid), rcount(0)
|
||||
{
|
||||
/* Compute the maximum source size. */
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
|
||||
ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
|
||||
|
||||
/* Convert the execution size to GRF units. */
|
||||
sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
|
||||
|
||||
/* 32x32 integer multiplication has half the usual ALU throughput.
|
||||
* Treat it as double-precision.
|
||||
*/
|
||||
if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
|
||||
!brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
|
||||
type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
|
||||
tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
|
||||
}
|
||||
|
||||
/** ISA encoding information */
|
||||
const struct brw_isa_info *isa;
|
||||
/** Device information. */
|
||||
|
|
@ -1505,102 +1481,6 @@ namespace {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Model the performance behavior of a VEC4 back-end instruction.
|
||||
*/
|
||||
void
|
||||
issue_vec4_instruction(state &st, const struct brw_isa_info *isa,
|
||||
const backend_instruction *be_inst)
|
||||
{
|
||||
const struct intel_device_info *devinfo = isa->devinfo;
|
||||
const vec4_instruction *inst =
|
||||
static_cast<const vec4_instruction *>(be_inst);
|
||||
const instruction_info info(isa, inst);
|
||||
const perf_desc perf = instruction_desc(info);
|
||||
|
||||
/* Stall on any source dependencies. */
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
|
||||
for (unsigned j = 0; j < regs_read(inst, i); j++)
|
||||
stall_on_dependency(
|
||||
st, reg_dependency_id(devinfo, inst->src[i], j));
|
||||
}
|
||||
|
||||
if (inst->reads_accumulator_implicitly()) {
|
||||
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
|
||||
j <= accum_reg_of_channel(devinfo, inst, info.tx,
|
||||
inst->exec_size - 1); j++)
|
||||
stall_on_dependency(
|
||||
st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
|
||||
}
|
||||
|
||||
if (inst->base_mrf != -1) {
|
||||
for (unsigned j = 0; j < inst->mlen; j++)
|
||||
stall_on_dependency(
|
||||
st, reg_dependency_id(
|
||||
devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
|
||||
}
|
||||
|
||||
if (inst->reads_flag())
|
||||
stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
|
||||
|
||||
/* Stall on any write dependencies. */
|
||||
if (!inst->no_dd_check) {
|
||||
if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
|
||||
for (unsigned j = 0; j < regs_written(inst); j++)
|
||||
stall_on_dependency(
|
||||
st, reg_dependency_id(devinfo, inst->dst, j));
|
||||
}
|
||||
|
||||
if (inst->writes_accumulator_implicitly(devinfo)) {
|
||||
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
|
||||
j <= accum_reg_of_channel(devinfo, inst, info.tx,
|
||||
inst->exec_size - 1); j++)
|
||||
stall_on_dependency(
|
||||
st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
|
||||
}
|
||||
|
||||
if (inst->writes_flag(devinfo))
|
||||
stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
|
||||
}
|
||||
|
||||
/* Execute the instruction. */
|
||||
execute_instruction(st, perf);
|
||||
|
||||
/* Mark any source dependencies. */
|
||||
if (inst->is_send_from_grf()) {
|
||||
for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
|
||||
for (unsigned j = 0; j < regs_read(inst, i); j++)
|
||||
mark_read_dependency(
|
||||
st, perf, reg_dependency_id(devinfo, inst->src[i], j));
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->base_mrf != -1) {
|
||||
for (unsigned j = 0; j < inst->mlen; j++)
|
||||
mark_read_dependency(st, perf,
|
||||
reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
|
||||
}
|
||||
|
||||
/* Mark any destination dependencies. */
|
||||
if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
|
||||
for (unsigned j = 0; j < regs_written(inst); j++) {
|
||||
mark_write_dependency(st, perf,
|
||||
reg_dependency_id(devinfo, inst->dst, j));
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->writes_accumulator_implicitly(devinfo)) {
|
||||
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
|
||||
j <= accum_reg_of_channel(devinfo, inst, info.tx,
|
||||
inst->exec_size - 1); j++)
|
||||
mark_write_dependency(st, perf,
|
||||
reg_dependency_id(devinfo, brw_acc_reg(8), j));
|
||||
}
|
||||
|
||||
if (inst->writes_flag(devinfo))
|
||||
mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate the maximum possible throughput of the program compatible with
|
||||
* the cycle-count utilization estimated for each asynchronous unit, in
|
||||
|
|
@ -1692,12 +1572,6 @@ brw::performance::performance(const fs_visitor *v) :
|
|||
calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
|
||||
}
|
||||
|
||||
brw::performance::performance(const vec4_visitor *v) :
|
||||
block_latency(new unsigned[v->cfg->num_blocks])
|
||||
{
|
||||
calculate_performance(*this, v, issue_vec4_instruction, 8);
|
||||
}
|
||||
|
||||
brw::performance::~performance()
|
||||
{
|
||||
delete[] block_latency;
|
||||
|
|
|
|||
|
|
@ -28,15 +28,12 @@
|
|||
class fs_visitor;
|
||||
|
||||
namespace brw {
|
||||
class vec4_visitor;
|
||||
|
||||
/**
|
||||
* Various estimates of the performance of a shader based on static
|
||||
* analysis.
|
||||
*/
|
||||
struct performance {
|
||||
performance(const fs_visitor *v);
|
||||
performance(const vec4_visitor *v);
|
||||
~performance();
|
||||
|
||||
analysis_dependency_class
|
||||
|
|
|
|||
|
|
@ -1,475 +0,0 @@
|
|||
/* -*- c++ -*- */
|
||||
/*
|
||||
* Copyright © 2011-2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_IR_VEC4_H
|
||||
#define BRW_IR_VEC4_H
|
||||
|
||||
#include "brw_shader.h"
|
||||
|
||||
namespace brw {
|
||||
|
||||
class dst_reg;
|
||||
|
||||
class src_reg : public backend_reg
|
||||
{
|
||||
public:
|
||||
DECLARE_RALLOC_CXX_OPERATORS(src_reg)
|
||||
|
||||
void init();
|
||||
|
||||
src_reg(enum brw_reg_file file, int nr, const glsl_type *type);
|
||||
src_reg();
|
||||
src_reg(struct ::brw_reg reg);
|
||||
|
||||
bool equals(const src_reg &r) const;
|
||||
bool negative_equals(const src_reg &r) const;
|
||||
|
||||
src_reg(class vec4_visitor *v, const struct glsl_type *type);
|
||||
src_reg(class vec4_visitor *v, const struct glsl_type *type, int size);
|
||||
|
||||
explicit src_reg(const dst_reg ®);
|
||||
|
||||
src_reg *reladdr;
|
||||
};
|
||||
|
||||
static inline src_reg
|
||||
retype(src_reg reg, enum brw_reg_type type)
|
||||
{
|
||||
reg.type = type;
|
||||
return reg;
|
||||
}
|
||||
|
||||
namespace detail {
|
||||
|
||||
static inline void
|
||||
add_byte_offset(backend_reg *reg, unsigned bytes)
|
||||
{
|
||||
switch (reg->file) {
|
||||
case BAD_FILE:
|
||||
break;
|
||||
case VGRF:
|
||||
case ATTR:
|
||||
case UNIFORM:
|
||||
reg->offset += bytes;
|
||||
assert(reg->offset % 16 == 0);
|
||||
break;
|
||||
case MRF: {
|
||||
const unsigned suboffset = reg->offset + bytes;
|
||||
reg->nr += suboffset / REG_SIZE;
|
||||
reg->offset = suboffset % REG_SIZE;
|
||||
assert(reg->offset % 16 == 0);
|
||||
break;
|
||||
}
|
||||
case ARF:
|
||||
case FIXED_GRF: {
|
||||
const unsigned suboffset = reg->subnr + bytes;
|
||||
reg->nr += suboffset / REG_SIZE;
|
||||
reg->subnr = suboffset % REG_SIZE;
|
||||
assert(reg->subnr % 16 == 0);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
assert(bytes == 0);
|
||||
}
|
||||
}
|
||||
|
||||
} /* namespace detail */
|
||||
|
||||
static inline src_reg
|
||||
byte_offset(src_reg reg, unsigned bytes)
|
||||
{
|
||||
detail::add_byte_offset(®, bytes);
|
||||
return reg;
|
||||
}
|
||||
|
||||
static inline src_reg
|
||||
offset(src_reg reg, unsigned width, unsigned delta)
|
||||
{
|
||||
const unsigned stride = (reg.file == UNIFORM ? 0 : 4);
|
||||
const unsigned num_components = MAX2(width / 4 * stride, 4);
|
||||
return byte_offset(reg, num_components * type_sz(reg.type) * delta);
|
||||
}
|
||||
|
||||
static inline src_reg
|
||||
horiz_offset(src_reg reg, unsigned delta)
|
||||
{
|
||||
return byte_offset(reg, delta * type_sz(reg.type));
|
||||
}
|
||||
|
||||
/**
|
||||
* Reswizzle a given source register.
|
||||
* \sa brw_swizzle().
|
||||
*/
|
||||
static inline src_reg
|
||||
swizzle(src_reg reg, unsigned swizzle)
|
||||
{
|
||||
if (reg.file == IMM)
|
||||
reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swizzle);
|
||||
else
|
||||
reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle);
|
||||
|
||||
return reg;
|
||||
}
|
||||
|
||||
static inline src_reg
|
||||
negate(src_reg reg)
|
||||
{
|
||||
assert(reg.file != IMM);
|
||||
reg.negate = !reg.negate;
|
||||
return reg;
|
||||
}
|
||||
|
||||
static inline bool
|
||||
is_uniform(const src_reg ®)
|
||||
{
|
||||
return (reg.file == IMM || reg.file == UNIFORM || reg.is_null()) &&
|
||||
(!reg.reladdr || is_uniform(*reg.reladdr));
|
||||
}
|
||||
|
||||
class dst_reg : public backend_reg
|
||||
{
|
||||
public:
|
||||
DECLARE_RALLOC_CXX_OPERATORS(dst_reg)
|
||||
|
||||
void init();
|
||||
|
||||
dst_reg();
|
||||
dst_reg(enum brw_reg_file file, int nr);
|
||||
dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
|
||||
unsigned writemask);
|
||||
dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
|
||||
unsigned writemask);
|
||||
dst_reg(struct ::brw_reg reg);
|
||||
dst_reg(class vec4_visitor *v, const struct glsl_type *type);
|
||||
|
||||
explicit dst_reg(const src_reg ®);
|
||||
|
||||
bool equals(const dst_reg &r) const;
|
||||
|
||||
src_reg *reladdr;
|
||||
};
|
||||
|
||||
static inline dst_reg
|
||||
retype(dst_reg reg, enum brw_reg_type type)
|
||||
{
|
||||
reg.type = type;
|
||||
return reg;
|
||||
}
|
||||
|
||||
static inline dst_reg
|
||||
byte_offset(dst_reg reg, unsigned bytes)
|
||||
{
|
||||
detail::add_byte_offset(®, bytes);
|
||||
return reg;
|
||||
}
|
||||
|
||||
static inline dst_reg
|
||||
offset(dst_reg reg, unsigned width, unsigned delta)
|
||||
{
|
||||
const unsigned stride = (reg.file == UNIFORM ? 0 : 4);
|
||||
const unsigned num_components = MAX2(width / 4 * stride, 4);
|
||||
return byte_offset(reg, num_components * type_sz(reg.type) * delta);
|
||||
}
|
||||
|
||||
static inline dst_reg
|
||||
horiz_offset(const dst_reg ®, unsigned delta)
|
||||
{
|
||||
if (is_uniform(src_reg(reg)))
|
||||
return reg;
|
||||
else
|
||||
return byte_offset(reg, delta * type_sz(reg.type));
|
||||
}
|
||||
|
||||
static inline dst_reg
|
||||
writemask(dst_reg reg, unsigned mask)
|
||||
{
|
||||
assert(reg.file != IMM);
|
||||
assert((reg.writemask & mask) != 0);
|
||||
reg.writemask &= mask;
|
||||
return reg;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return an integer identifying the discrete address space a register is
|
||||
* contained in. A register is by definition fully contained in the single
|
||||
* reg_space it belongs to, so two registers with different reg_space ids are
|
||||
* guaranteed not to overlap. Most register files are a single reg_space of
|
||||
* its own, only the VGRF file is composed of multiple discrete address
|
||||
* spaces, one for each VGRF allocation.
|
||||
*/
|
||||
static inline uint32_t
|
||||
reg_space(const backend_reg &r)
|
||||
{
|
||||
return r.file << 16 | (r.file == VGRF ? r.nr : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the base offset in bytes of a register relative to the start of its
|
||||
* reg_space().
|
||||
*/
|
||||
static inline unsigned
|
||||
reg_offset(const backend_reg &r)
|
||||
{
|
||||
return (r.file == VGRF || r.file == IMM ? 0 : r.nr) *
|
||||
(r.file == UNIFORM ? 16 : REG_SIZE) + r.offset +
|
||||
(r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether the register region starting at \p r and spanning \p dr
|
||||
* bytes could potentially overlap the register region starting at \p s and
|
||||
* spanning \p ds bytes.
|
||||
*/
|
||||
static inline bool
|
||||
regions_overlap(const backend_reg &r, unsigned dr,
|
||||
const backend_reg &s, unsigned ds)
|
||||
{
|
||||
if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
|
||||
/* COMPR4 regions are translated by the hardware during decompression
|
||||
* into two separate half-regions 4 MRFs apart from each other.
|
||||
*/
|
||||
backend_reg t0 = r;
|
||||
t0.nr &= ~BRW_MRF_COMPR4;
|
||||
backend_reg t1 = t0;
|
||||
t1.offset += 4 * REG_SIZE;
|
||||
return regions_overlap(t0, dr / 2, s, ds) ||
|
||||
regions_overlap(t1, dr / 2, s, ds);
|
||||
|
||||
} else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
|
||||
return regions_overlap(s, ds, r, dr);
|
||||
|
||||
} else {
|
||||
return reg_space(r) == reg_space(s) &&
|
||||
!(reg_offset(r) + dr <= reg_offset(s) ||
|
||||
reg_offset(s) + ds <= reg_offset(r));
|
||||
}
|
||||
}
|
||||
|
||||
class vec4_instruction : public backend_instruction {
|
||||
public:
|
||||
DECLARE_RALLOC_CXX_OPERATORS(vec4_instruction)
|
||||
|
||||
vec4_instruction(enum opcode opcode,
|
||||
const dst_reg &dst = dst_reg(),
|
||||
const src_reg &src0 = src_reg(),
|
||||
const src_reg &src1 = src_reg(),
|
||||
const src_reg &src2 = src_reg());
|
||||
|
||||
dst_reg dst;
|
||||
src_reg src[3];
|
||||
|
||||
enum brw_urb_write_flags urb_write_flags;
|
||||
|
||||
unsigned sol_binding; /**< gfx6: SOL binding table index */
|
||||
bool sol_final_write; /**< gfx6: send commit message */
|
||||
unsigned sol_vertex; /**< gfx6: used for setting dst index in SVB header */
|
||||
|
||||
bool is_send_from_grf() const;
|
||||
unsigned size_read(unsigned arg) const;
|
||||
bool can_reswizzle(const struct intel_device_info *devinfo,
|
||||
int dst_writemask,
|
||||
int swizzle, int swizzle_mask);
|
||||
void reswizzle(int dst_writemask, int swizzle);
|
||||
bool can_do_source_mods(const struct intel_device_info *devinfo);
|
||||
bool can_do_cmod();
|
||||
bool can_do_writemask(const struct intel_device_info *devinfo);
|
||||
bool can_change_types() const;
|
||||
bool has_source_and_destination_hazard() const;
|
||||
unsigned implied_mrf_writes() const;
|
||||
|
||||
bool is_align1_partial_write()
|
||||
{
|
||||
return opcode == VEC4_OPCODE_SET_LOW_32BIT ||
|
||||
opcode == VEC4_OPCODE_SET_HIGH_32BIT;
|
||||
}
|
||||
|
||||
bool reads_flag() const
|
||||
{
|
||||
return predicate || opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2;
|
||||
}
|
||||
|
||||
bool reads_flag(unsigned c)
|
||||
{
|
||||
if (opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2)
|
||||
return true;
|
||||
|
||||
switch (predicate) {
|
||||
case BRW_PREDICATE_NONE:
|
||||
return false;
|
||||
case BRW_PREDICATE_ALIGN16_REPLICATE_X:
|
||||
return c == 0;
|
||||
case BRW_PREDICATE_ALIGN16_REPLICATE_Y:
|
||||
return c == 1;
|
||||
case BRW_PREDICATE_ALIGN16_REPLICATE_Z:
|
||||
return c == 2;
|
||||
case BRW_PREDICATE_ALIGN16_REPLICATE_W:
|
||||
return c == 3;
|
||||
default:
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
bool writes_flag(const intel_device_info *devinfo) const
|
||||
{
|
||||
return (conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) &&
|
||||
opcode != BRW_OPCODE_CSEL &&
|
||||
opcode != BRW_OPCODE_IF &&
|
||||
opcode != BRW_OPCODE_WHILE));
|
||||
}
|
||||
|
||||
bool reads_g0_implicitly() const
|
||||
{
|
||||
switch (opcode) {
|
||||
case SHADER_OPCODE_TEX:
|
||||
case SHADER_OPCODE_TXL:
|
||||
case SHADER_OPCODE_TXD:
|
||||
case SHADER_OPCODE_TXF:
|
||||
case SHADER_OPCODE_TXF_CMS_W:
|
||||
case SHADER_OPCODE_TXF_CMS:
|
||||
case SHADER_OPCODE_TXF_MCS:
|
||||
case SHADER_OPCODE_TXS:
|
||||
case SHADER_OPCODE_TG4:
|
||||
case SHADER_OPCODE_TG4_OFFSET:
|
||||
case SHADER_OPCODE_SAMPLEINFO:
|
||||
case VS_OPCODE_PULL_CONSTANT_LOAD:
|
||||
case GS_OPCODE_SET_PRIMITIVE_ID:
|
||||
case GS_OPCODE_GET_INSTANCE_ID:
|
||||
case SHADER_OPCODE_GFX4_SCRATCH_READ:
|
||||
case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Make the execution of \p inst dependent on the evaluation of a possibly
|
||||
* inverted predicate.
|
||||
*/
|
||||
inline vec4_instruction *
|
||||
set_predicate_inv(enum brw_predicate pred, bool inverse,
|
||||
vec4_instruction *inst)
|
||||
{
|
||||
inst->predicate = pred;
|
||||
inst->predicate_inverse = inverse;
|
||||
return inst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Make the execution of \p inst dependent on the evaluation of a predicate.
|
||||
*/
|
||||
inline vec4_instruction *
|
||||
set_predicate(enum brw_predicate pred, vec4_instruction *inst)
|
||||
{
|
||||
return set_predicate_inv(pred, false, inst);
|
||||
}
|
||||
|
||||
/**
|
||||
* Write the result of evaluating the condition given by \p mod to a flag
|
||||
* register.
|
||||
*/
|
||||
inline vec4_instruction *
|
||||
set_condmod(enum brw_conditional_mod mod, vec4_instruction *inst)
|
||||
{
|
||||
inst->conditional_mod = mod;
|
||||
return inst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clamp the result of \p inst to the saturation range of its destination
|
||||
* datatype.
|
||||
*/
|
||||
inline vec4_instruction *
|
||||
set_saturate(bool saturate, vec4_instruction *inst)
|
||||
{
|
||||
inst->saturate = saturate;
|
||||
return inst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of dataflow registers written by the instruction (either
|
||||
* fully or partially) counted from 'floor(reg_offset(inst->dst) /
|
||||
* register_size)'. The somewhat arbitrary register size unit is 16B for the
|
||||
* UNIFORM and IMM files and 32B for all other files.
|
||||
*/
|
||||
inline unsigned
|
||||
regs_written(const vec4_instruction *inst)
|
||||
{
|
||||
assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
|
||||
return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + inst->size_written,
|
||||
REG_SIZE);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return the number of dataflow registers read by the instruction (either
|
||||
* fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
|
||||
* register_size)'. The somewhat arbitrary register size unit is 16B for the
|
||||
* UNIFORM and IMM files and 32B for all other files.
|
||||
*/
|
||||
inline unsigned
|
||||
regs_read(const vec4_instruction *inst, unsigned i)
|
||||
{
|
||||
const unsigned reg_size =
|
||||
inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 16 : REG_SIZE;
|
||||
return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + inst->size_read(i),
|
||||
reg_size);
|
||||
}
|
||||
|
||||
static inline enum brw_reg_type
|
||||
get_exec_type(const vec4_instruction *inst)
|
||||
{
|
||||
enum brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file != BAD_FILE) {
|
||||
const brw_reg_type t = get_exec_type(brw_reg_type(inst->src[i].type));
|
||||
if (type_sz(t) > type_sz(exec_type))
|
||||
exec_type = t;
|
||||
else if (type_sz(t) == type_sz(exec_type) &&
|
||||
brw_reg_type_is_floating_point(t))
|
||||
exec_type = t;
|
||||
}
|
||||
}
|
||||
|
||||
if (exec_type == BRW_REGISTER_TYPE_B)
|
||||
exec_type = inst->dst.type;
|
||||
|
||||
/* TODO: We need to handle half-float conversions. */
|
||||
assert(exec_type != BRW_REGISTER_TYPE_HF ||
|
||||
inst->dst.type == BRW_REGISTER_TYPE_HF);
|
||||
assert(exec_type != BRW_REGISTER_TYPE_B);
|
||||
|
||||
return exec_type;
|
||||
}
|
||||
|
||||
static inline unsigned
|
||||
get_exec_type_size(const vec4_instruction *inst)
|
||||
{
|
||||
return type_sz(get_exec_type(inst));
|
||||
}
|
||||
|
||||
} /* namespace brw */
|
||||
|
||||
#endif
|
||||
|
|
@ -28,7 +28,6 @@
|
|||
#include "brw_eu.h"
|
||||
#include "brw_fs.h"
|
||||
#include "brw_fs_live_variables.h"
|
||||
#include "brw_vec4.h"
|
||||
#include "brw_cfg.h"
|
||||
#include "brw_shader.h"
|
||||
#include <new>
|
||||
|
|
@ -1027,25 +1026,6 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
|
|||
return benefit;
|
||||
}
|
||||
|
||||
class vec4_instruction_scheduler : public instruction_scheduler
|
||||
{
|
||||
public:
|
||||
vec4_instruction_scheduler(void *mem_ctx, const vec4_visitor *v, int grf_count);
|
||||
void calculate_deps();
|
||||
schedule_node *choose_instruction_to_schedule();
|
||||
const vec4_visitor *v;
|
||||
|
||||
void run();
|
||||
};
|
||||
|
||||
vec4_instruction_scheduler::vec4_instruction_scheduler(void *mem_ctx, const vec4_visitor *v,
|
||||
int grf_count)
|
||||
: instruction_scheduler(mem_ctx, v, grf_count, /* grf_write_scale */ 1,
|
||||
/* post_reg_alloc */ true),
|
||||
v(v)
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
instruction_scheduler::set_current_block(bblock_t *block)
|
||||
{
|
||||
|
|
@ -1534,179 +1514,6 @@ fs_instruction_scheduler::calculate_deps()
|
|||
clear_last_grf_write();
|
||||
}
|
||||
|
||||
void
|
||||
vec4_instruction_scheduler::calculate_deps()
|
||||
{
|
||||
schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->ver)];
|
||||
schedule_node *last_conditional_mod = NULL;
|
||||
schedule_node *last_accumulator_write = NULL;
|
||||
/* Fixed HW registers are assumed to be separate from the virtual
|
||||
* GRFs, so they can be tracked separately. We don't really write
|
||||
* to fixed GRFs much, so don't bother tracking them on a more
|
||||
* granular level.
|
||||
*/
|
||||
schedule_node *last_fixed_grf_write = NULL;
|
||||
|
||||
memset(last_grf_write, 0, grf_count * sizeof(*last_grf_write));
|
||||
memset(last_mrf_write, 0, sizeof(last_mrf_write));
|
||||
|
||||
/* top-to-bottom dependencies: RAW and WAW. */
|
||||
for (schedule_node *n = current.start; n < current.end; n++) {
|
||||
vec4_instruction *inst = (vec4_instruction *)n->inst;
|
||||
|
||||
if (is_scheduling_barrier(inst))
|
||||
add_barrier_deps(n);
|
||||
|
||||
/* read-after-write deps. */
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file == VGRF) {
|
||||
for (unsigned j = 0; j < regs_read(inst, i); ++j)
|
||||
add_dep(last_grf_write[inst->src[i].nr + j], n);
|
||||
} else if (inst->src[i].file == FIXED_GRF) {
|
||||
add_dep(last_fixed_grf_write, n);
|
||||
} else if (inst->src[i].is_accumulator()) {
|
||||
assert(last_accumulator_write);
|
||||
add_dep(last_accumulator_write, n);
|
||||
} else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
|
||||
add_barrier_deps(n);
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->reads_g0_implicitly())
|
||||
add_dep(last_fixed_grf_write, n);
|
||||
|
||||
if (!inst->is_send_from_grf()) {
|
||||
for (int i = 0; i < inst->mlen; i++) {
|
||||
/* It looks like the MRF regs are released in the send
|
||||
* instruction once it's sent, not when the result comes
|
||||
* back.
|
||||
*/
|
||||
add_dep(last_mrf_write[inst->base_mrf + i], n);
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->reads_flag()) {
|
||||
assert(last_conditional_mod);
|
||||
add_dep(last_conditional_mod, n);
|
||||
}
|
||||
|
||||
if (inst->reads_accumulator_implicitly()) {
|
||||
assert(last_accumulator_write);
|
||||
add_dep(last_accumulator_write, n);
|
||||
}
|
||||
|
||||
/* write-after-write deps. */
|
||||
if (inst->dst.file == VGRF) {
|
||||
for (unsigned j = 0; j < regs_written(inst); ++j) {
|
||||
add_dep(last_grf_write[inst->dst.nr + j], n);
|
||||
last_grf_write[inst->dst.nr + j] = n;
|
||||
}
|
||||
} else if (inst->dst.file == MRF) {
|
||||
add_dep(last_mrf_write[inst->dst.nr], n);
|
||||
last_mrf_write[inst->dst.nr] = n;
|
||||
} else if (inst->dst.file == FIXED_GRF) {
|
||||
add_dep(last_fixed_grf_write, n);
|
||||
last_fixed_grf_write = n;
|
||||
} else if (inst->dst.is_accumulator()) {
|
||||
add_dep(last_accumulator_write, n);
|
||||
last_accumulator_write = n;
|
||||
} else if (inst->dst.file == ARF && !inst->dst.is_null()) {
|
||||
add_barrier_deps(n);
|
||||
}
|
||||
|
||||
if (inst->mlen > 0 && !inst->is_send_from_grf()) {
|
||||
for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
|
||||
add_dep(last_mrf_write[inst->base_mrf + i], n);
|
||||
last_mrf_write[inst->base_mrf + i] = n;
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->writes_flag(v->devinfo)) {
|
||||
add_dep(last_conditional_mod, n, 0);
|
||||
last_conditional_mod = n;
|
||||
}
|
||||
|
||||
if (inst->writes_accumulator_implicitly(v->devinfo) &&
|
||||
!inst->dst.is_accumulator()) {
|
||||
add_dep(last_accumulator_write, n);
|
||||
last_accumulator_write = n;
|
||||
}
|
||||
}
|
||||
|
||||
/* bottom-to-top dependencies: WAR */
|
||||
memset(last_grf_write, 0, grf_count * sizeof(*last_grf_write));
|
||||
memset(last_mrf_write, 0, sizeof(last_mrf_write));
|
||||
last_conditional_mod = NULL;
|
||||
last_accumulator_write = NULL;
|
||||
last_fixed_grf_write = NULL;
|
||||
|
||||
for (schedule_node *n = current.end - 1; n >= current.start; n--) {
|
||||
vec4_instruction *inst = (vec4_instruction *)n->inst;
|
||||
|
||||
/* write-after-read deps. */
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file == VGRF) {
|
||||
for (unsigned j = 0; j < regs_read(inst, i); ++j)
|
||||
add_dep(n, last_grf_write[inst->src[i].nr + j]);
|
||||
} else if (inst->src[i].file == FIXED_GRF) {
|
||||
add_dep(n, last_fixed_grf_write);
|
||||
} else if (inst->src[i].is_accumulator()) {
|
||||
add_dep(n, last_accumulator_write);
|
||||
} else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
|
||||
add_barrier_deps(n);
|
||||
}
|
||||
}
|
||||
|
||||
if (!inst->is_send_from_grf()) {
|
||||
for (int i = 0; i < inst->mlen; i++) {
|
||||
/* It looks like the MRF regs are released in the send
|
||||
* instruction once it's sent, not when the result comes
|
||||
* back.
|
||||
*/
|
||||
add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->reads_flag()) {
|
||||
add_dep(n, last_conditional_mod);
|
||||
}
|
||||
|
||||
if (inst->reads_accumulator_implicitly()) {
|
||||
add_dep(n, last_accumulator_write);
|
||||
}
|
||||
|
||||
/* Update the things this instruction wrote, so earlier reads
|
||||
* can mark this as WAR dependency.
|
||||
*/
|
||||
if (inst->dst.file == VGRF) {
|
||||
for (unsigned j = 0; j < regs_written(inst); ++j)
|
||||
last_grf_write[inst->dst.nr + j] = n;
|
||||
} else if (inst->dst.file == MRF) {
|
||||
last_mrf_write[inst->dst.nr] = n;
|
||||
} else if (inst->dst.file == FIXED_GRF) {
|
||||
last_fixed_grf_write = n;
|
||||
} else if (inst->dst.is_accumulator()) {
|
||||
last_accumulator_write = n;
|
||||
} else if (inst->dst.file == ARF && !inst->dst.is_null()) {
|
||||
add_barrier_deps(n);
|
||||
}
|
||||
|
||||
if (inst->mlen > 0 && !inst->is_send_from_grf()) {
|
||||
for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
|
||||
last_mrf_write[inst->base_mrf + i] = n;
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->writes_flag(v->devinfo)) {
|
||||
last_conditional_mod = n;
|
||||
}
|
||||
|
||||
if (inst->writes_accumulator_implicitly(v->devinfo)) {
|
||||
last_accumulator_write = n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
schedule_node *
|
||||
fs_instruction_scheduler::choose_instruction_to_schedule()
|
||||
{
|
||||
|
|
@ -1837,25 +1644,6 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
|
|||
return chosen;
|
||||
}
|
||||
|
||||
schedule_node *
|
||||
vec4_instruction_scheduler::choose_instruction_to_schedule()
|
||||
{
|
||||
schedule_node *chosen = NULL;
|
||||
int chosen_time = 0;
|
||||
|
||||
/* Of the instructions ready to execute or the closest to being ready,
|
||||
* choose the oldest one.
|
||||
*/
|
||||
foreach_in_list(schedule_node, n, ¤t.available) {
|
||||
if (!chosen || n->tmp.unblocked_time < chosen_time) {
|
||||
chosen = n;
|
||||
chosen_time = n->tmp.unblocked_time;
|
||||
}
|
||||
}
|
||||
|
||||
return chosen;
|
||||
}
|
||||
|
||||
int
|
||||
fs_instruction_scheduler::calculate_issue_time(backend_instruction *inst0)
|
||||
{
|
||||
|
|
@ -2009,41 +1797,6 @@ fs_instruction_scheduler::run(instruction_scheduler_mode mode)
|
|||
}
|
||||
}
|
||||
|
||||
void
|
||||
vec4_instruction_scheduler::run()
|
||||
{
|
||||
foreach_block(block, v->cfg) {
|
||||
set_current_block(block);
|
||||
|
||||
for (schedule_node *n = current.start; n < current.end; n++) {
|
||||
/* We always execute as two vec4s in parallel. */
|
||||
n->issue_time = 2;
|
||||
}
|
||||
|
||||
calculate_deps();
|
||||
|
||||
compute_delays();
|
||||
compute_exits();
|
||||
|
||||
assert(current.available.is_empty());
|
||||
for (schedule_node *n = current.start; n < current.end; n++) {
|
||||
reset_node_tmp(n);
|
||||
|
||||
/* Add DAG heads to the list of available instructions. */
|
||||
if (n->tmp.parent_count == 0)
|
||||
current.available.push_tail(n);
|
||||
}
|
||||
|
||||
current.block->instructions.make_empty();
|
||||
|
||||
while (!current.available.is_empty()) {
|
||||
schedule_node *chosen = choose_instruction_to_schedule();
|
||||
schedule(chosen);
|
||||
update_children(chosen);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fs_instruction_scheduler *
|
||||
fs_visitor::prepare_scheduler(void *mem_ctx)
|
||||
{
|
||||
|
|
@ -2082,16 +1835,3 @@ fs_visitor::schedule_instructions_post_ra()
|
|||
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
}
|
||||
|
||||
void
|
||||
vec4_visitor::opt_schedule_instructions()
|
||||
{
|
||||
void *mem_ctx = ralloc_context(NULL);
|
||||
|
||||
vec4_instruction_scheduler sched(mem_ctx, this, prog_data->total_grf);
|
||||
sched.run();
|
||||
|
||||
ralloc_free(mem_ctx);
|
||||
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -114,9 +114,6 @@ extern "C" {
|
|||
/* brw_fs_reg_allocate.cpp */
|
||||
void brw_fs_alloc_reg_sets(struct brw_compiler *compiler);
|
||||
|
||||
/* brw_vec4_reg_allocate.cpp */
|
||||
void brw_vec4_alloc_reg_set(struct brw_compiler *compiler);
|
||||
|
||||
/* brw_disasm.c */
|
||||
extern const char *const conditional_modifier[16];
|
||||
extern const char *const pred_ctrl_align16[16];
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -1,350 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2011 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_VEC4_H
|
||||
#define BRW_VEC4_H
|
||||
|
||||
#include "brw_shader.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
#include "brw_ir_vec4.h"
|
||||
#include "brw_ir_performance.h"
|
||||
#include "brw_vec4_builder.h"
|
||||
#include "brw_vec4_live_variables.h"
|
||||
#endif
|
||||
|
||||
#include "compiler/glsl/ir.h"
|
||||
#include "compiler/nir/nir.h"
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
const unsigned *
|
||||
brw_vec4_generate_assembly(const struct brw_compiler *compiler,
|
||||
const struct brw_compile_params *params,
|
||||
const nir_shader *nir,
|
||||
struct brw_vue_prog_data *prog_data,
|
||||
const struct cfg_t *cfg,
|
||||
const brw::performance &perf,
|
||||
bool debug_enabled);
|
||||
|
||||
#ifdef __cplusplus
|
||||
} /* extern "C" */
|
||||
|
||||
namespace brw {
|
||||
/**
|
||||
* The vertex shader front-end.
|
||||
*
|
||||
* Translates either GLSL IR or Mesa IR (for ARB_vertex_program and
|
||||
* fixed-function) into VS IR.
|
||||
*/
|
||||
class vec4_visitor : public backend_shader
|
||||
{
|
||||
public:
|
||||
vec4_visitor(const struct brw_compiler *compiler,
|
||||
const struct brw_compile_params *params,
|
||||
const struct brw_sampler_prog_key_data *key,
|
||||
struct brw_vue_prog_data *prog_data,
|
||||
const nir_shader *shader,
|
||||
bool no_spills,
|
||||
bool debug_enabled);
|
||||
|
||||
dst_reg dst_null_f()
|
||||
{
|
||||
return dst_reg(brw_null_reg());
|
||||
}
|
||||
|
||||
dst_reg dst_null_df()
|
||||
{
|
||||
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
|
||||
}
|
||||
|
||||
dst_reg dst_null_d()
|
||||
{
|
||||
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
|
||||
}
|
||||
|
||||
dst_reg dst_null_ud()
|
||||
{
|
||||
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
|
||||
}
|
||||
|
||||
const struct brw_sampler_prog_key_data * const key_tex;
|
||||
struct brw_vue_prog_data * const prog_data;
|
||||
char *fail_msg;
|
||||
bool failed;
|
||||
|
||||
/**
|
||||
* GLSL IR currently being processed, which is associated with our
|
||||
* driver IR instructions for debugging purposes.
|
||||
*/
|
||||
const void *base_ir;
|
||||
const char *current_annotation;
|
||||
|
||||
int first_non_payload_grf;
|
||||
unsigned ubo_push_start[4];
|
||||
unsigned push_length;
|
||||
unsigned int max_grf;
|
||||
brw_analysis<brw::vec4_live_variables, backend_shader> live_analysis;
|
||||
brw_analysis<brw::performance, vec4_visitor> performance_analysis;
|
||||
|
||||
/* Regs for vertex results. Generated at ir_variable visiting time
|
||||
* for the ir->location's used.
|
||||
*/
|
||||
dst_reg output_reg[VARYING_SLOT_TESS_MAX][4];
|
||||
unsigned output_num_components[VARYING_SLOT_TESS_MAX][4];
|
||||
const char *output_reg_annotation[VARYING_SLOT_TESS_MAX];
|
||||
int uniforms;
|
||||
|
||||
bool run();
|
||||
void fail(const char *msg, ...);
|
||||
|
||||
int setup_uniforms(int payload_reg);
|
||||
|
||||
bool reg_allocate_trivial();
|
||||
bool reg_allocate();
|
||||
void evaluate_spill_costs(float *spill_costs, bool *no_spill);
|
||||
int choose_spill_reg(struct ra_graph *g);
|
||||
void spill_reg(unsigned spill_reg);
|
||||
void move_grf_array_access_to_scratch();
|
||||
void split_uniform_registers();
|
||||
void setup_push_ranges();
|
||||
virtual void invalidate_analysis(brw::analysis_dependency_class c);
|
||||
void split_virtual_grfs();
|
||||
bool opt_vector_float();
|
||||
bool opt_reduce_swizzle();
|
||||
bool dead_code_eliminate();
|
||||
bool opt_cmod_propagation();
|
||||
bool opt_copy_propagation(bool do_constant_prop = true);
|
||||
bool opt_cse_local(bblock_t *block, const vec4_live_variables &live);
|
||||
bool opt_cse();
|
||||
bool opt_algebraic();
|
||||
bool opt_register_coalesce();
|
||||
bool eliminate_find_live_channel();
|
||||
bool is_dep_ctrl_unsafe(const vec4_instruction *inst);
|
||||
void opt_set_dependency_control();
|
||||
void opt_schedule_instructions();
|
||||
void convert_to_hw_regs();
|
||||
void fixup_3src_null_dest();
|
||||
|
||||
bool is_supported_64bit_region(vec4_instruction *inst, unsigned arg);
|
||||
bool lower_simd_width();
|
||||
bool scalarize_df();
|
||||
bool lower_64bit_mad_to_mul_add();
|
||||
void apply_logical_swizzle(struct brw_reg *hw_reg,
|
||||
vec4_instruction *inst, int arg);
|
||||
|
||||
vec4_instruction *emit(vec4_instruction *inst);
|
||||
|
||||
vec4_instruction *emit(enum opcode opcode);
|
||||
vec4_instruction *emit(enum opcode opcode, const dst_reg &dst);
|
||||
vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
|
||||
const src_reg &src0);
|
||||
vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
|
||||
const src_reg &src0, const src_reg &src1);
|
||||
vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
|
||||
const src_reg &src0, const src_reg &src1,
|
||||
const src_reg &src2);
|
||||
|
||||
vec4_instruction *emit_before(bblock_t *block,
|
||||
vec4_instruction *inst,
|
||||
vec4_instruction *new_inst);
|
||||
|
||||
#define EMIT1(op) vec4_instruction *op(const dst_reg &, const src_reg &);
|
||||
#define EMIT2(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &);
|
||||
#define EMIT3(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &, const src_reg &);
|
||||
EMIT1(MOV)
|
||||
EMIT1(NOT)
|
||||
EMIT1(RNDD)
|
||||
EMIT1(RNDE)
|
||||
EMIT1(RNDZ)
|
||||
EMIT1(FRC)
|
||||
EMIT1(F32TO16)
|
||||
EMIT1(F16TO32)
|
||||
EMIT2(ADD)
|
||||
EMIT2(MUL)
|
||||
EMIT2(MACH)
|
||||
EMIT2(MAC)
|
||||
EMIT2(AND)
|
||||
EMIT2(OR)
|
||||
EMIT2(XOR)
|
||||
EMIT2(DP3)
|
||||
EMIT2(DP4)
|
||||
EMIT2(DPH)
|
||||
EMIT2(SHL)
|
||||
EMIT2(SHR)
|
||||
EMIT2(ASR)
|
||||
vec4_instruction *CMP(dst_reg dst, src_reg src0, src_reg src1,
|
||||
enum brw_conditional_mod condition);
|
||||
vec4_instruction *IF(src_reg src0, src_reg src1,
|
||||
enum brw_conditional_mod condition);
|
||||
vec4_instruction *IF(enum brw_predicate predicate);
|
||||
EMIT1(SCRATCH_READ)
|
||||
EMIT2(SCRATCH_WRITE)
|
||||
EMIT3(LRP)
|
||||
EMIT1(BFREV)
|
||||
EMIT3(BFE)
|
||||
EMIT2(BFI1)
|
||||
EMIT3(BFI2)
|
||||
EMIT1(FBH)
|
||||
EMIT1(FBL)
|
||||
EMIT1(CBIT)
|
||||
EMIT1(LZD)
|
||||
EMIT3(MAD)
|
||||
EMIT2(ADDC)
|
||||
EMIT2(SUBB)
|
||||
EMIT1(DIM)
|
||||
|
||||
#undef EMIT1
|
||||
#undef EMIT2
|
||||
#undef EMIT3
|
||||
|
||||
vec4_instruction *emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
|
||||
src_reg src0, src_reg src1);
|
||||
|
||||
/**
|
||||
* Copy any live channel from \p src to the first channel of the
|
||||
* result.
|
||||
*/
|
||||
src_reg emit_uniformize(const src_reg &src);
|
||||
|
||||
/** Fix all float operands of a 3-source instruction. */
|
||||
void fix_float_operands(src_reg op[3], nir_alu_instr *instr);
|
||||
|
||||
src_reg fix_3src_operand(const src_reg &src);
|
||||
|
||||
vec4_instruction *emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
|
||||
const src_reg &src1 = src_reg());
|
||||
|
||||
src_reg fix_math_operand(const src_reg &src);
|
||||
|
||||
void emit_pack_half_2x16(dst_reg dst, src_reg src0);
|
||||
void emit_unpack_half_2x16(dst_reg dst, src_reg src0);
|
||||
void emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0);
|
||||
void emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0);
|
||||
void emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0);
|
||||
void emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0);
|
||||
|
||||
src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
|
||||
src_reg surface);
|
||||
|
||||
void emit_ndc_computation();
|
||||
void emit_psiz_and_flags(dst_reg reg);
|
||||
vec4_instruction *emit_generic_urb_slot(dst_reg reg, int varying, int comp);
|
||||
virtual void emit_urb_slot(dst_reg reg, int varying);
|
||||
|
||||
src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst,
|
||||
src_reg *reladdr, int reg_offset);
|
||||
void emit_scratch_read(bblock_t *block, vec4_instruction *inst,
|
||||
dst_reg dst,
|
||||
src_reg orig_src,
|
||||
int base_offset);
|
||||
void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
|
||||
int base_offset);
|
||||
void emit_pull_constant_load_reg(dst_reg dst,
|
||||
src_reg surf_index,
|
||||
src_reg offset,
|
||||
bblock_t *before_block,
|
||||
vec4_instruction *before_inst);
|
||||
src_reg emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
|
||||
vec4_instruction *inst, src_reg src);
|
||||
|
||||
void resolve_ud_negate(src_reg *reg);
|
||||
|
||||
void emit_shader_float_controls_execution_mode();
|
||||
|
||||
bool lower_minmax();
|
||||
|
||||
src_reg get_timestamp();
|
||||
|
||||
virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const;
|
||||
|
||||
bool optimize_predicate(nir_alu_instr *instr, enum brw_predicate *predicate);
|
||||
|
||||
void emit_conversion_from_double(dst_reg dst, src_reg src);
|
||||
void emit_conversion_to_double(dst_reg dst, src_reg src);
|
||||
|
||||
vec4_instruction *shuffle_64bit_data(dst_reg dst, src_reg src,
|
||||
bool for_write,
|
||||
bool for_scratch = false,
|
||||
bblock_t *block = NULL,
|
||||
vec4_instruction *ref = NULL);
|
||||
|
||||
virtual void emit_nir_code();
|
||||
virtual void nir_setup_uniforms();
|
||||
virtual void nir_emit_impl(nir_function_impl *impl);
|
||||
virtual void nir_emit_cf_list(exec_list *list);
|
||||
virtual void nir_emit_if(nir_if *if_stmt);
|
||||
virtual void nir_emit_loop(nir_loop *loop);
|
||||
virtual void nir_emit_block(nir_block *block);
|
||||
virtual void nir_emit_instr(nir_instr *instr);
|
||||
virtual void nir_emit_load_const(nir_load_const_instr *instr);
|
||||
src_reg get_nir_ssbo_intrinsic_index(nir_intrinsic_instr *instr);
|
||||
virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
|
||||
virtual void nir_emit_alu(nir_alu_instr *instr);
|
||||
virtual void nir_emit_jump(nir_jump_instr *instr);
|
||||
virtual void nir_emit_texture(nir_tex_instr *instr);
|
||||
virtual void nir_emit_undef(nir_undef_instr *instr);
|
||||
virtual void nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr);
|
||||
|
||||
dst_reg get_nir_def(const nir_def &def, enum brw_reg_type type);
|
||||
dst_reg get_nir_def(const nir_def &def, nir_alu_type type);
|
||||
dst_reg get_nir_def(const nir_def &def);
|
||||
src_reg get_nir_src(const nir_src &src, enum brw_reg_type type,
|
||||
unsigned num_components = 4);
|
||||
src_reg get_nir_src(const nir_src &src, nir_alu_type type,
|
||||
unsigned num_components = 4);
|
||||
src_reg get_nir_src(const nir_src &src,
|
||||
unsigned num_components = 4);
|
||||
src_reg get_nir_src_imm(const nir_src &src);
|
||||
src_reg get_indirect_offset(nir_intrinsic_instr *instr);
|
||||
|
||||
dst_reg *nir_ssa_values;
|
||||
|
||||
protected:
|
||||
void emit_vertex();
|
||||
void setup_payload_interference(struct ra_graph *g, int first_payload_node,
|
||||
int reg_node_count);
|
||||
virtual void setup_payload() = 0;
|
||||
virtual void emit_prolog() = 0;
|
||||
virtual void emit_thread_end() = 0;
|
||||
virtual void emit_urb_write_header(int mrf) = 0;
|
||||
virtual vec4_instruction *emit_urb_write_opcode(bool complete) = 0;
|
||||
virtual void gs_emit_vertex(int stream_id);
|
||||
virtual void gs_end_primitive();
|
||||
|
||||
private:
|
||||
/**
|
||||
* If true, then register allocation should fail instead of spilling.
|
||||
*/
|
||||
const bool no_spills;
|
||||
|
||||
unsigned last_scratch; /**< measured in 32-byte (register size) units */
|
||||
};
|
||||
|
||||
} /* namespace brw */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* BRW_VEC4_H */
|
||||
|
|
@ -1,646 +0,0 @@
|
|||
/* -*- c++ -*- */
|
||||
/*
|
||||
* Copyright © 2010-2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_VEC4_BUILDER_H
|
||||
#define BRW_VEC4_BUILDER_H
|
||||
|
||||
#include "brw_ir_vec4.h"
|
||||
#include "brw_ir_allocator.h"
|
||||
|
||||
namespace brw {
|
||||
/**
|
||||
* Toolbox to assemble a VEC4 IR program out of individual instructions.
|
||||
*
|
||||
* This object is meant to have an interface consistent with
|
||||
* brw::fs_builder. They cannot be fully interchangeable because
|
||||
* brw::fs_builder generates scalar code while brw::vec4_builder generates
|
||||
* vector code.
|
||||
*/
|
||||
class vec4_builder {
|
||||
public:
|
||||
/** Type used in this IR to represent a source of an instruction. */
|
||||
typedef brw::src_reg src_reg;
|
||||
|
||||
/** Type used in this IR to represent the destination of an instruction. */
|
||||
typedef brw::dst_reg dst_reg;
|
||||
|
||||
/** Type used in this IR to represent an instruction. */
|
||||
typedef vec4_instruction instruction;
|
||||
|
||||
/**
|
||||
* Construct a vec4_builder that inserts instructions into \p shader.
|
||||
*/
|
||||
vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) :
|
||||
shader(shader), block(NULL), cursor(NULL),
|
||||
_dispatch_width(dispatch_width), _group(0),
|
||||
force_writemask_all(false),
|
||||
annotation()
|
||||
{
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a vec4_builder that inserts instructions into \p shader
|
||||
* before instruction \p inst in basic block \p block. The default
|
||||
* execution controls and debug annotation are initialized from the
|
||||
* instruction passed as argument.
|
||||
*/
|
||||
vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
|
||||
shader(shader), block(block), cursor(inst),
|
||||
_dispatch_width(inst->exec_size), _group(inst->group),
|
||||
force_writemask_all(inst->force_writemask_all)
|
||||
{
|
||||
annotation.str = inst->annotation;
|
||||
annotation.ir = inst->ir;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a vec4_builder that inserts instructions before \p cursor
|
||||
* in basic block \p block, inheriting other code generation parameters
|
||||
* from this.
|
||||
*/
|
||||
vec4_builder
|
||||
at(bblock_t *block, exec_node *cursor) const
|
||||
{
|
||||
vec4_builder bld = *this;
|
||||
bld.block = block;
|
||||
bld.cursor = cursor;
|
||||
return bld;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a vec4_builder appending instructions at the end of the
|
||||
* instruction list of the shader, inheriting other code generation
|
||||
* parameters from this.
|
||||
*/
|
||||
vec4_builder
|
||||
at_end() const
|
||||
{
|
||||
return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a builder specifying the default SIMD width and group of
|
||||
* channel enable signals, inheriting other code generation parameters
|
||||
* from this.
|
||||
*
|
||||
* \p n gives the default SIMD width, \p i gives the slot group used for
|
||||
* predication and control flow masking in multiples of \p n channels.
|
||||
*/
|
||||
vec4_builder
|
||||
group(unsigned n, unsigned i) const
|
||||
{
|
||||
assert(force_writemask_all ||
|
||||
(n <= dispatch_width() && i < dispatch_width() / n));
|
||||
vec4_builder bld = *this;
|
||||
bld._dispatch_width = n;
|
||||
bld._group += i * n;
|
||||
return bld;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a builder with per-channel control flow execution masking
|
||||
* disabled if \p b is true. If control flow execution masking is
|
||||
* already disabled this has no effect.
|
||||
*/
|
||||
vec4_builder
|
||||
exec_all(bool b = true) const
|
||||
{
|
||||
vec4_builder bld = *this;
|
||||
if (b)
|
||||
bld.force_writemask_all = true;
|
||||
return bld;
|
||||
}
|
||||
|
||||
/**
|
||||
* Construct a builder with the given debug annotation info.
|
||||
*/
|
||||
vec4_builder
|
||||
annotate(const char *str, const void *ir = NULL) const
|
||||
{
|
||||
vec4_builder bld = *this;
|
||||
bld.annotation.str = str;
|
||||
bld.annotation.ir = ir;
|
||||
return bld;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the SIMD width in use.
|
||||
*/
|
||||
unsigned
|
||||
dispatch_width() const
|
||||
{
|
||||
return _dispatch_width;
|
||||
}
|
||||
|
||||
/**
|
||||
* Get the channel group in use.
|
||||
*/
|
||||
unsigned
|
||||
group() const
|
||||
{
|
||||
return _group;
|
||||
}
|
||||
|
||||
/**
|
||||
* Allocate a virtual register of natural vector size (four for this IR)
|
||||
* and SIMD width. \p n gives the amount of space to allocate in
|
||||
* dispatch_width units (which is just enough space for four logical
|
||||
* components in this IR).
|
||||
*/
|
||||
dst_reg
|
||||
vgrf(enum brw_reg_type type, unsigned n = 1) const
|
||||
{
|
||||
assert(dispatch_width() <= 32);
|
||||
|
||||
if (n > 0)
|
||||
return retype(dst_reg(VGRF, shader->alloc.allocate(
|
||||
n * DIV_ROUND_UP(type_sz(type), 4))),
|
||||
type);
|
||||
else
|
||||
return retype(null_reg_ud(), type);
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a null register of floating type.
|
||||
*/
|
||||
dst_reg
|
||||
null_reg_f() const
|
||||
{
|
||||
return dst_reg(retype(brw_null_vec(dispatch_width()),
|
||||
BRW_REGISTER_TYPE_F));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a null register of signed integer type.
|
||||
*/
|
||||
dst_reg
|
||||
null_reg_d() const
|
||||
{
|
||||
return dst_reg(retype(brw_null_vec(dispatch_width()),
|
||||
BRW_REGISTER_TYPE_D));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create a null register of unsigned integer type.
|
||||
*/
|
||||
dst_reg
|
||||
null_reg_ud() const
|
||||
{
|
||||
return dst_reg(retype(brw_null_vec(dispatch_width()),
|
||||
BRW_REGISTER_TYPE_UD));
|
||||
}
|
||||
|
||||
/**
|
||||
* Insert an instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(const instruction &inst) const
|
||||
{
|
||||
return emit(new(shader->mem_ctx) instruction(inst));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and insert a nullary control instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(enum opcode opcode) const
|
||||
{
|
||||
return emit(instruction(opcode));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and insert a nullary instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(enum opcode opcode, const dst_reg &dst) const
|
||||
{
|
||||
return emit(instruction(opcode, dst));
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and insert a unary instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
|
||||
{
|
||||
switch (opcode) {
|
||||
case SHADER_OPCODE_RCP:
|
||||
case SHADER_OPCODE_RSQ:
|
||||
case SHADER_OPCODE_SQRT:
|
||||
case SHADER_OPCODE_EXP2:
|
||||
case SHADER_OPCODE_LOG2:
|
||||
case SHADER_OPCODE_SIN:
|
||||
case SHADER_OPCODE_COS:
|
||||
return fix_math_instruction(
|
||||
emit(instruction(opcode, dst,
|
||||
fix_math_operand(src0))));
|
||||
|
||||
default:
|
||||
return emit(instruction(opcode, dst, src0));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and insert a binary instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
|
||||
const src_reg &src1) const
|
||||
{
|
||||
switch (opcode) {
|
||||
case SHADER_OPCODE_POW:
|
||||
case SHADER_OPCODE_INT_QUOTIENT:
|
||||
case SHADER_OPCODE_INT_REMAINDER:
|
||||
return fix_math_instruction(
|
||||
emit(instruction(opcode, dst,
|
||||
fix_math_operand(src0),
|
||||
fix_math_operand(src1))));
|
||||
|
||||
default:
|
||||
return emit(instruction(opcode, dst, src0, src1));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Create and insert a ternary instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
|
||||
const src_reg &src1, const src_reg &src2) const
|
||||
{
|
||||
switch (opcode) {
|
||||
case BRW_OPCODE_BFE:
|
||||
case BRW_OPCODE_BFI2:
|
||||
case BRW_OPCODE_MAD:
|
||||
case BRW_OPCODE_LRP:
|
||||
return emit(instruction(opcode, dst,
|
||||
fix_3src_operand(src0),
|
||||
fix_3src_operand(src1),
|
||||
fix_3src_operand(src2)));
|
||||
|
||||
default:
|
||||
return emit(instruction(opcode, dst, src0, src1, src2));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Insert a preallocated instruction into the program.
|
||||
*/
|
||||
instruction *
|
||||
emit(instruction *inst) const
|
||||
{
|
||||
inst->exec_size = dispatch_width();
|
||||
inst->group = group();
|
||||
inst->force_writemask_all = force_writemask_all;
|
||||
inst->size_written = inst->exec_size * type_sz(inst->dst.type);
|
||||
inst->annotation = annotation.str;
|
||||
inst->ir = annotation.ir;
|
||||
|
||||
if (block)
|
||||
static_cast<instruction *>(cursor)->insert_before(block, inst);
|
||||
else
|
||||
cursor->insert_before(inst);
|
||||
|
||||
return inst;
|
||||
}
|
||||
|
||||
/**
|
||||
* Select \p src0 if the comparison of both sources with the given
|
||||
* conditional mod evaluates to true, otherwise select \p src1.
|
||||
*
|
||||
* Generally useful to get the minimum or maximum of two values.
|
||||
*/
|
||||
instruction *
|
||||
emit_minmax(const dst_reg &dst, const src_reg &src0,
|
||||
const src_reg &src1, brw_conditional_mod mod) const
|
||||
{
|
||||
assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
|
||||
|
||||
return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
|
||||
fix_unsigned_negate(src1)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Copy any live channel from \p src to the first channel of the result.
|
||||
*/
|
||||
src_reg
|
||||
emit_uniformize(const src_reg &src) const
|
||||
{
|
||||
const vec4_builder ubld = exec_all();
|
||||
const dst_reg chan_index =
|
||||
writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
|
||||
const dst_reg dst = vgrf(src.type);
|
||||
|
||||
ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
|
||||
ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));
|
||||
|
||||
return src_reg(dst);
|
||||
}
|
||||
|
||||
/**
|
||||
* Assorted arithmetic ops.
|
||||
* @{
|
||||
*/
|
||||
#define ALU1(op) \
|
||||
instruction * \
|
||||
op(const dst_reg &dst, const src_reg &src0) const \
|
||||
{ \
|
||||
return emit(BRW_OPCODE_##op, dst, src0); \
|
||||
}
|
||||
|
||||
#define ALU2(op) \
|
||||
instruction * \
|
||||
op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
|
||||
{ \
|
||||
return emit(BRW_OPCODE_##op, dst, src0, src1); \
|
||||
}
|
||||
|
||||
#define ALU2_ACC(op) \
|
||||
instruction * \
|
||||
op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
|
||||
{ \
|
||||
instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \
|
||||
inst->writes_accumulator = true; \
|
||||
return inst; \
|
||||
}
|
||||
|
||||
#define ALU3(op) \
|
||||
instruction * \
|
||||
op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
|
||||
const src_reg &src2) const \
|
||||
{ \
|
||||
return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
|
||||
}
|
||||
|
||||
ALU2(ADD)
|
||||
ALU2_ACC(ADDC)
|
||||
ALU2(AND)
|
||||
ALU2(ASR)
|
||||
ALU2(AVG)
|
||||
ALU3(BFE)
|
||||
ALU2(BFI1)
|
||||
ALU3(BFI2)
|
||||
ALU1(BFREV)
|
||||
ALU1(CBIT)
|
||||
ALU3(CSEL)
|
||||
ALU1(DIM)
|
||||
ALU2(DP2)
|
||||
ALU2(DP3)
|
||||
ALU2(DP4)
|
||||
ALU2(DPH)
|
||||
ALU1(F16TO32)
|
||||
ALU1(F32TO16)
|
||||
ALU1(FBH)
|
||||
ALU1(FBL)
|
||||
ALU1(FRC)
|
||||
ALU2(LINE)
|
||||
ALU1(LZD)
|
||||
ALU2(MAC)
|
||||
ALU2_ACC(MACH)
|
||||
ALU3(MAD)
|
||||
ALU1(MOV)
|
||||
ALU2(MUL)
|
||||
ALU1(NOT)
|
||||
ALU2(OR)
|
||||
ALU2(PLN)
|
||||
ALU1(RNDD)
|
||||
ALU1(RNDE)
|
||||
ALU1(RNDU)
|
||||
ALU1(RNDZ)
|
||||
ALU2(SAD2)
|
||||
ALU2_ACC(SADA2)
|
||||
ALU2(SEL)
|
||||
ALU2(SHL)
|
||||
ALU2(SHR)
|
||||
ALU2_ACC(SUBB)
|
||||
ALU2(XOR)
|
||||
|
||||
#undef ALU3
|
||||
#undef ALU2_ACC
|
||||
#undef ALU2
|
||||
#undef ALU1
|
||||
/** @} */
|
||||
|
||||
/**
|
||||
* CMP: Sets the low bit of the destination channels with the result
|
||||
* of the comparison, while the upper bits are undefined, and updates
|
||||
* the flag register with the packed 16 bits of the result.
|
||||
*/
|
||||
instruction *
|
||||
CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
|
||||
brw_conditional_mod condition) const
|
||||
{
|
||||
/* Take the instruction:
|
||||
*
|
||||
* CMP null<d> src0<f> src1<f>
|
||||
*
|
||||
* Original gfx4 does type conversion to the destination type
|
||||
* before comparison, producing garbage results for floating
|
||||
* point comparisons.
|
||||
*
|
||||
* The destination type doesn't matter on newer generations,
|
||||
* so we set the type to match src0 so we can compact the
|
||||
* instruction.
|
||||
*/
|
||||
return set_condmod(condition,
|
||||
emit(BRW_OPCODE_CMP, retype(dst, src0.type),
|
||||
fix_unsigned_negate(src0),
|
||||
fix_unsigned_negate(src1)));
|
||||
}
|
||||
|
||||
/**
|
||||
* CMPN: Behaves like CMP, but produces true if src1 is NaN.
|
||||
*/
|
||||
instruction *
|
||||
CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
|
||||
brw_conditional_mod condition) const
|
||||
{
|
||||
/* Take the instruction:
|
||||
*
|
||||
* CMPN null<d> src0<f> src1<f>
|
||||
*
|
||||
* Original gfx4 does type conversion to the destination type
|
||||
* before comparison, producing garbage results for floating
|
||||
* point comparisons.
|
||||
*
|
||||
* The destination type doesn't matter on newer generations,
|
||||
* so we set the type to match src0 so we can compact the
|
||||
* instruction.
|
||||
*/
|
||||
return set_condmod(condition,
|
||||
emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
|
||||
fix_unsigned_negate(src0),
|
||||
fix_unsigned_negate(src1)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Gfx4 predicated IF.
|
||||
*/
|
||||
instruction *
|
||||
IF(brw_predicate predicate) const
|
||||
{
|
||||
return set_predicate(predicate, emit(BRW_OPCODE_IF));
|
||||
}
|
||||
|
||||
/**
|
||||
* Gfx6 IF with embedded comparison.
|
||||
*/
|
||||
instruction *
|
||||
IF(const src_reg &src0, const src_reg &src1,
|
||||
brw_conditional_mod condition) const
|
||||
{
|
||||
assert(shader->devinfo->ver == 6);
|
||||
return set_condmod(condition,
|
||||
emit(BRW_OPCODE_IF,
|
||||
null_reg_d(),
|
||||
fix_unsigned_negate(src0),
|
||||
fix_unsigned_negate(src1)));
|
||||
}
|
||||
|
||||
/**
|
||||
* Emit a linear interpolation instruction.
|
||||
*/
|
||||
instruction *
|
||||
LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
|
||||
const src_reg &a) const
|
||||
{
|
||||
/* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
|
||||
* we need to reorder the operands.
|
||||
*/
|
||||
assert(shader->devinfo->ver >= 6 && shader->devinfo->ver <= 9);
|
||||
return emit(BRW_OPCODE_LRP, dst, a, y, x);
|
||||
}
|
||||
|
||||
backend_shader *shader;
|
||||
|
||||
protected:
|
||||
/**
|
||||
* Workaround for negation of UD registers. See comment in
|
||||
* fs_generator::generate_code() for the details.
|
||||
*/
|
||||
src_reg
|
||||
fix_unsigned_negate(const src_reg &src) const
|
||||
{
|
||||
if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
|
||||
dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
|
||||
MOV(temp, src);
|
||||
return src_reg(temp);
|
||||
} else {
|
||||
return src;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Workaround for register access modes not supported by the ternary
|
||||
* instruction encoding.
|
||||
*/
|
||||
src_reg
|
||||
fix_3src_operand(const src_reg &src) const
|
||||
{
|
||||
/* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
|
||||
* able to use vertical stride of zero to replicate the vec4 uniform, like
|
||||
*
|
||||
* g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
|
||||
*
|
||||
* But you can't, since vertical stride is always four in three-source
|
||||
* instructions. Instead, insert a MOV instruction to do the replication so
|
||||
* that the three-source instruction can consume it.
|
||||
*/
|
||||
|
||||
/* The MOV is only needed if the source is a uniform or immediate. */
|
||||
if (src.file != UNIFORM && src.file != IMM)
|
||||
return src;
|
||||
|
||||
if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
|
||||
return src;
|
||||
|
||||
const dst_reg expanded = vgrf(src.type);
|
||||
emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
|
||||
return src_reg(expanded);
|
||||
}
|
||||
|
||||
/**
|
||||
* Workaround for register access modes not supported by the math
|
||||
* instruction.
|
||||
*/
|
||||
src_reg
|
||||
fix_math_operand(const src_reg &src) const
|
||||
{
|
||||
/* The gfx6 math instruction ignores the source modifiers --
|
||||
* swizzle, abs, negate, and at least some parts of the register
|
||||
* region description.
|
||||
*
|
||||
* Rather than trying to enumerate all these cases, *always* expand the
|
||||
* operand to a temp GRF for gfx6.
|
||||
*
|
||||
* For gfx7, keep the operand as-is, except if immediate, which gfx7 still
|
||||
* can't use.
|
||||
*/
|
||||
if (shader->devinfo->ver == 6 ||
|
||||
(shader->devinfo->ver == 7 && src.file == IMM)) {
|
||||
const dst_reg tmp = vgrf(src.type);
|
||||
MOV(tmp, src);
|
||||
return src_reg(tmp);
|
||||
} else {
|
||||
return src;
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Workaround other weirdness of the math instruction.
 *
 * Two hardware-generation fixups:
 *  - gfx6: a math instruction with a partial writemask gets redirected to
 *    write a full temporary VGRF, with a MOV copying the result back into
 *    the original (partially-masked) destination.
 *  - pre-gfx6: math is a message to the shared function unit, so the MRF
 *    base and message length must be set from the number of sources.
 *
 * Returns the (possibly modified) instruction for chaining.
 */
instruction *
fix_math_instruction(instruction *inst) const
{
   if (shader->devinfo->ver == 6 &&
       inst->dst.writemask != WRITEMASK_XYZW) {
      const dst_reg tmp = vgrf(inst->dst.type);
      /* NOTE(review): the MOV from tmp to the original dst is emitted at the
       * builder's cursor, then inst is redirected to write tmp — this relies
       * on the cursor sitting after inst so the copy executes afterwards;
       * confirm against the builder's emit/cursor semantics.
       */
      MOV(inst->dst, src_reg(tmp));
      inst->dst = tmp;

   } else if (shader->devinfo->ver < 6) {
      /* One or two payload registers depending on whether src1 is present. */
      const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
      inst->base_mrf = 1;
      inst->mlen = sources;
   }

   return inst;
}
|
||||
|
||||
bblock_t *block;
|
||||
exec_node *cursor;
|
||||
|
||||
unsigned _dispatch_width;
|
||||
unsigned _group;
|
||||
bool force_writemask_all;
|
||||
|
||||
/** Debug annotation info. */
|
||||
struct {
|
||||
const char *str;
|
||||
const void *ir;
|
||||
} annotation;
|
||||
};
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -1,365 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
/** @file brw_vec4_cmod_propagation.cpp
|
||||
*
|
||||
* Really similar to brw_fs_cmod_propagation but adapted to vec4 needs. Check
|
||||
* brw_fs_cmod_propagation for further details on the rationale behind this
|
||||
* optimization.
|
||||
*/
|
||||
|
||||
#include "brw_vec4.h"
|
||||
#include "brw_cfg.h"
|
||||
#include "brw_eu.h"
|
||||
|
||||
namespace brw {
|
||||
|
||||
static bool
|
||||
writemasks_incompatible(const vec4_instruction *earlier,
|
||||
const vec4_instruction *later)
|
||||
{
|
||||
return (earlier->dst.writemask != WRITEMASK_X &&
|
||||
earlier->dst.writemask != WRITEMASK_XYZW) ||
|
||||
(earlier->dst.writemask == WRITEMASK_XYZW &&
|
||||
later->src[0].swizzle != BRW_SWIZZLE_XYZW) ||
|
||||
(later->dst.writemask & ~earlier->dst.writemask) != 0;
|
||||
}
|
||||
|
||||
/* Attempt conditional-mod propagation within a single basic block.
 *
 * Walks the block in reverse looking for null-destination AND/CMP/MOV
 * instructions whose only purpose is to set the flag register, and tries to
 * fold their conditional mod onto the instruction that produced the value
 * being tested, deleting the flag-setting instruction.  Returns true if any
 * instruction was removed or modified.
 */
static bool
opt_cmod_propagation_local(bblock_t *block, vec4_visitor *v)
{
   bool progress = false;
   UNUSED int ip = block->end_ip + 1;

   foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) {
      ip--;

      /* Candidates are unpredicated AND/CMP/MOV instructions with a null
       * destination (flag-write only) reading a register-backed source.
       */
      if ((inst->opcode != BRW_OPCODE_AND &&
           inst->opcode != BRW_OPCODE_CMP &&
           inst->opcode != BRW_OPCODE_MOV) ||
          inst->predicate != BRW_PREDICATE_NONE ||
          !inst->dst.is_null() ||
          (inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
           inst->src[0].file != UNIFORM))
         continue;

      /* An ABS source modifier can only be handled when processing a compare
       * with a value other than zero.
       */
      if (inst->src[0].abs &&
          (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
         continue;

      /* AND is only usable as an "is the low bit set" NZ test. */
      if (inst->opcode == BRW_OPCODE_AND &&
          !(inst->src[1].is_one() &&
            inst->conditional_mod == BRW_CONDITIONAL_NZ &&
            !inst->src[0].negate))
         continue;

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->conditional_mod != BRW_CONDITIONAL_NZ)
         continue;

      bool read_flag = false;
      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) {
         /* A CMP with a second source of zero can match with anything.  A CMP
          * with a second source that is not zero can only match with an ADD
          * instruction.
          */
         if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
            bool negate;

            if (scan_inst->opcode != BRW_OPCODE_ADD)
               goto not_match;

            if (writemasks_incompatible(scan_inst, inst))
               goto not_match;

            /* A CMP is basically a subtraction.  The result of the
             * subtraction must be the same as the result of the addition.
             * This means that one of the operands must be negated.  So (a +
             * b) vs (a == -b) or (a + -b) vs (a == b).
             */
            if ((inst->src[0].equals(scan_inst->src[0]) &&
                 inst->src[1].negative_equals(scan_inst->src[1])) ||
                (inst->src[0].equals(scan_inst->src[1]) &&
                 inst->src[1].negative_equals(scan_inst->src[0]))) {
               negate = false;
            } else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
                        inst->src[1].equals(scan_inst->src[1])) ||
                       (inst->src[0].negative_equals(scan_inst->src[1]) &&
                        inst->src[1].equals(scan_inst->src[0]))) {
               negate = true;
            } else {
               goto not_match;
            }

            if (scan_inst->exec_size != inst->exec_size ||
                scan_inst->group != inst->group)
               goto not_match;

            /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
             *
             *    * Note that the [post condition signal] bits generated at
             *      the output of a compute are before the .sat.
             *
             * So we don't have to bail if scan_inst has saturate.
             */

            /* Otherwise, try propagating the conditional. */
            const enum brw_conditional_mod cond =
               negate ? brw_swap_cmod(inst->conditional_mod)
                      : inst->conditional_mod;

            if (scan_inst->can_do_cmod() &&
                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
                 scan_inst->conditional_mod == cond)) {
               scan_inst->conditional_mod = cond;
               inst->remove(block);
               progress = true;
            }
            break;
         }

         if (regions_overlap(inst->src[0], inst->size_read(0),
                             scan_inst->dst, scan_inst->size_written)) {
            /* The producer must be unpredicated (SEL excepted), write the
             * exact region we read, and match execution size and group.
             */
            if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) ||
                scan_inst->dst.offset != inst->src[0].offset ||
                scan_inst->exec_size != inst->exec_size ||
                scan_inst->group != inst->group) {
               break;
            }

            /* If scan_inst is a CMP that produces a single value and inst is
             * a CMP.NZ that consumes only that value, remove inst.
             */
            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
                (inst->src[0].type == BRW_REGISTER_TYPE_D ||
                 inst->src[0].type == BRW_REGISTER_TYPE_UD) &&
                (inst->opcode == BRW_OPCODE_CMP ||
                 inst->opcode == BRW_OPCODE_MOV) &&
                scan_inst->opcode == BRW_OPCODE_CMP &&
                ((inst->src[0].swizzle == BRW_SWIZZLE_XXXX &&
                  scan_inst->dst.writemask == WRITEMASK_X) ||
                 (inst->src[0].swizzle == BRW_SWIZZLE_YYYY &&
                  scan_inst->dst.writemask == WRITEMASK_Y) ||
                 (inst->src[0].swizzle == BRW_SWIZZLE_ZZZZ &&
                  scan_inst->dst.writemask == WRITEMASK_Z) ||
                 (inst->src[0].swizzle == BRW_SWIZZLE_WWWW &&
                  scan_inst->dst.writemask == WRITEMASK_W))) {
               if (inst->dst.writemask != scan_inst->dst.writemask) {
                  src_reg temp(v, glsl_vec4_type(), 1);

                  /* Given a sequence like:
                   *
                   *    cmp.ge.f0(8)  g21<1>.zF  g20<4>.xF  g18<4>.xF
                   *    ...
                   *    cmp.nz.f0(8)  null<1>D   g21<4>.zD  0D
                   *
                   * Replace it with something like:
                   *
                   *    cmp.ge.f0(8)  g22<1>.zF  g20<4>.xF  g18<4>.xF
                   *    mov(8)        g21<1>.xF  g22<1>.zzzzF
                   *
                   * The added MOV will most likely be removed later.  In the
                   * worst case, it should be cheaper to schedule.
                   */
                  temp.swizzle = brw_swizzle_for_mask(inst->dst.writemask);
                  temp.type = scan_inst->src[0].type;

                  vec4_instruction *mov = v->MOV(scan_inst->dst, temp);

                  /* Modify the source swizzles on scan_inst.  If scan_inst
                   * was
                   *
                   *    cmp.ge.f0(8)  g21<1>.zF  g20<4>.wzyxF  g18<4>.yxwzF
                   *
                   * replace it with
                   *
                   *    cmp.ge.f0(8)  g21<1>.zF  g20<4>.yyyyF  g18<4>.wwwwF
                   */
                  unsigned src0_chan;
                  unsigned src1_chan;
                  switch (scan_inst->dst.writemask) {
                  case WRITEMASK_X:
                     src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 0);
                     src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 0);
                     break;
                  case WRITEMASK_Y:
                     src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 1);
                     src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 1);
                     break;
                  case WRITEMASK_Z:
                     src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 2);
                     src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 2);
                     break;
                  case WRITEMASK_W:
                     src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 3);
                     src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 3);
                     break;
                  default:
                     unreachable("Impossible writemask");
                  }

                  scan_inst->src[0].swizzle = BRW_SWIZZLE4(src0_chan,
                                                           src0_chan,
                                                           src0_chan,
                                                           src0_chan);

                  /* There's no swizzle on immediate value sources. */
                  if (scan_inst->src[1].file != IMM) {
                     scan_inst->src[1].swizzle = BRW_SWIZZLE4(src1_chan,
                                                              src1_chan,
                                                              src1_chan,
                                                              src1_chan);
                  }

                  scan_inst->dst = dst_reg(temp);
                  scan_inst->dst.writemask = inst->dst.writemask;

                  scan_inst->insert_after(block, mov);
               }

               inst->remove(block);
               progress = true;
               break;
            }

            if (writemasks_incompatible(scan_inst, inst))
               break;

            /* CMP's result is the same regardless of dest type. */
            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
                scan_inst->opcode == BRW_OPCODE_CMP &&
                (inst->dst.type == BRW_REGISTER_TYPE_D ||
                 inst->dst.type == BRW_REGISTER_TYPE_UD)) {
               inst->remove(block);
               progress = true;
               break;
            }

            /* If the AND wasn't handled by the previous case, it isn't safe
             * to remove it.
             */
            if (inst->opcode == BRW_OPCODE_AND)
               break;

            /* Comparisons operate differently for ints and floats */
            if (scan_inst->dst.type != inst->dst.type &&
                (scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
                 inst->dst.type == BRW_REGISTER_TYPE_F))
               break;

            /* If the instruction generating inst's source also wrote the
             * flag, and inst is doing a simple .nz comparison, then inst
             * is redundant - the appropriate value is already in the flag
             * register.  Delete inst.
             */
            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
                !inst->src[0].negate &&
                scan_inst->writes_flag(v->devinfo)) {
               inst->remove(block);
               progress = true;
               break;
            }

            /* The conditional mod of the CMP/CMPN instructions behaves
             * specially because the flag output is not calculated from the
             * result of the instruction, but the other way around, which
             * means that even if the condmod to propagate and the condmod
             * from the CMP instruction are the same they will in general give
             * different results because they are evaluated based on different
             * inputs.
             */
            if (scan_inst->opcode == BRW_OPCODE_CMP ||
                scan_inst->opcode == BRW_OPCODE_CMPN)
               break;

            /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
             *
             *    * Note that the [post condition signal] bits generated at
             *      the output of a compute are before the .sat.
             */
            if (scan_inst->saturate)
               break;

            /* From the Sky Lake PRM, Vol 2a, "Multiply":
             *
             *    "When multiplying integer data types, if one of the sources
             *    is a DW, the resulting full precision data is stored in
             *    the accumulator.  However, if the destination data type is
             *    either W or DW, the low bits of the result are written to
             *    the destination register and the remaining high bits are
             *    discarded.  This results in undefined Overflow and Sign
             *    flags.  Therefore, conditional modifiers and saturation
             *    (.sat) cannot be used in this case.
             *
             * We just disallow cmod propagation on all integer multiplies.
             */
            if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
                scan_inst->opcode == BRW_OPCODE_MUL)
               break;

            /* Otherwise, try propagating the conditional. */
            enum brw_conditional_mod cond =
               inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
                                   : inst->conditional_mod;

            if (scan_inst->can_do_cmod() &&
                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
                 scan_inst->conditional_mod == cond)) {
               scan_inst->conditional_mod = cond;
               inst->remove(block);
               progress = true;
            }
            break;
         }

      not_match:
         /* Stop scanning once another flag writer is reached; remember any
          * intermediate flag readers, since overwriting an unrelated cmod
          * would clobber the flag value they consume.
          */
         if (scan_inst->writes_flag(v->devinfo))
            break;

         read_flag = read_flag || scan_inst->reads_flag();
      }
   }

   return progress;
}
|
||||
|
||||
bool
|
||||
vec4_visitor::opt_cmod_propagation()
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
foreach_block_reverse(block, cfg) {
|
||||
progress = opt_cmod_propagation_local(block, this) || progress;
|
||||
}
|
||||
|
||||
if (progress)
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
} /* namespace brw */
|
||||
|
|
@ -1,556 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2011 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file brw_vec4_copy_propagation.cpp
|
||||
*
|
||||
* Implements tracking of values copied between registers, and
|
||||
* optimizations based on that: copy propagation and constant
|
||||
* propagation.
|
||||
*/
|
||||
|
||||
#include "brw_vec4.h"
|
||||
#include "brw_cfg.h"
|
||||
#include "brw_eu.h"
|
||||
|
||||
namespace brw {
|
||||
|
||||
/* Tracking record for one GRF-sized register slot during copy propagation. */
struct copy_entry {
   /* Per-channel pointer to the source of the MOV that last wrote each
    * component (X..W), or NULL when the channel's value is unknown.
    */
   src_reg *value[4];
   /* Bitmask of channels whose tracked value was written with .sat. */
   int saturatemask;
};
|
||||
|
||||
static bool
|
||||
is_direct_copy(vec4_instruction *inst)
|
||||
{
|
||||
return (inst->opcode == BRW_OPCODE_MOV &&
|
||||
!inst->predicate &&
|
||||
inst->dst.file == VGRF &&
|
||||
inst->dst.offset % REG_SIZE == 0 &&
|
||||
!inst->dst.reladdr &&
|
||||
!inst->src[0].reladdr &&
|
||||
(inst->dst.type == inst->src[0].type ||
|
||||
(inst->dst.type == BRW_REGISTER_TYPE_F &&
|
||||
inst->src[0].type == BRW_REGISTER_TYPE_VF)));
|
||||
}
|
||||
|
||||
static bool
|
||||
is_dominated_by_previous_instruction(vec4_instruction *inst)
|
||||
{
|
||||
return (inst->opcode != BRW_OPCODE_DO &&
|
||||
inst->opcode != BRW_OPCODE_WHILE &&
|
||||
inst->opcode != BRW_OPCODE_ELSE &&
|
||||
inst->opcode != BRW_OPCODE_ENDIF);
|
||||
}
|
||||
|
||||
static bool
|
||||
is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch)
|
||||
{
|
||||
const src_reg *src = values[ch];
|
||||
|
||||
/* consider GRF only */
|
||||
assert(inst->dst.file == VGRF);
|
||||
if (!src || src->file != VGRF)
|
||||
return false;
|
||||
|
||||
return regions_overlap(*src, REG_SIZE, inst->dst, inst->size_written) &&
|
||||
(inst->dst.offset != src->offset ||
|
||||
inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch)));
|
||||
}
|
||||
|
||||
/**
 * Get the origin of a copy as a single register if all components present in
 * the given readmask originate from the same register and have compatible
 * regions, otherwise return a BAD_FILE register.
 */
static src_reg
get_copy_value(const copy_entry &entry, unsigned readmask)
{
   /* Per-channel origin swizzle, filled in as components are examined. */
   unsigned swz[4] = {};
   src_reg value;

   for (unsigned i = 0; i < 4; i++) {
      if (readmask & (1 << i)) {
         if (entry.value[i]) {
            src_reg src = *entry.value[i];

            if (src.file == IMM) {
               /* Immediates carry no swizzle; use the identity channel. */
               swz[i] = i;
            } else {
               swz[i] = BRW_GET_SWZ(src.swizzle, i);
               /* Overwrite the original swizzle so the src_reg::equals call
                * below doesn't care about it, the correct swizzle will be
                * calculated once the swizzles of all components are known.
                */
               src.swizzle = BRW_SWIZZLE_XYZW;
            }

            if (value.file == BAD_FILE) {
               value = src;
            } else if (!value.equals(src)) {
               /* Components come from different registers/regions: give up. */
               return src_reg();
            }
         } else {
            /* Channel value unknown: no single origin exists. */
            return src_reg();
         }
      }
   }

   /* Fold the per-channel origin swizzles back into a single swizzle on the
    * merged value, restricted to the requested readmask.
    */
   return swizzle(value,
                  brw_compose_swizzle(brw_swizzle_for_mask(readmask),
                                      BRW_SWIZZLE4(swz[0], swz[1],
                                                   swz[2], swz[3])));
}
|
||||
|
||||
/* Try to replace source `arg` of `inst` with an immediate tracked in
 * `entry`.  Returns true (and rewrites the source in place) on success.
 * Only opcodes whose encoding accepts an immediate in the relevant operand
 * position are handled; for commutative ALU ops and CMP/SEL the operands
 * may be swapped to fit the constant into src1.
 */
static bool
try_constant_propagate(vec4_instruction *inst,
                       int arg, const copy_entry *entry)
{
   /* For constant propagation, we only handle the same constant
    * across all 4 channels.  Some day, we should handle the 8-bit
    * float vector format, which would let us constant propagate
    * vectors better.
    * We could be more aggressive here -- some channels might not get used
    * based on the destination writemask.
    */
   src_reg value =
      get_copy_value(*entry,
                     brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
                                                   WRITEMASK_XYZW));

   if (value.file != IMM)
      return false;

   /* 64-bit types can't be used except for one-source instructions, which
    * higher levels should have constant folded away, so there's no point in
    * propagating immediates here.
    */
   if (type_sz(value.type) == 8 || type_sz(inst->src[arg].type) == 8)
      return false;

   if (value.type == BRW_REGISTER_TYPE_VF) {
      /* The result of bit-casting the component values of a vector float
       * cannot in general be represented as an immediate.
       */
      if (inst->src[arg].type != BRW_REGISTER_TYPE_F)
         return false;
   } else {
      value.type = inst->src[arg].type;
   }

   /* Fold abs/negate modifiers into the immediate itself; bail if the
    * immediate type does not support the folded form.
    */
   if (inst->src[arg].abs) {
      if (!brw_abs_immediate(value.type, &value.as_brw_reg()))
         return false;
   }

   if (inst->src[arg].negate) {
      if (!brw_negate_immediate(value.type, &value.as_brw_reg()))
         return false;
   }

   value = swizzle(value, inst->src[arg].swizzle);

   switch (inst->opcode) {
   case BRW_OPCODE_MOV:
   case SHADER_OPCODE_BROADCAST:
      inst->src[arg] = value;
      return true;

   case VEC4_OPCODE_UNTYPED_ATOMIC:
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      }
      break;

   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   case BRW_OPCODE_DP2:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DPH:
   case BRW_OPCODE_BFI1:
   case BRW_OPCODE_ASR:
   case BRW_OPCODE_SHL:
   case BRW_OPCODE_SHR:
   case BRW_OPCODE_SUBB:
      /* Non-commutative: the immediate can only go in src1. */
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      }
      break;

   case BRW_OPCODE_MACH:
   case BRW_OPCODE_MUL:
   case SHADER_OPCODE_MULH:
   case BRW_OPCODE_ADD:
   case BRW_OPCODE_OR:
   case BRW_OPCODE_AND:
   case BRW_OPCODE_XOR:
   case BRW_OPCODE_ADDC:
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      } else if (arg == 0 && inst->src[1].file != IMM) {
         /* Fit this constant in by commuting the operands.  Exception: we
          * can't do this for 32-bit integer MUL/MACH because it's asymmetric.
          */
         if ((inst->opcode == BRW_OPCODE_MUL ||
              inst->opcode == BRW_OPCODE_MACH) &&
             (inst->src[1].type == BRW_REGISTER_TYPE_D ||
              inst->src[1].type == BRW_REGISTER_TYPE_UD))
            break;
         inst->src[0] = inst->src[1];
         inst->src[1] = value;
         return true;
      }
      break;
   case GS_OPCODE_SET_WRITE_OFFSET:
      /* This is just a multiply by a constant with special strides.
       * The generator will handle immediates in both arguments (generating
       * a single MOV of the product).  So feel free to propagate in src0.
       */
      inst->src[arg] = value;
      return true;

   case BRW_OPCODE_CMP:
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      } else if (arg == 0 && inst->src[1].file != IMM) {
         enum brw_conditional_mod new_cmod;

         new_cmod = brw_swap_cmod(inst->conditional_mod);
         if (new_cmod != BRW_CONDITIONAL_NONE) {
            /* Fit this constant in by swapping the operands and
             * flipping the test.
             */
            inst->src[0] = inst->src[1];
            inst->src[1] = value;
            inst->conditional_mod = new_cmod;
            return true;
         }
      }
      break;

   case BRW_OPCODE_SEL:
      if (arg == 1) {
         inst->src[arg] = value;
         return true;
      } else if (arg == 0 && inst->src[1].file != IMM) {
         inst->src[0] = inst->src[1];
         inst->src[1] = value;

         /* If this was predicated, flipping operands means
          * we also need to flip the predicate.
          */
         if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
            inst->predicate_inverse = !inst->predicate_inverse;
         }
         return true;
      }
      break;

   default:
      break;
   }

   return false;
}
|
||||
|
||||
static bool
|
||||
is_align1_opcode(unsigned opcode)
|
||||
{
|
||||
switch (opcode) {
|
||||
case VEC4_OPCODE_DOUBLE_TO_F32:
|
||||
case VEC4_OPCODE_DOUBLE_TO_D32:
|
||||
case VEC4_OPCODE_DOUBLE_TO_U32:
|
||||
case VEC4_OPCODE_TO_DOUBLE:
|
||||
case VEC4_OPCODE_PICK_LOW_32BIT:
|
||||
case VEC4_OPCODE_PICK_HIGH_32BIT:
|
||||
case VEC4_OPCODE_SET_LOW_32BIT:
|
||||
case VEC4_OPCODE_SET_HIGH_32BIT:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/* Try to replace source `arg` of `inst` with the register value tracked in
 * `entry`, composing swizzles and folding abs/negate modifiers.  Returns
 * true on success.  The long sequence of early-outs rejects cases that
 * would violate hardware regioning, source-modifier, or type restrictions.
 */
static bool
try_copy_propagate(const struct brw_compiler *compiler,
                   vec4_instruction *inst, int arg,
                   const copy_entry *entry, int attributes_per_reg)
{
   const struct intel_device_info *devinfo = compiler->devinfo;

   /* Build up the value we are propagating as if it were the source of a
    * single MOV
    */
   src_reg value =
      get_copy_value(*entry,
                     brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
                                                   WRITEMASK_XYZW));

   /* Check that we can propagate that value */
   if (value.file != UNIFORM &&
       value.file != VGRF &&
       value.file != ATTR)
      return false;

   /* Instructions that write 2 registers also need to read 2 registers. Make
    * sure we don't break that restriction by copy propagating from a uniform.
    */
   if (inst->size_written > REG_SIZE && is_uniform(value))
      return false;

   /* There is a regioning restriction such that if execsize == width
    * and hstride != 0 then the vstride can't be 0. When we split instrutions
    * that take a single-precision source (like F->DF conversions) we end up
    * with a 4-wide source on an instruction with an execution size of 4.
    * If we then copy-propagate the source from a uniform we also end up with a
    * vstride of 0 and we violate the restriction.
    */
   if (inst->exec_size == 4 && value.file == UNIFORM &&
       type_sz(value.type) == 4)
      return false;

   /* If the type of the copy value is different from the type of the
    * instruction then the swizzles and writemasks involved don't have the same
    * meaning and simply replacing the source would produce different semantics.
    */
   if (type_sz(value.type) != type_sz(inst->src[arg].type))
      return false;

   /* Only register-aligned values are tracked/propagated. */
   if (inst->src[arg].offset % REG_SIZE || value.offset % REG_SIZE)
      return false;

   bool has_source_modifiers = value.negate || value.abs;

   /* gfx6 math and gfx7+ SENDs from GRFs ignore source modifiers on
    * instructions.
    */
   if (has_source_modifiers && !inst->can_do_source_mods(devinfo))
      return false;

   /* Reject cases that would violate register regioning restrictions. */
   if ((value.file == UNIFORM || value.swizzle != BRW_SWIZZLE_XYZW) &&
       ((devinfo->ver == 6 && inst->is_math()) ||
        inst->is_send_from_grf() ||
        inst->uses_indirect_addressing())) {
      return false;
   }

   if (has_source_modifiers &&
       value.type != inst->src[arg].type &&
       !inst->can_change_types())
      return false;

   if (has_source_modifiers &&
       (inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_WRITE ||
        inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT))
      return false;

   unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle,
                                                   value.swizzle);

   /* Instructions that operate on vectors in ALIGN1 mode will ignore swizzles
    * so copy-propagation won't be safe if the composed swizzle is anything
    * other than the identity.
    */
   if (is_align1_opcode(inst->opcode) && composed_swizzle != BRW_SWIZZLE_XYZW)
      return false;

   /* Three-source instructions can only replicate scalar uniforms/attributes
    * (vertical stride is fixed at four — see fix_3src_operand).
    */
   if (inst->is_3src(compiler) &&
       (value.file == UNIFORM ||
        (value.file == ATTR && attributes_per_reg != 1)) &&
       !brw_is_single_value_swizzle(composed_swizzle))
      return false;

   if (inst->is_send_from_grf())
      return false;

   /* we can't generally copy-propagate UD negations because we
    * end up accessing the resulting values as signed integers
    * instead. See also resolve_ud_negate().
    */
   if (value.negate &&
       value.type == BRW_REGISTER_TYPE_UD)
      return false;

   /* Don't report progress if this is a noop. */
   if (value.equals(inst->src[arg]))
      return false;

   const unsigned dst_saturate_mask = inst->dst.writemask &
      brw_apply_swizzle_to_mask(inst->src[arg].swizzle, entry->saturatemask);

   if (dst_saturate_mask) {
      /* We either saturate all or nothing. */
      if (dst_saturate_mask != inst->dst.writemask)
         return false;

      /* Limit saturate propagation only to SEL with src1 bounded within 0.0
       * and 1.0, otherwise skip copy propagate altogether.
       */
      switch(inst->opcode) {
      case BRW_OPCODE_SEL:
         if (arg != 0 ||
             inst->src[0].type != BRW_REGISTER_TYPE_F ||
             inst->src[1].file != IMM ||
             inst->src[1].type != BRW_REGISTER_TYPE_F ||
             inst->src[1].f < 0.0 ||
             inst->src[1].f > 1.0) {
            return false;
         }
         if (!inst->saturate)
            inst->saturate = true;
         break;
      default:
         return false;
      }
   }

   /* Build the final value */
   if (inst->src[arg].abs) {
      value.negate = false;
      value.abs = true;
   }
   if (inst->src[arg].negate)
      value.negate = !value.negate;

   value.swizzle = composed_swizzle;
   if (has_source_modifiers &&
       value.type != inst->src[arg].type) {
      assert(inst->can_change_types());
      /* Retype all sources and the destination together so the instruction
       * stays internally consistent.
       */
      for (int i = 0; i < 3; i++) {
         inst->src[i].type = value.type;
      }
      inst->dst.type = value.type;
   } else {
      value.type = inst->src[arg].type;
   }

   inst->src[arg] = value;
   return true;
}
|
||||
|
||||
/* Copy/constant propagation over the whole program.
 *
 * Tracks, per GRF-sized register slot and per channel, the source of the
 * last direct copy that wrote it, then rewrites instruction sources to read
 * from the original value.  When do_constant_prop is set, immediates are
 * propagated as well.  Returns true if anything changed.
 */
bool
vec4_visitor::opt_copy_propagation(bool do_constant_prop)
{
   /* If we are in dual instanced or single mode, then attributes are going
    * to be interleaved, so one register contains two attribute slots.
    */
   const int attributes_per_reg =
      prog_data->dispatch_mode == INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
   bool progress = false;
   /* NOTE(review): VLA sized by the register allocator's total slot count;
    * large shaders could make this a sizable stack allocation.
    */
   struct copy_entry entries[alloc.total_size];

   memset(&entries, 0, sizeof(entries));

   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
      /* This pass only works on basic blocks.  If there's flow
       * control, throw out all our information and start from
       * scratch.
       *
       * This should really be fixed by using a structure like in
       * src/glsl/opt_copy_propagation.cpp to track available copies.
       */
      if (!is_dominated_by_previous_instruction(inst)) {
         memset(&entries, 0, sizeof(entries));
         continue;
      }

      /* For each source arg, see if each component comes from a copy
       * from the same type file (IMM, VGRF, UNIFORM), and try
       * optimizing out access to the copy result
       */
      for (int i = 2; i >= 0; i--) {
         /* Copied values end up in GRFs, and we don't track reladdr
          * accesses.
          */
         if (inst->src[i].file != VGRF ||
             inst->src[i].reladdr)
            continue;

         /* We only handle register-aligned single GRF copies. */
         if (inst->size_read(i) != REG_SIZE ||
             inst->src[i].offset % REG_SIZE)
            continue;

         const unsigned reg = (alloc.offsets[inst->src[i].nr] +
                               inst->src[i].offset / REG_SIZE);
         const copy_entry &entry = entries[reg];

         if (do_constant_prop && try_constant_propagate(inst, i, &entry))
            progress = true;
         else if (try_copy_propagate(compiler, inst, i, &entry, attributes_per_reg))
            progress = true;
      }

      /* Track available source registers. */
      if (inst->dst.file == VGRF) {
         const int reg =
            alloc.offsets[inst->dst.nr] + inst->dst.offset / REG_SIZE;

         /* Update our destination's current channel values.  For a direct copy,
          * the value is the newly propagated source.  Otherwise, we don't know
          * the new value, so clear it.
          */
         bool direct_copy = is_direct_copy(inst);
         entries[reg].saturatemask &= ~inst->dst.writemask;
         for (int i = 0; i < 4; i++) {
            if (inst->dst.writemask & (1 << i)) {
               entries[reg].value[i] = direct_copy ? &inst->src[0] : NULL;
               entries[reg].saturatemask |=
                  inst->saturate && direct_copy ? 1 << i : 0;
            }
         }

         /* Clear the records for any registers whose current value came from
          * our destination's updated channels, as the two are no longer equal.
          */
         if (inst->dst.reladdr)
            memset(&entries, 0, sizeof(entries));
         else {
            for (unsigned i = 0; i < alloc.total_size; i++) {
               for (int j = 0; j < 4; j++) {
                  if (is_channel_updated(inst, entries[i].value, j)) {
                     entries[i].value[j] = NULL;
                     entries[i].saturatemask &= ~(1 << j);
                  }
               }
            }
         }
      }
   }

   if (progress)
      invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
                          DEPENDENCY_INSTRUCTION_DETAIL);

   return progress;
}
|
||||
|
||||
} /* namespace brw */
|
||||
|
|
@ -1,322 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2012, 2013, 2014 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_vec4.h"
|
||||
#include "brw_vec4_live_variables.h"
|
||||
#include "brw_cfg.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
/** @file brw_vec4_cse.cpp
|
||||
*
|
||||
* Support for local common subexpression elimination.
|
||||
*
|
||||
* See Muchnick's Advanced Compiler Design and Implementation, section
|
||||
* 13.1 (p378).
|
||||
*/
|
||||
|
||||
namespace {
|
||||
struct aeb_entry : public exec_node {
|
||||
/** The instruction that generates the expression value. */
|
||||
vec4_instruction *generator;
|
||||
|
||||
/** The temporary where the value is stored. */
|
||||
src_reg tmp;
|
||||
};
|
||||
}
|
||||
|
||||
static bool
|
||||
is_expression(const vec4_instruction *const inst)
|
||||
{
|
||||
switch (inst->opcode) {
|
||||
case BRW_OPCODE_MOV:
|
||||
case BRW_OPCODE_SEL:
|
||||
case BRW_OPCODE_NOT:
|
||||
case BRW_OPCODE_AND:
|
||||
case BRW_OPCODE_OR:
|
||||
case BRW_OPCODE_XOR:
|
||||
case BRW_OPCODE_SHR:
|
||||
case BRW_OPCODE_SHL:
|
||||
case BRW_OPCODE_ASR:
|
||||
case BRW_OPCODE_CMP:
|
||||
case BRW_OPCODE_CMPN:
|
||||
case BRW_OPCODE_ADD:
|
||||
case BRW_OPCODE_MUL:
|
||||
case SHADER_OPCODE_MULH:
|
||||
case BRW_OPCODE_FRC:
|
||||
case BRW_OPCODE_RNDU:
|
||||
case BRW_OPCODE_RNDD:
|
||||
case BRW_OPCODE_RNDE:
|
||||
case BRW_OPCODE_RNDZ:
|
||||
case BRW_OPCODE_LINE:
|
||||
case BRW_OPCODE_PLN:
|
||||
case BRW_OPCODE_MAD:
|
||||
case BRW_OPCODE_LRP:
|
||||
case VEC4_OPCODE_UNPACK_UNIFORM:
|
||||
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
|
||||
case SHADER_OPCODE_BROADCAST:
|
||||
case VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
|
||||
case VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
|
||||
return true;
|
||||
case SHADER_OPCODE_RCP:
|
||||
case SHADER_OPCODE_RSQ:
|
||||
case SHADER_OPCODE_SQRT:
|
||||
case SHADER_OPCODE_EXP2:
|
||||
case SHADER_OPCODE_LOG2:
|
||||
case SHADER_OPCODE_POW:
|
||||
case SHADER_OPCODE_INT_QUOTIENT:
|
||||
case SHADER_OPCODE_INT_REMAINDER:
|
||||
case SHADER_OPCODE_SIN:
|
||||
case SHADER_OPCODE_COS:
|
||||
return inst->mlen == 0;
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
operands_match(const vec4_instruction *a, const vec4_instruction *b)
|
||||
{
|
||||
const src_reg *xs = a->src;
|
||||
const src_reg *ys = b->src;
|
||||
|
||||
if (a->opcode == BRW_OPCODE_MAD) {
|
||||
return xs[0].equals(ys[0]) &&
|
||||
((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
|
||||
(xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
|
||||
} else if (a->opcode == BRW_OPCODE_MOV &&
|
||||
xs[0].file == IMM &&
|
||||
xs[0].type == BRW_REGISTER_TYPE_VF) {
|
||||
src_reg tmp_x = xs[0];
|
||||
src_reg tmp_y = ys[0];
|
||||
|
||||
/* Smash out the values that are not part of the writemask. Otherwise
|
||||
* the equals operator will fail due to mismatches in unused components.
|
||||
*/
|
||||
const unsigned ab_writemask = a->dst.writemask & b->dst.writemask;
|
||||
const uint32_t mask = ((ab_writemask & WRITEMASK_X) ? 0x000000ff : 0) |
|
||||
((ab_writemask & WRITEMASK_Y) ? 0x0000ff00 : 0) |
|
||||
((ab_writemask & WRITEMASK_Z) ? 0x00ff0000 : 0) |
|
||||
((ab_writemask & WRITEMASK_W) ? 0xff000000 : 0);
|
||||
|
||||
tmp_x.ud &= mask;
|
||||
tmp_y.ud &= mask;
|
||||
|
||||
return tmp_x.equals(tmp_y);
|
||||
} else if (!a->is_commutative()) {
|
||||
return xs[0].equals(ys[0]) && xs[1].equals(ys[1]) && xs[2].equals(ys[2]);
|
||||
} else {
|
||||
return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
|
||||
(xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Checks if instructions match, exactly for sources, but loosely for
|
||||
* destination writemasks.
|
||||
*
|
||||
* \param 'a' is the generating expression from the AEB entry.
|
||||
* \param 'b' is the second occurrence of the expression that we're
|
||||
* considering eliminating.
|
||||
*/
|
||||
static bool
|
||||
instructions_match(vec4_instruction *a, vec4_instruction *b)
|
||||
{
|
||||
return a->opcode == b->opcode &&
|
||||
a->saturate == b->saturate &&
|
||||
a->predicate == b->predicate &&
|
||||
a->predicate_inverse == b->predicate_inverse &&
|
||||
a->conditional_mod == b->conditional_mod &&
|
||||
a->flag_subreg == b->flag_subreg &&
|
||||
a->dst.type == b->dst.type &&
|
||||
a->offset == b->offset &&
|
||||
a->mlen == b->mlen &&
|
||||
a->base_mrf == b->base_mrf &&
|
||||
a->header_size == b->header_size &&
|
||||
a->shadow_compare == b->shadow_compare &&
|
||||
((a->dst.writemask & b->dst.writemask) == a->dst.writemask) &&
|
||||
a->force_writemask_all == b->force_writemask_all &&
|
||||
a->size_written == b->size_written &&
|
||||
a->exec_size == b->exec_size &&
|
||||
a->group == b->group &&
|
||||
operands_match(a, b);
|
||||
}
|
||||
|
||||
bool
|
||||
vec4_visitor::opt_cse_local(bblock_t *block, const vec4_live_variables &live)
|
||||
{
|
||||
bool progress = false;
|
||||
exec_list aeb;
|
||||
|
||||
void *cse_ctx = ralloc_context(NULL);
|
||||
|
||||
int ip = block->start_ip;
|
||||
foreach_inst_in_block (vec4_instruction, inst, block) {
|
||||
/* Skip some cases. */
|
||||
if (is_expression(inst) && !inst->predicate && inst->mlen == 0 &&
|
||||
((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
|
||||
inst->dst.is_null()))
|
||||
{
|
||||
bool found = false;
|
||||
|
||||
foreach_in_list_use_after(aeb_entry, entry, &aeb) {
|
||||
/* Match current instruction's expression against those in AEB. */
|
||||
if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
|
||||
instructions_match(inst, entry->generator)) {
|
||||
found = true;
|
||||
progress = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!found) {
|
||||
if (inst->opcode != BRW_OPCODE_MOV ||
|
||||
(inst->opcode == BRW_OPCODE_MOV &&
|
||||
inst->src[0].file == IMM &&
|
||||
inst->src[0].type == BRW_REGISTER_TYPE_VF)) {
|
||||
/* Our first sighting of this expression. Create an entry. */
|
||||
aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
|
||||
entry->tmp = src_reg(); /* file will be BAD_FILE */
|
||||
entry->generator = inst;
|
||||
aeb.push_tail(entry);
|
||||
}
|
||||
} else {
|
||||
/* This is at least our second sighting of this expression.
|
||||
* If we don't have a temporary already, make one.
|
||||
*/
|
||||
bool no_existing_temp = entry->tmp.file == BAD_FILE;
|
||||
if (no_existing_temp && !entry->generator->dst.is_null()) {
|
||||
entry->tmp = retype(src_reg(VGRF, alloc.allocate(
|
||||
regs_written(entry->generator)),
|
||||
NULL), inst->dst.type);
|
||||
|
||||
const unsigned width = entry->generator->exec_size;
|
||||
unsigned component_size = width * type_sz(entry->tmp.type);
|
||||
unsigned num_copy_movs =
|
||||
DIV_ROUND_UP(entry->generator->size_written, component_size);
|
||||
for (unsigned i = 0; i < num_copy_movs; ++i) {
|
||||
vec4_instruction *copy =
|
||||
MOV(offset(entry->generator->dst, width, i),
|
||||
offset(entry->tmp, width, i));
|
||||
copy->exec_size = width;
|
||||
copy->group = entry->generator->group;
|
||||
copy->force_writemask_all =
|
||||
entry->generator->force_writemask_all;
|
||||
entry->generator->insert_after(block, copy);
|
||||
}
|
||||
|
||||
entry->generator->dst = dst_reg(entry->tmp);
|
||||
}
|
||||
|
||||
/* dest <- temp */
|
||||
if (!inst->dst.is_null()) {
|
||||
assert(inst->dst.type == entry->tmp.type);
|
||||
const unsigned width = inst->exec_size;
|
||||
unsigned component_size = width * type_sz(inst->dst.type);
|
||||
unsigned num_copy_movs =
|
||||
DIV_ROUND_UP(inst->size_written, component_size);
|
||||
for (unsigned i = 0; i < num_copy_movs; ++i) {
|
||||
vec4_instruction *copy =
|
||||
MOV(offset(inst->dst, width, i),
|
||||
offset(entry->tmp, width, i));
|
||||
copy->exec_size = inst->exec_size;
|
||||
copy->group = inst->group;
|
||||
copy->force_writemask_all = inst->force_writemask_all;
|
||||
inst->insert_before(block, copy);
|
||||
}
|
||||
}
|
||||
|
||||
/* Set our iterator so that next time through the loop inst->next
|
||||
* will get the instruction in the basic block after the one we've
|
||||
* removed.
|
||||
*/
|
||||
vec4_instruction *prev = (vec4_instruction *)inst->prev;
|
||||
|
||||
inst->remove(block);
|
||||
inst = prev;
|
||||
}
|
||||
}
|
||||
|
||||
foreach_in_list_safe(aeb_entry, entry, &aeb) {
|
||||
/* Kill all AEB entries that write a different value to or read from
|
||||
* the flag register if we just wrote it.
|
||||
*/
|
||||
if (inst->writes_flag(devinfo)) {
|
||||
if (entry->generator->reads_flag() ||
|
||||
(entry->generator->writes_flag(devinfo) &&
|
||||
!instructions_match(inst, entry->generator))) {
|
||||
entry->remove();
|
||||
ralloc_free(entry);
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
src_reg *src = &entry->generator->src[i];
|
||||
|
||||
/* Kill all AEB entries that use the destination we just
|
||||
* overwrote.
|
||||
*/
|
||||
if (inst->dst.file == entry->generator->src[i].file &&
|
||||
inst->dst.nr == entry->generator->src[i].nr) {
|
||||
entry->remove();
|
||||
ralloc_free(entry);
|
||||
break;
|
||||
}
|
||||
|
||||
/* Kill any AEB entries using registers that don't get reused any
|
||||
* more -- a sure sign they'll fail operands_match().
|
||||
*/
|
||||
if (src->file == VGRF) {
|
||||
if (live.var_range_end(var_from_reg(alloc, dst_reg(*src)), 8) < ip) {
|
||||
entry->remove();
|
||||
ralloc_free(entry);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ip++;
|
||||
}
|
||||
|
||||
ralloc_free(cse_ctx);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
||||
bool
|
||||
vec4_visitor::opt_cse()
|
||||
{
|
||||
bool progress = false;
|
||||
const vec4_live_variables &live = live_analysis.require();
|
||||
|
||||
foreach_block (block, cfg) {
|
||||
progress = opt_cse_local(block, live) || progress;
|
||||
}
|
||||
|
||||
if (progress)
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
||||
|
||||
return progress;
|
||||
}
|
||||
|
|
@ -1,188 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2014 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_vec4.h"
|
||||
#include "brw_vec4_live_variables.h"
|
||||
#include "brw_cfg.h"
|
||||
|
||||
/** @file brw_vec4_dead_code_eliminate.cpp
|
||||
*
|
||||
* Dataflow-aware dead code elimination.
|
||||
*
|
||||
* Walks the instruction list from the bottom, removing instructions that
|
||||
* have results that both aren't used in later blocks and haven't been read
|
||||
* yet in the tail end of this block.
|
||||
*/
|
||||
|
||||
using namespace brw;
|
||||
|
||||
bool
|
||||
vec4_visitor::dead_code_eliminate()
|
||||
{
|
||||
bool progress = false;
|
||||
|
||||
const vec4_live_variables &live_vars = live_analysis.require();
|
||||
int num_vars = live_vars.num_vars;
|
||||
BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars));
|
||||
BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1);
|
||||
|
||||
foreach_block_reverse_safe(block, cfg) {
|
||||
memcpy(live, live_vars.block_data[block->num].liveout,
|
||||
sizeof(BITSET_WORD) * BITSET_WORDS(num_vars));
|
||||
memcpy(flag_live, live_vars.block_data[block->num].flag_liveout,
|
||||
sizeof(BITSET_WORD));
|
||||
|
||||
foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) {
|
||||
if ((inst->dst.file == VGRF && !inst->has_side_effects()) ||
|
||||
(inst->dst.is_null() && inst->writes_flag(devinfo))){
|
||||
bool result_live[4] = { false };
|
||||
if (inst->dst.file == VGRF) {
|
||||
for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) {
|
||||
for (int c = 0; c < 4; c++) {
|
||||
const unsigned v = var_from_reg(alloc, inst->dst, c, i);
|
||||
result_live[c] |= BITSET_TEST(live, v);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (unsigned c = 0; c < 4; c++)
|
||||
result_live[c] = BITSET_TEST(flag_live, c);
|
||||
}
|
||||
|
||||
/* If the instruction can't do writemasking, then it's all or
|
||||
* nothing.
|
||||
*/
|
||||
if (!inst->can_do_writemask(devinfo)) {
|
||||
bool result = result_live[0] | result_live[1] |
|
||||
result_live[2] | result_live[3];
|
||||
result_live[0] = result;
|
||||
result_live[1] = result;
|
||||
result_live[2] = result;
|
||||
result_live[3] = result;
|
||||
}
|
||||
|
||||
if (inst->writes_flag(devinfo)) {
|
||||
/* Independently calculate the usage of the flag components and
|
||||
* the destination value components.
|
||||
*/
|
||||
uint8_t flag_mask = inst->dst.writemask;
|
||||
uint8_t dest_mask = inst->dst.writemask;
|
||||
|
||||
for (int c = 0; c < 4; c++) {
|
||||
if (!result_live[c] && dest_mask & (1 << c))
|
||||
dest_mask &= ~(1 << c);
|
||||
|
||||
if (!BITSET_TEST(flag_live, c))
|
||||
flag_mask &= ~(1 << c);
|
||||
}
|
||||
|
||||
if (inst->dst.writemask != (flag_mask | dest_mask)) {
|
||||
progress = true;
|
||||
inst->dst.writemask = flag_mask | dest_mask;
|
||||
}
|
||||
|
||||
/* If none of the destination components are read, replace the
|
||||
* destination register with the NULL register.
|
||||
*/
|
||||
if (dest_mask == 0) {
|
||||
progress = true;
|
||||
inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type));
|
||||
}
|
||||
} else {
|
||||
for (int c = 0; c < 4; c++) {
|
||||
if (!result_live[c] && inst->dst.writemask & (1 << c)) {
|
||||
inst->dst.writemask &= ~(1 << c);
|
||||
progress = true;
|
||||
|
||||
if (inst->dst.writemask == 0) {
|
||||
if (inst->writes_accumulator) {
|
||||
inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type));
|
||||
} else {
|
||||
inst->opcode = BRW_OPCODE_NOP;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->dst.is_null() && inst->writes_flag(devinfo)) {
|
||||
bool combined_live = false;
|
||||
for (unsigned c = 0; c < 4; c++)
|
||||
combined_live |= BITSET_TEST(flag_live, c);
|
||||
|
||||
if (!combined_live) {
|
||||
inst->opcode = BRW_OPCODE_NOP;
|
||||
progress = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->dst.file == VGRF && !inst->predicate &&
|
||||
!inst->is_align1_partial_write()) {
|
||||
for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) {
|
||||
for (int c = 0; c < 4; c++) {
|
||||
if (inst->dst.writemask & (1 << c)) {
|
||||
const unsigned v = var_from_reg(alloc, inst->dst, c, i);
|
||||
BITSET_CLEAR(live, v);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->writes_flag(devinfo) && !inst->predicate && inst->exec_size == 8) {
|
||||
for (unsigned c = 0; c < 4; c++)
|
||||
BITSET_CLEAR(flag_live, c);
|
||||
}
|
||||
|
||||
if (inst->opcode == BRW_OPCODE_NOP) {
|
||||
inst->remove(block);
|
||||
continue;
|
||||
}
|
||||
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file == VGRF) {
|
||||
for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) {
|
||||
for (int c = 0; c < 4; c++) {
|
||||
const unsigned v = var_from_reg(alloc, inst->src[i], c, j);
|
||||
BITSET_SET(live, v);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (unsigned c = 0; c < 4; c++) {
|
||||
if (inst->reads_flag(c)) {
|
||||
BITSET_SET(flag_live, c);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ralloc_free(live);
|
||||
ralloc_free(flag_live);
|
||||
|
||||
if (progress)
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
|
||||
|
||||
return progress;
|
||||
}
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,98 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_vec4_gs_visitor.h"
|
||||
|
||||
namespace brw {
|
||||
|
||||
void
|
||||
vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
|
||||
{
|
||||
dst_reg dest;
|
||||
src_reg src;
|
||||
|
||||
switch (instr->intrinsic) {
|
||||
case nir_intrinsic_load_per_vertex_input: {
|
||||
assert(instr->def.bit_size == 32);
|
||||
/* The EmitNoIndirectInput flag guarantees our vertex index will
|
||||
* be constant. We should handle indirects someday.
|
||||
*/
|
||||
const unsigned vertex = nir_src_as_uint(instr->src[0]);
|
||||
const unsigned offset_reg = nir_src_as_uint(instr->src[1]);
|
||||
|
||||
const unsigned input_array_stride = prog_data->urb_read_length * 2;
|
||||
|
||||
/* Make up a type...we have no way of knowing... */
|
||||
const glsl_type *const type = glsl_ivec_type(instr->num_components);
|
||||
|
||||
src = src_reg(ATTR, input_array_stride * vertex +
|
||||
nir_intrinsic_base(instr) + offset_reg,
|
||||
type);
|
||||
src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr));
|
||||
|
||||
dest = get_nir_def(instr->def, src.type);
|
||||
dest.writemask = brw_writemask_for_size(instr->num_components);
|
||||
emit(MOV(dest, src));
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_load_input:
|
||||
unreachable("nir_lower_io should have produced per_vertex intrinsics");
|
||||
|
||||
case nir_intrinsic_emit_vertex_with_counter:
|
||||
this->vertex_count =
|
||||
retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
|
||||
gs_emit_vertex(nir_intrinsic_stream_id(instr));
|
||||
break;
|
||||
|
||||
case nir_intrinsic_end_primitive_with_counter:
|
||||
this->vertex_count =
|
||||
retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
|
||||
gs_end_primitive();
|
||||
break;
|
||||
|
||||
case nir_intrinsic_set_vertex_and_primitive_count:
|
||||
this->vertex_count =
|
||||
retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_primitive_id:
|
||||
assert(gs_prog_data->include_primitive_id);
|
||||
dest = get_nir_def(instr->def, BRW_REGISTER_TYPE_D);
|
||||
emit(MOV(dest, retype(brw_vec4_grf(1, 0), BRW_REGISTER_TYPE_D)));
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_invocation_id: {
|
||||
dest = get_nir_def(instr->def, BRW_REGISTER_TYPE_D);
|
||||
if (gs_prog_data->invocations > 1)
|
||||
emit(GS_OPCODE_GET_INSTANCE_ID, dest);
|
||||
else
|
||||
emit(MOV(dest, brw_imm_ud(0)));
|
||||
break;
|
||||
}
|
||||
|
||||
default:
|
||||
vec4_visitor::nir_emit_intrinsic(instr);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,560 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2013 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file brw_vec4_gs_visitor.cpp
|
||||
*
|
||||
* Geometry-shader-specific code derived from the vec4_visitor class.
|
||||
*/
|
||||
|
||||
#include "brw_vec4_gs_visitor.h"
|
||||
#include "brw_cfg.h"
|
||||
#include "brw_fs.h"
|
||||
|
||||
namespace brw {
|
||||
|
||||
vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
|
||||
const struct brw_compile_params *params,
|
||||
struct brw_gs_compile *c,
|
||||
struct brw_gs_prog_data *prog_data,
|
||||
const nir_shader *shader,
|
||||
bool no_spills,
|
||||
bool debug_enabled)
|
||||
: vec4_visitor(compiler, params, &c->key.base.tex,
|
||||
&prog_data->base, shader,
|
||||
no_spills, debug_enabled),
|
||||
c(c),
|
||||
gs_prog_data(prog_data)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
static inline struct brw_reg
|
||||
attribute_to_hw_reg(int attr, brw_reg_type type, bool interleaved)
|
||||
{
|
||||
struct brw_reg reg;
|
||||
|
||||
unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(type));
|
||||
if (interleaved) {
|
||||
reg = stride(brw_vecn_grf(width, attr / 2, (attr % 2) * 4), 0, width, 1);
|
||||
} else {
|
||||
reg = brw_vecn_grf(width, attr, 0);
|
||||
}
|
||||
|
||||
reg.type = type;
|
||||
return reg;
|
||||
}
|
||||
|
||||
/**
|
||||
* Replace each register of type ATTR in this->instructions with a reference
|
||||
* to a fixed HW register.
|
||||
*
|
||||
* If interleaved is true, then each attribute takes up half a register, with
|
||||
* register N containing attribute 2*N in its first half and attribute 2*N+1
|
||||
* in its second half (this corresponds to the payload setup used by geometry
|
||||
* shaders in "single" or "dual instanced" dispatch mode). If interleaved is
|
||||
* false, then each attribute takes up a whole register, with register N
|
||||
* containing attribute N (this corresponds to the payload setup used by
|
||||
* vertex shaders, and by geometry shaders in "dual object" dispatch mode).
|
||||
*/
|
||||
int
|
||||
vec4_gs_visitor::setup_varying_inputs(int payload_reg,
|
||||
int attributes_per_reg)
|
||||
{
|
||||
/* For geometry shaders there are N copies of the input attributes, where N
|
||||
* is the number of input vertices. attribute_map[BRW_VARYING_SLOT_COUNT *
|
||||
* i + j] represents attribute j for vertex i.
|
||||
*
|
||||
* Note that GS inputs are read from the VUE 256 bits (2 vec4's) at a time,
|
||||
* so the total number of input slots that will be delivered to the GS (and
|
||||
* thus the stride of the input arrays) is urb_read_length * 2.
|
||||
*/
|
||||
const unsigned num_input_vertices = nir->info.gs.vertices_in;
|
||||
assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
|
||||
unsigned input_array_stride = prog_data->urb_read_length * 2;
|
||||
|
||||
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file != ATTR)
|
||||
continue;
|
||||
|
||||
assert(inst->src[i].offset % REG_SIZE == 0);
|
||||
int grf = payload_reg * attributes_per_reg +
|
||||
inst->src[i].nr + inst->src[i].offset / REG_SIZE;
|
||||
|
||||
struct brw_reg reg =
|
||||
attribute_to_hw_reg(grf, inst->src[i].type, attributes_per_reg > 1);
|
||||
reg.swizzle = inst->src[i].swizzle;
|
||||
if (inst->src[i].abs)
|
||||
reg = brw_abs(reg);
|
||||
if (inst->src[i].negate)
|
||||
reg = negate(reg);
|
||||
|
||||
inst->src[i] = reg;
|
||||
}
|
||||
}
|
||||
|
||||
int regs_used = ALIGN(input_array_stride * num_input_vertices,
|
||||
attributes_per_reg) / attributes_per_reg;
|
||||
return payload_reg + regs_used;
|
||||
}
|
||||
|
||||
void
|
||||
vec4_gs_visitor::setup_payload()
|
||||
{
|
||||
/* If we are in dual instanced or single mode, then attributes are going
|
||||
* to be interleaved, so one register contains two attribute slots.
|
||||
*/
|
||||
int attributes_per_reg =
|
||||
prog_data->dispatch_mode == INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
|
||||
|
||||
int reg = 0;
|
||||
|
||||
/* The payload always contains important data in r0, which contains
|
||||
* the URB handles that are passed on to the URB write at the end
|
||||
* of the thread.
|
||||
*/
|
||||
reg++;
|
||||
|
||||
/* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
|
||||
if (gs_prog_data->include_primitive_id)
|
||||
reg++;
|
||||
|
||||
reg = setup_uniforms(reg);
|
||||
|
||||
reg = setup_varying_inputs(reg, attributes_per_reg);
|
||||
|
||||
this->first_non_payload_grf = reg;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
vec4_gs_visitor::emit_prolog()
|
||||
{
|
||||
/* In vertex shaders, r0.2 is guaranteed to be initialized to zero. In
|
||||
* geometry shaders, it isn't (it contains a bunch of information we don't
|
||||
* need, like the input primitive type). We need r0.2 to be zero in order
|
||||
* to build scratch read/write messages correctly (otherwise this value
|
||||
* will be interpreted as a global offset, causing us to do our scratch
|
||||
* reads/writes to garbage memory). So just set it to zero at the top of
|
||||
* the shader.
|
||||
*/
|
||||
this->current_annotation = "clear r0.2";
|
||||
dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
|
||||
vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, brw_imm_ud(0u));
|
||||
inst->force_writemask_all = true;
|
||||
|
||||
/* Create a virtual register to hold the vertex count */
|
||||
this->vertex_count = src_reg(this, glsl_uint_type());
|
||||
|
||||
/* Initialize the vertex_count register to 0 */
|
||||
this->current_annotation = "initialize vertex_count";
|
||||
inst = emit(MOV(dst_reg(this->vertex_count), brw_imm_ud(0u)));
|
||||
inst->force_writemask_all = true;
|
||||
|
||||
if (c->control_data_header_size_bits > 0) {
|
||||
/* Create a virtual register to hold the current set of control data
|
||||
* bits.
|
||||
*/
|
||||
this->control_data_bits = src_reg(this, glsl_uint_type());
|
||||
|
||||
/* If we're outputting more than 32 control data bits, then EmitVertex()
|
||||
* will set control_data_bits to 0 after emitting the first vertex.
|
||||
* Otherwise, we need to initialize it to 0 here.
|
||||
*/
|
||||
if (c->control_data_header_size_bits <= 32) {
|
||||
this->current_annotation = "initialize control data bits";
|
||||
inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u)));
|
||||
inst->force_writemask_all = true;
|
||||
}
|
||||
}
|
||||
|
||||
this->current_annotation = NULL;
|
||||
}
|
||||
|
||||
void
|
||||
vec4_gs_visitor::emit_thread_end()
|
||||
{
|
||||
if (c->control_data_header_size_bits > 0) {
|
||||
/* During shader execution, we only ever call emit_control_data_bits()
|
||||
* just prior to outputting a vertex. Therefore, the control data bits
|
||||
* corresponding to the most recently output vertex still need to be
|
||||
* emitted.
|
||||
*/
|
||||
current_annotation = "thread end: emit control data bits";
|
||||
emit_control_data_bits();
|
||||
}
|
||||
|
||||
/* MRF 0 is reserved for the debugger, so start with message header
|
||||
* in MRF 1.
|
||||
*/
|
||||
int base_mrf = 1;
|
||||
|
||||
current_annotation = "thread end";
|
||||
dst_reg mrf_reg(MRF, base_mrf);
|
||||
src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
|
||||
vec4_instruction *inst = emit(MOV(mrf_reg, r0));
|
||||
inst->force_writemask_all = true;
|
||||
emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count);
|
||||
inst = emit(GS_OPCODE_THREAD_END);
|
||||
inst->base_mrf = base_mrf;
|
||||
inst->mlen = 1;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
vec4_gs_visitor::emit_urb_write_header(int mrf)
|
||||
{
|
||||
/* The SEND instruction that writes the vertex data to the VUE will use
|
||||
* per_slot_offset=true, which means that DWORDs 3 and 4 of the message
|
||||
* header specify an offset (in multiples of 256 bits) into the URB entry
|
||||
* at which the write should take place.
|
||||
*
|
||||
* So we have to prepare a message header with the appropriate offset
|
||||
* values.
|
||||
*/
|
||||
dst_reg mrf_reg(MRF, mrf);
|
||||
src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
|
||||
this->current_annotation = "URB write header";
|
||||
vec4_instruction *inst = emit(MOV(mrf_reg, r0));
|
||||
inst->force_writemask_all = true;
|
||||
emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count,
|
||||
brw_imm_ud(gs_prog_data->output_vertex_size_hwords));
|
||||
}
|
||||
|
||||
|
||||
vec4_instruction *
|
||||
vec4_gs_visitor::emit_urb_write_opcode(bool complete)
|
||||
{
|
||||
/* We don't care whether the vertex is complete, because in general
|
||||
* geometry shaders output multiple vertices, and we don't terminate the
|
||||
* thread until all vertices are complete.
|
||||
*/
|
||||
(void) complete;
|
||||
|
||||
vec4_instruction *inst = emit(VEC4_GS_OPCODE_URB_WRITE);
|
||||
inst->offset = gs_prog_data->control_data_header_size_hwords;
|
||||
|
||||
inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
|
||||
return inst;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Write out a batch of 32 control data bits from the control_data_bits
|
||||
* register to the URB.
|
||||
*
|
||||
* The current value of the vertex_count register determines which DWORD in
|
||||
* the URB receives the control data bits. The control_data_bits register is
|
||||
* assumed to contain the correct data for the vertex that was most recently
|
||||
* output, and all previous vertices that share the same DWORD.
|
||||
*
|
||||
* This function takes care of ensuring that if no vertices have been output
|
||||
* yet, no control bits are emitted.
|
||||
*/
|
||||
void
|
||||
vec4_gs_visitor::emit_control_data_bits()
|
||||
{
|
||||
assert(c->control_data_bits_per_vertex != 0);
|
||||
|
||||
/* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
|
||||
* granularity, we need to use two tricks to ensure that the batch of 32
|
||||
* control data bits is written to the appropriate DWORD in the URB. To
|
||||
* select which vec4 we are writing to, we use the "slot {0,1} offset"
|
||||
* fields of the message header. To select which DWORD in the vec4 we are
|
||||
* writing to, we use the channel mask fields of the message header. To
|
||||
* avoid penalizing geometry shaders that emit a small number of vertices
|
||||
* with extra bookkeeping, we only do each of these tricks when
|
||||
* c->prog_data.control_data_header_size_bits is large enough to make it
|
||||
* necessary.
|
||||
*
|
||||
* Note: this means that if we're outputting just a single DWORD of control
|
||||
* data bits, we'll actually replicate it four times since we won't do any
|
||||
* channel masking. But that's not a problem since in this case the
|
||||
* hardware only pays attention to the first DWORD.
|
||||
*/
|
||||
enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
|
||||
if (c->control_data_header_size_bits > 32)
|
||||
urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
|
||||
if (c->control_data_header_size_bits > 128)
|
||||
urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;
|
||||
|
||||
/* If we are using either channel masks or a per-slot offset, then we
|
||||
* need to figure out which DWORD we are trying to write to, using the
|
||||
* formula:
|
||||
*
|
||||
* dword_index = (vertex_count - 1) * bits_per_vertex / 32
|
||||
*
|
||||
* Since bits_per_vertex is a power of two, and is known at compile
|
||||
* time, this can be optimized to:
|
||||
*
|
||||
* dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
|
||||
*/
|
||||
src_reg dword_index(this, glsl_uint_type());
|
||||
if (urb_write_flags) {
|
||||
src_reg prev_count(this, glsl_uint_type());
|
||||
emit(ADD(dst_reg(prev_count), this->vertex_count,
|
||||
brw_imm_ud(0xffffffffu)));
|
||||
unsigned log2_bits_per_vertex =
|
||||
util_last_bit(c->control_data_bits_per_vertex);
|
||||
emit(SHR(dst_reg(dword_index), prev_count,
|
||||
brw_imm_ud(6 - log2_bits_per_vertex)));
|
||||
}
|
||||
|
||||
/* Start building the URB write message. The first MRF gets a copy of
|
||||
* R0.
|
||||
*/
|
||||
int base_mrf = 1;
|
||||
dst_reg mrf_reg(MRF, base_mrf);
|
||||
src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
|
||||
vec4_instruction *inst = emit(MOV(mrf_reg, r0));
|
||||
inst->force_writemask_all = true;
|
||||
|
||||
if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
|
||||
/* Set the per-slot offset to dword_index / 4, to that we'll write to
|
||||
* the appropriate OWORD within the control data header.
|
||||
*/
|
||||
src_reg per_slot_offset(this, glsl_uint_type());
|
||||
emit(SHR(dst_reg(per_slot_offset), dword_index, brw_imm_ud(2u)));
|
||||
emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset,
|
||||
brw_imm_ud(1u));
|
||||
}
|
||||
|
||||
if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
|
||||
/* Set the channel masks to 1 << (dword_index % 4), so that we'll
|
||||
* write to the appropriate DWORD within the OWORD. We need to do
|
||||
* this computation with force_writemask_all, otherwise garbage data
|
||||
* from invocation 0 might clobber the mask for invocation 1 when
|
||||
* GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
|
||||
* together.
|
||||
*/
|
||||
src_reg channel(this, glsl_uint_type());
|
||||
inst = emit(AND(dst_reg(channel), dword_index, brw_imm_ud(3u)));
|
||||
inst->force_writemask_all = true;
|
||||
src_reg one(this, glsl_uint_type());
|
||||
inst = emit(MOV(dst_reg(one), brw_imm_ud(1u)));
|
||||
inst->force_writemask_all = true;
|
||||
src_reg channel_mask(this, glsl_uint_type());
|
||||
inst = emit(SHL(dst_reg(channel_mask), one, channel));
|
||||
inst->force_writemask_all = true;
|
||||
emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
|
||||
channel_mask);
|
||||
emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
|
||||
}
|
||||
|
||||
/* Store the control data bits in the message payload and send it. */
|
||||
dst_reg mrf_reg2(MRF, base_mrf + 1);
|
||||
inst = emit(MOV(mrf_reg2, this->control_data_bits));
|
||||
inst->force_writemask_all = true;
|
||||
inst = emit(VEC4_GS_OPCODE_URB_WRITE);
|
||||
inst->urb_write_flags = urb_write_flags;
|
||||
inst->base_mrf = base_mrf;
|
||||
inst->mlen = 2;
|
||||
}
|
||||
|
||||
void
|
||||
vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id)
|
||||
{
|
||||
/* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
|
||||
|
||||
/* Note: we are calling this *before* increasing vertex_count, so
|
||||
* this->vertex_count == vertex_count - 1 in the formula above.
|
||||
*/
|
||||
|
||||
/* Stream mode uses 2 bits per vertex */
|
||||
assert(c->control_data_bits_per_vertex == 2);
|
||||
|
||||
/* Must be a valid stream */
|
||||
assert(stream_id < 4); /* MAX_VERTEX_STREAMS */
|
||||
|
||||
/* Control data bits are initialized to 0 so we don't have to set any
|
||||
* bits when sending vertices to stream 0.
|
||||
*/
|
||||
if (stream_id == 0)
|
||||
return;
|
||||
|
||||
/* reg::sid = stream_id */
|
||||
src_reg sid(this, glsl_uint_type());
|
||||
emit(MOV(dst_reg(sid), brw_imm_ud(stream_id)));
|
||||
|
||||
/* reg:shift_count = 2 * (vertex_count - 1) */
|
||||
src_reg shift_count(this, glsl_uint_type());
|
||||
emit(SHL(dst_reg(shift_count), this->vertex_count, brw_imm_ud(1u)));
|
||||
|
||||
/* Note: we're relying on the fact that the GEN SHL instruction only pays
|
||||
* attention to the lower 5 bits of its second source argument, so on this
|
||||
* architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
|
||||
* stream_id << ((2 * (vertex_count - 1)) % 32).
|
||||
*/
|
||||
src_reg mask(this, glsl_uint_type());
|
||||
emit(SHL(dst_reg(mask), sid, shift_count));
|
||||
emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
|
||||
}
|
||||
|
||||
void
|
||||
vec4_gs_visitor::gs_emit_vertex(int stream_id)
|
||||
{
|
||||
this->current_annotation = "emit vertex: safety check";
|
||||
|
||||
/* Haswell and later hardware ignores the "Render Stream Select" bits
|
||||
* from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
|
||||
* and instead sends all primitives down the pipeline for rasterization.
|
||||
* If the SOL stage is enabled, "Render Stream Select" is honored and
|
||||
* primitives bound to non-zero streams are discarded after stream output.
|
||||
*
|
||||
* Since the only purpose of primives sent to non-zero streams is to
|
||||
* be recorded by transform feedback, we can simply discard all geometry
|
||||
* bound to these streams when transform feedback is disabled.
|
||||
*/
|
||||
if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
|
||||
return;
|
||||
|
||||
/* If we're outputting 32 control data bits or less, then we can wait
|
||||
* until the shader is over to output them all. Otherwise we need to
|
||||
* output them as we go. Now is the time to do it, since we're about to
|
||||
* output the vertex_count'th vertex, so it's guaranteed that the
|
||||
* control data bits associated with the (vertex_count - 1)th vertex are
|
||||
* correct.
|
||||
*/
|
||||
if (c->control_data_header_size_bits > 32) {
|
||||
this->current_annotation = "emit vertex: emit control data bits";
|
||||
/* Only emit control data bits if we've finished accumulating a batch
|
||||
* of 32 bits. This is the case when:
|
||||
*
|
||||
* (vertex_count * bits_per_vertex) % 32 == 0
|
||||
*
|
||||
* (in other words, when the last 5 bits of vertex_count *
|
||||
* bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
|
||||
* integer n (which is always the case, since bits_per_vertex is
|
||||
* always 1 or 2), this is equivalent to requiring that the last 5-n
|
||||
* bits of vertex_count are 0:
|
||||
*
|
||||
* vertex_count & (2^(5-n) - 1) == 0
|
||||
*
|
||||
* 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
|
||||
* equivalent to:
|
||||
*
|
||||
* vertex_count & (32 / bits_per_vertex - 1) == 0
|
||||
*/
|
||||
vec4_instruction *inst =
|
||||
emit(AND(dst_null_ud(), this->vertex_count,
|
||||
brw_imm_ud(32 / c->control_data_bits_per_vertex - 1)));
|
||||
inst->conditional_mod = BRW_CONDITIONAL_Z;
|
||||
|
||||
emit(IF(BRW_PREDICATE_NORMAL));
|
||||
{
|
||||
/* If vertex_count is 0, then no control data bits have been
|
||||
* accumulated yet, so we skip emitting them.
|
||||
*/
|
||||
emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u),
|
||||
BRW_CONDITIONAL_NEQ));
|
||||
emit(IF(BRW_PREDICATE_NORMAL));
|
||||
emit_control_data_bits();
|
||||
emit(BRW_OPCODE_ENDIF);
|
||||
|
||||
/* Reset control_data_bits to 0 so we can start accumulating a new
|
||||
* batch.
|
||||
*
|
||||
* Note: in the case where vertex_count == 0, this neutralizes the
|
||||
* effect of any call to EndPrimitive() that the shader may have
|
||||
* made before outputting its first vertex.
|
||||
*/
|
||||
inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u)));
|
||||
inst->force_writemask_all = true;
|
||||
}
|
||||
emit(BRW_OPCODE_ENDIF);
|
||||
}
|
||||
|
||||
this->current_annotation = "emit vertex: vertex data";
|
||||
emit_vertex();
|
||||
|
||||
/* In stream mode we have to set control data bits for all vertices
|
||||
* unless we have disabled control data bits completely (which we do
|
||||
* do for MESA_PRIM_POINTS outputs that don't use streams).
|
||||
*/
|
||||
if (c->control_data_header_size_bits > 0 &&
|
||||
gs_prog_data->control_data_format ==
|
||||
GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
|
||||
this->current_annotation = "emit vertex: Stream control data bits";
|
||||
set_stream_control_data_bits(stream_id);
|
||||
}
|
||||
|
||||
this->current_annotation = NULL;
|
||||
}
|
||||
|
||||
void
|
||||
vec4_gs_visitor::gs_end_primitive()
|
||||
{
|
||||
/* We can only do EndPrimitive() functionality when the control data
|
||||
* consists of cut bits. Fortunately, the only time it isn't is when the
|
||||
* output type is points, in which case EndPrimitive() is a no-op.
|
||||
*/
|
||||
if (gs_prog_data->control_data_format !=
|
||||
GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (c->control_data_header_size_bits == 0)
|
||||
return;
|
||||
|
||||
/* Cut bits use one bit per vertex. */
|
||||
assert(c->control_data_bits_per_vertex == 1);
|
||||
|
||||
/* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
|
||||
* vertex n, 0 otherwise. So all we need to do here is mark bit
|
||||
* (vertex_count - 1) % 32 in the cut_bits register to indicate that
|
||||
* EndPrimitive() was called after emitting vertex (vertex_count - 1);
|
||||
* vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
|
||||
*
|
||||
* Note that if EndPrimitve() is called before emitting any vertices, this
|
||||
* will cause us to set bit 31 of the control_data_bits register to 1.
|
||||
* That's fine because:
|
||||
*
|
||||
* - If max_vertices < 32, then vertex number 31 (zero-based) will never be
|
||||
* output, so the hardware will ignore cut bit 31.
|
||||
*
|
||||
* - If max_vertices == 32, then vertex number 31 is guaranteed to be the
|
||||
* last vertex, so setting cut bit 31 has no effect (since the primitive
|
||||
* is automatically ended when the GS terminates).
|
||||
*
|
||||
* - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
|
||||
* control_data_bits register to 0 when the first vertex is emitted.
|
||||
*/
|
||||
|
||||
/* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
|
||||
src_reg one(this, glsl_uint_type());
|
||||
emit(MOV(dst_reg(one), brw_imm_ud(1u)));
|
||||
src_reg prev_count(this, glsl_uint_type());
|
||||
emit(ADD(dst_reg(prev_count), this->vertex_count, brw_imm_ud(0xffffffffu)));
|
||||
src_reg mask(this, glsl_uint_type());
|
||||
/* Note: we're relying on the fact that the GEN SHL instruction only pays
|
||||
* attention to the lower 5 bits of its second source argument, so on this
|
||||
* architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
|
||||
* ((vertex_count - 1) % 32).
|
||||
*/
|
||||
emit(SHL(dst_reg(mask), one, prev_count));
|
||||
emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
|
||||
}
|
||||
|
||||
} /* namespace brw */
|
||||
|
||||
|
|
@ -1,75 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2013 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file brw_vec4_gs_visitor.h
|
||||
*
|
||||
* Geometry-shader-specific code derived from the vec4_visitor class.
|
||||
*/
|
||||
|
||||
#ifndef BRW_VEC4_GS_VISITOR_H
|
||||
#define BRW_VEC4_GS_VISITOR_H
|
||||
|
||||
#include "brw_vec4.h"
|
||||
|
||||
#define MAX_GS_INPUT_VERTICES 6
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace brw {
|
||||
|
||||
class vec4_gs_visitor : public vec4_visitor
|
||||
{
|
||||
public:
|
||||
vec4_gs_visitor(const struct brw_compiler *compiler,
|
||||
const struct brw_compile_params *params,
|
||||
struct brw_gs_compile *c,
|
||||
struct brw_gs_prog_data *prog_data,
|
||||
const nir_shader *shader,
|
||||
bool no_spills,
|
||||
bool debug_enabled);
|
||||
|
||||
protected:
|
||||
virtual void setup_payload();
|
||||
virtual void emit_prolog();
|
||||
virtual void emit_thread_end();
|
||||
virtual void emit_urb_write_header(int mrf);
|
||||
virtual vec4_instruction *emit_urb_write_opcode(bool complete);
|
||||
virtual void gs_emit_vertex(int stream_id);
|
||||
virtual void gs_end_primitive();
|
||||
virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
|
||||
|
||||
protected:
|
||||
int setup_varying_inputs(int payload_reg, int attributes_per_reg);
|
||||
void emit_control_data_bits();
|
||||
void set_stream_control_data_bits(unsigned stream_id);
|
||||
|
||||
src_reg vertex_count;
|
||||
src_reg control_data_bits;
|
||||
const struct brw_gs_compile * const c;
|
||||
struct brw_gs_prog_data * const gs_prog_data;
|
||||
};
|
||||
|
||||
} /* namespace brw */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* BRW_VEC4_GS_VISITOR_H */
|
||||
|
|
@ -1,331 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2012 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* Authors:
|
||||
* Eric Anholt <eric@anholt.net>
|
||||
*
|
||||
*/
|
||||
|
||||
#include "brw_vec4.h"
|
||||
#include "brw_vec4_live_variables.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
#define MAX_INSTRUCTION (1 << 30)
|
||||
|
||||
/** @file brw_vec4_live_variables.cpp
|
||||
*
|
||||
* Support for computing at the basic block level which variables
|
||||
* (virtual GRFs in our case) are live at entry and exit.
|
||||
*
|
||||
* See Muchnick's Advanced Compiler Design and Implementation, section
|
||||
* 14.1 (p444).
|
||||
*/
|
||||
|
||||
/**
|
||||
* Sets up the use/def arrays and block-local approximation of the live ranges.
|
||||
*
|
||||
* The basic-block-level live variable analysis needs to know which
|
||||
* variables get used before they're completely defined, and which
|
||||
* variables are completely defined before they're used.
|
||||
*
|
||||
* We independently track each channel of a vec4. This is because we need to
|
||||
* be able to recognize a sequence like:
|
||||
*
|
||||
* ...
|
||||
* DP4 tmp.x a b;
|
||||
* DP4 tmp.y c d;
|
||||
* MUL result.xy tmp.xy e.xy
|
||||
* ...
|
||||
*
|
||||
* as having tmp live only across that sequence (assuming it's used nowhere
|
||||
* else), because it's a common pattern. A more conservative approach that
|
||||
* doesn't get tmp marked a deffed in this block will tend to result in
|
||||
* spilling.
|
||||
*/
|
||||
void
|
||||
vec4_live_variables::setup_def_use()
|
||||
{
|
||||
int ip = 0;
|
||||
|
||||
foreach_block (block, cfg) {
|
||||
assert(ip == block->start_ip);
|
||||
if (block->num > 0)
|
||||
assert(cfg->blocks[block->num - 1]->end_ip == ip - 1);
|
||||
|
||||
foreach_inst_in_block(vec4_instruction, inst, block) {
|
||||
struct block_data *bd = &block_data[block->num];
|
||||
|
||||
/* Set up the instruction uses. */
|
||||
for (unsigned int i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file == VGRF) {
|
||||
for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) {
|
||||
for (int c = 0; c < 4; c++) {
|
||||
const unsigned v = var_from_reg(alloc, inst->src[i], c, j);
|
||||
|
||||
start[v] = MIN2(start[v], ip);
|
||||
end[v] = ip;
|
||||
|
||||
if (!BITSET_TEST(bd->def, v))
|
||||
BITSET_SET(bd->use, v);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for (unsigned c = 0; c < 4; c++) {
|
||||
if (inst->reads_flag(c) &&
|
||||
!BITSET_TEST(bd->flag_def, c)) {
|
||||
BITSET_SET(bd->flag_use, c);
|
||||
}
|
||||
}
|
||||
|
||||
/* Set up the instruction defs. */
|
||||
if (inst->dst.file == VGRF) {
|
||||
for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) {
|
||||
for (int c = 0; c < 4; c++) {
|
||||
if (inst->dst.writemask & (1 << c)) {
|
||||
const unsigned v = var_from_reg(alloc, inst->dst, c, i);
|
||||
|
||||
start[v] = MIN2(start[v], ip);
|
||||
end[v] = ip;
|
||||
|
||||
/* Check for unconditional register writes, these are the
|
||||
* things that screen off preceding definitions of a
|
||||
* variable, and thus qualify for being in def[].
|
||||
*/
|
||||
if ((!inst->predicate || inst->opcode == BRW_OPCODE_SEL) &&
|
||||
!BITSET_TEST(bd->use, v))
|
||||
BITSET_SET(bd->def, v);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (inst->writes_flag(devinfo)) {
|
||||
for (unsigned c = 0; c < 4; c++) {
|
||||
if ((inst->dst.writemask & (1 << c)) &&
|
||||
!BITSET_TEST(bd->flag_use, c)) {
|
||||
BITSET_SET(bd->flag_def, c);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ip++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* The algorithm incrementally sets bits in liveout and livein,
|
||||
* propagating it through control flow. It will eventually terminate
|
||||
* because it only ever adds bits, and stops when no bits are added in
|
||||
* a pass.
|
||||
*/
|
||||
void
|
||||
vec4_live_variables::compute_live_variables()
|
||||
{
|
||||
bool cont = true;
|
||||
|
||||
while (cont) {
|
||||
cont = false;
|
||||
|
||||
foreach_block_reverse (block, cfg) {
|
||||
struct block_data *bd = &block_data[block->num];
|
||||
|
||||
/* Update liveout */
|
||||
foreach_list_typed(bblock_link, child_link, link, &block->children) {
|
||||
struct block_data *child_bd = &block_data[child_link->block->num];
|
||||
|
||||
for (int i = 0; i < bitset_words; i++) {
|
||||
BITSET_WORD new_liveout = (child_bd->livein[i] &
|
||||
~bd->liveout[i]);
|
||||
if (new_liveout) {
|
||||
bd->liveout[i] |= new_liveout;
|
||||
cont = true;
|
||||
}
|
||||
}
|
||||
BITSET_WORD new_liveout = (child_bd->flag_livein[0] &
|
||||
~bd->flag_liveout[0]);
|
||||
if (new_liveout) {
|
||||
bd->flag_liveout[0] |= new_liveout;
|
||||
cont = true;
|
||||
}
|
||||
}
|
||||
|
||||
/* Update livein */
|
||||
for (int i = 0; i < bitset_words; i++) {
|
||||
BITSET_WORD new_livein = (bd->use[i] |
|
||||
(bd->liveout[i] &
|
||||
~bd->def[i]));
|
||||
if (new_livein & ~bd->livein[i]) {
|
||||
bd->livein[i] |= new_livein;
|
||||
cont = true;
|
||||
}
|
||||
}
|
||||
BITSET_WORD new_livein = (bd->flag_use[0] |
|
||||
(bd->flag_liveout[0] &
|
||||
~bd->flag_def[0]));
|
||||
if (new_livein & ~bd->flag_livein[0]) {
|
||||
bd->flag_livein[0] |= new_livein;
|
||||
cont = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Extend the start/end ranges for each variable to account for the
|
||||
* new information calculated from control flow.
|
||||
*/
|
||||
void
|
||||
vec4_live_variables::compute_start_end()
|
||||
{
|
||||
foreach_block (block, cfg) {
|
||||
const struct block_data &bd = block_data[block->num];
|
||||
|
||||
for (int i = 0; i < num_vars; i++) {
|
||||
if (BITSET_TEST(bd.livein, i)) {
|
||||
start[i] = MIN2(start[i], block->start_ip);
|
||||
end[i] = MAX2(end[i], block->start_ip);
|
||||
}
|
||||
|
||||
if (BITSET_TEST(bd.liveout, i)) {
|
||||
start[i] = MIN2(start[i], block->end_ip);
|
||||
end[i] = MAX2(end[i], block->end_ip);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
vec4_live_variables::vec4_live_variables(const backend_shader *s)
|
||||
: alloc(s->alloc), cfg(s->cfg)
|
||||
{
|
||||
mem_ctx = ralloc_context(NULL);
|
||||
|
||||
num_vars = alloc.total_size * 8;
|
||||
start = ralloc_array(mem_ctx, int, num_vars);
|
||||
end = ralloc_array(mem_ctx, int, num_vars);
|
||||
|
||||
for (int i = 0; i < num_vars; i++) {
|
||||
start[i] = MAX_INSTRUCTION;
|
||||
end[i] = -1;
|
||||
}
|
||||
|
||||
devinfo = s->compiler->devinfo;
|
||||
|
||||
block_data = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);
|
||||
|
||||
bitset_words = BITSET_WORDS(num_vars);
|
||||
for (int i = 0; i < cfg->num_blocks; i++) {
|
||||
block_data[i].def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
|
||||
block_data[i].use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
|
||||
block_data[i].livein = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
|
||||
block_data[i].liveout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
|
||||
|
||||
block_data[i].flag_def[0] = 0;
|
||||
block_data[i].flag_use[0] = 0;
|
||||
block_data[i].flag_livein[0] = 0;
|
||||
block_data[i].flag_liveout[0] = 0;
|
||||
}
|
||||
|
||||
setup_def_use();
|
||||
compute_live_variables();
|
||||
compute_start_end();
|
||||
}
|
||||
|
||||
vec4_live_variables::~vec4_live_variables()
|
||||
{
|
||||
ralloc_free(mem_ctx);
|
||||
}
|
||||
|
||||
static bool
|
||||
check_register_live_range(const vec4_live_variables *live, int ip,
|
||||
unsigned var, unsigned n)
|
||||
{
|
||||
for (unsigned j = 0; j < n; j += 4) {
|
||||
if (var + j >= unsigned(live->num_vars) ||
|
||||
live->start[var + j] > ip || live->end[var + j] < ip)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
bool
|
||||
vec4_live_variables::validate(const backend_shader *s) const
|
||||
{
|
||||
unsigned ip = 0;
|
||||
|
||||
foreach_block_and_inst(block, vec4_instruction, inst, s->cfg) {
|
||||
for (unsigned c = 0; c < 4; c++) {
|
||||
if (inst->dst.writemask & (1 << c)) {
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file == VGRF &&
|
||||
!check_register_live_range(this, ip,
|
||||
var_from_reg(alloc, inst->src[i], c),
|
||||
regs_read(inst, i)))
|
||||
return false;
|
||||
}
|
||||
|
||||
if (inst->dst.file == VGRF &&
|
||||
!check_register_live_range(this, ip,
|
||||
var_from_reg(alloc, inst->dst, c),
|
||||
regs_written(inst)))
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
ip++;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
int
|
||||
vec4_live_variables::var_range_start(unsigned v, unsigned n) const
|
||||
{
|
||||
int ip = INT_MAX;
|
||||
|
||||
for (unsigned i = 0; i < n; i++)
|
||||
ip = MIN2(ip, start[v + i]);
|
||||
|
||||
return ip;
|
||||
}
|
||||
|
||||
int
|
||||
vec4_live_variables::var_range_end(unsigned v, unsigned n) const
|
||||
{
|
||||
int ip = INT_MIN;
|
||||
|
||||
for (unsigned i = 0; i < n; i++)
|
||||
ip = MAX2(ip, end[v + i]);
|
||||
|
||||
return ip;
|
||||
}
|
||||
|
||||
bool
|
||||
vec4_live_variables::vgrfs_interfere(int a, int b) const
|
||||
{
|
||||
return !((var_range_end(8 * alloc.offsets[a], 8 * alloc.sizes[a]) <=
|
||||
var_range_start(8 * alloc.offsets[b], 8 * alloc.sizes[b])) ||
|
||||
(var_range_end(8 * alloc.offsets[b], 8 * alloc.sizes[b]) <=
|
||||
var_range_start(8 * alloc.offsets[a], 8 * alloc.sizes[a])));
|
||||
}
|
||||
|
|
@ -1,143 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2012 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* Authors:
|
||||
* Eric Anholt <eric@anholt.net>
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef BRW_VEC4_LIVE_VARIABLES_H
|
||||
#define BRW_VEC4_LIVE_VARIABLES_H
|
||||
|
||||
#include "brw_ir_vec4.h"
|
||||
#include "brw_ir_analysis.h"
|
||||
#include "util/bitset.h"
|
||||
|
||||
struct backend_shader;
|
||||
|
||||
namespace brw {
|
||||
|
||||
class vec4_live_variables {
|
||||
public:
|
||||
struct block_data {
|
||||
/**
|
||||
* Which variables are defined before being used in the block.
|
||||
*
|
||||
* Note that for our purposes, "defined" means unconditionally, completely
|
||||
* defined.
|
||||
*/
|
||||
BITSET_WORD *def;
|
||||
|
||||
/**
|
||||
* Which variables are used before being defined in the block.
|
||||
*/
|
||||
BITSET_WORD *use;
|
||||
|
||||
/** Which defs reach the entry point of the block. */
|
||||
BITSET_WORD *livein;
|
||||
|
||||
/** Which defs reach the exit point of the block. */
|
||||
BITSET_WORD *liveout;
|
||||
|
||||
BITSET_WORD flag_def[1];
|
||||
BITSET_WORD flag_use[1];
|
||||
BITSET_WORD flag_livein[1];
|
||||
BITSET_WORD flag_liveout[1];
|
||||
};
|
||||
|
||||
vec4_live_variables(const backend_shader *s);
|
||||
~vec4_live_variables();
|
||||
|
||||
bool
|
||||
validate(const backend_shader *s) const;
|
||||
|
||||
analysis_dependency_class
|
||||
dependency_class() const
|
||||
{
|
||||
return (DEPENDENCY_INSTRUCTION_IDENTITY |
|
||||
DEPENDENCY_INSTRUCTION_DATA_FLOW |
|
||||
DEPENDENCY_VARIABLES);
|
||||
}
|
||||
|
||||
int num_vars;
|
||||
int bitset_words;
|
||||
|
||||
const struct intel_device_info *devinfo;
|
||||
|
||||
/** Per-basic-block information on live variables */
|
||||
struct block_data *block_data;
|
||||
|
||||
/** @{
|
||||
* Final computed live ranges for each variable.
|
||||
*/
|
||||
int *start;
|
||||
int *end;
|
||||
/** @} */
|
||||
|
||||
int var_range_start(unsigned v, unsigned n) const;
|
||||
int var_range_end(unsigned v, unsigned n) const;
|
||||
bool vgrfs_interfere(int a, int b) const;
|
||||
|
||||
protected:
|
||||
void setup_def_use();
|
||||
void compute_live_variables();
|
||||
void compute_start_end();
|
||||
|
||||
const simple_allocator &alloc;
|
||||
cfg_t *cfg;
|
||||
void *mem_ctx;
|
||||
};
|
||||
|
||||
/* Returns the variable index for the k-th dword of the c-th component of
|
||||
* register reg.
|
||||
*/
|
||||
inline unsigned
|
||||
var_from_reg(const simple_allocator &alloc, const src_reg ®,
|
||||
unsigned c = 0, unsigned k = 0)
|
||||
{
|
||||
assert(reg.file == VGRF && reg.nr < alloc.count && c < 4);
|
||||
const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4);
|
||||
unsigned result =
|
||||
8 * alloc.offsets[reg.nr] + reg.offset / 4 +
|
||||
(BRW_GET_SWZ(reg.swizzle, c) + k / csize * 4) * csize + k % csize;
|
||||
/* Do not exceed the limit for this register */
|
||||
assert(result < 8 * (alloc.offsets[reg.nr] + alloc.sizes[reg.nr]));
|
||||
return result;
|
||||
}
|
||||
|
||||
inline unsigned
|
||||
var_from_reg(const simple_allocator &alloc, const dst_reg ®,
|
||||
unsigned c = 0, unsigned k = 0)
|
||||
{
|
||||
assert(reg.file == VGRF && reg.nr < alloc.count && c < 4);
|
||||
const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4);
|
||||
unsigned result =
|
||||
8 * alloc.offsets[reg.nr] + reg.offset / 4 +
|
||||
(c + k / csize * 4) * csize + k % csize;
|
||||
/* Do not exceed the limit for this register */
|
||||
assert(result < 8 * (alloc.offsets[reg.nr] + alloc.sizes[reg.nr]));
|
||||
return result;
|
||||
}
|
||||
|
||||
} /* namespace brw */
|
||||
|
||||
#endif /* BRW_VEC4_LIVE_VARIABLES_H */
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,512 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2011 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "util/register_allocate.h"
|
||||
#include "brw_vec4.h"
|
||||
#include "brw_cfg.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
#define REG_CLASS_COUNT 20
|
||||
|
||||
namespace brw {
|
||||
|
||||
static void
|
||||
assign(unsigned int *reg_hw_locations, backend_reg *reg)
|
||||
{
|
||||
if (reg->file == VGRF) {
|
||||
reg->nr = reg_hw_locations[reg->nr] + reg->offset / REG_SIZE;
|
||||
reg->offset %= REG_SIZE;
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
vec4_visitor::reg_allocate_trivial()
|
||||
{
|
||||
unsigned int hw_reg_mapping[this->alloc.count];
|
||||
bool virtual_grf_used[this->alloc.count];
|
||||
int next;
|
||||
|
||||
/* Calculate which virtual GRFs are actually in use after whatever
|
||||
* optimization passes have occurred.
|
||||
*/
|
||||
for (unsigned i = 0; i < this->alloc.count; i++) {
|
||||
virtual_grf_used[i] = false;
|
||||
}
|
||||
|
||||
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
|
||||
if (inst->dst.file == VGRF)
|
||||
virtual_grf_used[inst->dst.nr] = true;
|
||||
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file == VGRF)
|
||||
virtual_grf_used[inst->src[i].nr] = true;
|
||||
}
|
||||
}
|
||||
|
||||
hw_reg_mapping[0] = this->first_non_payload_grf;
|
||||
next = hw_reg_mapping[0] + this->alloc.sizes[0];
|
||||
for (unsigned i = 1; i < this->alloc.count; i++) {
|
||||
if (virtual_grf_used[i]) {
|
||||
hw_reg_mapping[i] = next;
|
||||
next += this->alloc.sizes[i];
|
||||
}
|
||||
}
|
||||
prog_data->total_grf = next;
|
||||
|
||||
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
|
||||
assign(hw_reg_mapping, &inst->dst);
|
||||
assign(hw_reg_mapping, &inst->src[0]);
|
||||
assign(hw_reg_mapping, &inst->src[1]);
|
||||
assign(hw_reg_mapping, &inst->src[2]);
|
||||
}
|
||||
|
||||
if (prog_data->total_grf > max_grf) {
|
||||
fail("Ran out of regs on trivial allocator (%d/%d)\n",
|
||||
prog_data->total_grf, max_grf);
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
extern "C" void
|
||||
brw_vec4_alloc_reg_set(struct brw_compiler *compiler)
|
||||
{
|
||||
int base_reg_count =
|
||||
compiler->devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF;
|
||||
|
||||
assert(compiler->devinfo->ver < 8);
|
||||
|
||||
/* After running split_virtual_grfs(), almost all VGRFs will be of size 1.
|
||||
* SEND-from-GRF sources cannot be split, so we also need classes for each
|
||||
* potential message length.
|
||||
*/
|
||||
assert(REG_CLASS_COUNT == MAX_VGRF_SIZE(compiler->devinfo));
|
||||
int class_sizes[REG_CLASS_COUNT];
|
||||
|
||||
for (int i = 0; i < REG_CLASS_COUNT; i++)
|
||||
class_sizes[i] = i + 1;
|
||||
|
||||
|
||||
ralloc_free(compiler->vec4_reg_set.regs);
|
||||
compiler->vec4_reg_set.regs = ra_alloc_reg_set(compiler, base_reg_count, false);
|
||||
if (compiler->devinfo->ver >= 6)
|
||||
ra_set_allocate_round_robin(compiler->vec4_reg_set.regs);
|
||||
ralloc_free(compiler->vec4_reg_set.classes);
|
||||
compiler->vec4_reg_set.classes = ralloc_array(compiler, struct ra_class *, REG_CLASS_COUNT);
|
||||
|
||||
/* Now, add the registers to their classes, and add the conflicts
|
||||
* between them and the base GRF registers (and also each other).
|
||||
*/
|
||||
for (int i = 0; i < REG_CLASS_COUNT; i++) {
|
||||
int class_reg_count = base_reg_count - (class_sizes[i] - 1);
|
||||
compiler->vec4_reg_set.classes[i] =
|
||||
ra_alloc_contig_reg_class(compiler->vec4_reg_set.regs, class_sizes[i]);
|
||||
|
||||
for (int j = 0; j < class_reg_count; j++)
|
||||
ra_class_add_reg(compiler->vec4_reg_set.classes[i], j);
|
||||
}
|
||||
|
||||
ra_set_finalize(compiler->vec4_reg_set.regs, NULL);
|
||||
}
|
||||
|
||||
void
|
||||
vec4_visitor::setup_payload_interference(struct ra_graph *g,
|
||||
int first_payload_node,
|
||||
int reg_node_count)
|
||||
{
|
||||
int payload_node_count = this->first_non_payload_grf;
|
||||
|
||||
for (int i = 0; i < payload_node_count; i++) {
|
||||
/* Mark each payload reg node as being allocated to its physical register.
|
||||
*
|
||||
* The alternative would be to have per-physical register classes, which
|
||||
* would just be silly.
|
||||
*/
|
||||
ra_set_node_reg(g, first_payload_node + i, i);
|
||||
|
||||
/* For now, just mark each payload node as interfering with every other
|
||||
* node to be allocated.
|
||||
*/
|
||||
for (int j = 0; j < reg_node_count; j++) {
|
||||
ra_add_node_interference(g, first_payload_node + i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
bool
|
||||
vec4_visitor::reg_allocate()
|
||||
{
|
||||
unsigned int hw_reg_mapping[alloc.count];
|
||||
int payload_reg_count = this->first_non_payload_grf;
|
||||
|
||||
/* Using the trivial allocator can be useful in debugging undefined
|
||||
* register access as a result of broken optimization passes.
|
||||
*/
|
||||
if (0)
|
||||
return reg_allocate_trivial();
|
||||
|
||||
assert(devinfo->ver < 8);
|
||||
|
||||
const vec4_live_variables &live = live_analysis.require();
|
||||
int node_count = alloc.count;
|
||||
int first_payload_node = node_count;
|
||||
node_count += payload_reg_count;
|
||||
struct ra_graph *g =
|
||||
ra_alloc_interference_graph(compiler->vec4_reg_set.regs, node_count);
|
||||
|
||||
for (unsigned i = 0; i < alloc.count; i++) {
|
||||
int size = this->alloc.sizes[i];
|
||||
assert(size >= 1 && size <= MAX_VGRF_SIZE(devinfo));
|
||||
ra_set_node_class(g, i, compiler->vec4_reg_set.classes[size - 1]);
|
||||
|
||||
for (unsigned j = 0; j < i; j++) {
|
||||
if (live.vgrfs_interfere(i, j)) {
|
||||
ra_add_node_interference(g, i, j);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* Certain instructions can't safely use the same register for their
|
||||
* sources and destination. Add interference.
|
||||
*/
|
||||
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
|
||||
if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file == VGRF) {
|
||||
ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
setup_payload_interference(g, first_payload_node, node_count);
|
||||
|
||||
if (!ra_allocate(g)) {
|
||||
/* Failed to allocate registers. Spill a reg, and the caller will
|
||||
* loop back into here to try again.
|
||||
*/
|
||||
int reg = choose_spill_reg(g);
|
||||
if (this->no_spills) {
|
||||
fail("Failure to register allocate. Reduce number of live "
|
||||
"values to avoid this.");
|
||||
} else if (reg == -1) {
|
||||
fail("no register to spill\n");
|
||||
} else {
|
||||
spill_reg(reg);
|
||||
}
|
||||
ralloc_free(g);
|
||||
return false;
|
||||
}
|
||||
|
||||
/* Get the chosen virtual registers for each node, and map virtual
|
||||
* regs in the register classes back down to real hardware reg
|
||||
* numbers.
|
||||
*/
|
||||
prog_data->total_grf = payload_reg_count;
|
||||
for (unsigned i = 0; i < alloc.count; i++) {
|
||||
hw_reg_mapping[i] = ra_get_node_reg(g, i);
|
||||
prog_data->total_grf = MAX2(prog_data->total_grf,
|
||||
hw_reg_mapping[i] + alloc.sizes[i]);
|
||||
}
|
||||
|
||||
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
|
||||
assign(hw_reg_mapping, &inst->dst);
|
||||
assign(hw_reg_mapping, &inst->src[0]);
|
||||
assign(hw_reg_mapping, &inst->src[1]);
|
||||
assign(hw_reg_mapping, &inst->src[2]);
|
||||
}
|
||||
|
||||
ralloc_free(g);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/**
|
||||
* When we decide to spill a register, instead of blindly spilling every use,
|
||||
* save unspills when the spill register is used (read) in consecutive
|
||||
* instructions. This can potentially save a bunch of unspills that would
|
||||
* have very little impact in register allocation anyway.
|
||||
*
|
||||
* Notice that we need to account for this behavior when spilling a register
|
||||
* and when evaluating spilling costs. This function is designed so it can
|
||||
* be called from both places and avoid repeating the logic.
|
||||
*
|
||||
* - When we call this function from spill_reg(), we pass in scratch_reg the
|
||||
* actual unspill/spill register that we want to reuse in the current
|
||||
* instruction.
|
||||
*
|
||||
* - When we call this from evaluate_spill_costs(), we pass the register for
|
||||
* which we are evaluating spilling costs.
|
||||
*
|
||||
* In either case, we check if the previous instructions read scratch_reg until
|
||||
* we find one that writes to it with a compatible mask or does not read/write
|
||||
* scratch_reg at all.
|
||||
*/
|
||||
static bool
|
||||
can_use_scratch_for_source(const vec4_instruction *inst, unsigned i,
|
||||
unsigned scratch_reg)
|
||||
{
|
||||
assert(inst->src[i].file == VGRF);
|
||||
bool prev_inst_read_scratch_reg = false;
|
||||
|
||||
/* See if any previous source in the same instructions reads scratch_reg */
|
||||
for (unsigned n = 0; n < i; n++) {
|
||||
if (inst->src[n].file == VGRF && inst->src[n].nr == scratch_reg)
|
||||
prev_inst_read_scratch_reg = true;
|
||||
}
|
||||
|
||||
/* Now check if previous instructions read/write scratch_reg */
|
||||
for (vec4_instruction *prev_inst = (vec4_instruction *) inst->prev;
|
||||
!prev_inst->is_head_sentinel();
|
||||
prev_inst = (vec4_instruction *) prev_inst->prev) {
|
||||
|
||||
/* If the previous instruction writes to scratch_reg then we can reuse
|
||||
* it if the write is not conditional and the channels we write are
|
||||
* compatible with our read mask
|
||||
*/
|
||||
if (prev_inst->dst.file == VGRF && prev_inst->dst.nr == scratch_reg) {
|
||||
return (!prev_inst->predicate || prev_inst->opcode == BRW_OPCODE_SEL) &&
|
||||
(brw_mask_for_swizzle(inst->src[i].swizzle) &
|
||||
~prev_inst->dst.writemask) == 0;
|
||||
}
|
||||
|
||||
/* Skip scratch read/writes so that instructions generated by spilling
|
||||
* other registers (that won't read/write scratch_reg) do not stop us from
|
||||
* reusing scratch_reg for this instruction.
|
||||
*/
|
||||
if (prev_inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_WRITE ||
|
||||
prev_inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_READ)
|
||||
continue;
|
||||
|
||||
/* If the previous instruction does not write to scratch_reg, then check
|
||||
* if it reads it
|
||||
*/
|
||||
int n;
|
||||
for (n = 0; n < 3; n++) {
|
||||
if (prev_inst->src[n].file == VGRF &&
|
||||
prev_inst->src[n].nr == scratch_reg) {
|
||||
prev_inst_read_scratch_reg = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (n == 3) {
|
||||
/* The previous instruction does not read scratch_reg. At this point,
|
||||
* if no previous instruction has read scratch_reg it means that we
|
||||
* will need to unspill it here and we can't reuse it (so we return
|
||||
* false). Otherwise, if we found at least one consecutive instruction
|
||||
* that read scratch_reg, then we know that we got here from
|
||||
* evaluate_spill_costs (since for the spill_reg path any block of
|
||||
* consecutive instructions using scratch_reg must start with a write
|
||||
* to that register, so we would've exited the loop in the check for
|
||||
* the write that we have at the start of this loop), and in that case
|
||||
* it means that we found the point at which the scratch_reg would be
|
||||
* unspilled. Since we always unspill a full vec4, it means that we
|
||||
* have all the channels available and we can just return true to
|
||||
* signal that we can reuse the register in the current instruction
|
||||
* too.
|
||||
*/
|
||||
return prev_inst_read_scratch_reg;
|
||||
}
|
||||
}
|
||||
|
||||
return prev_inst_read_scratch_reg;
|
||||
}
|
||||
|
||||
static inline float
|
||||
spill_cost_for_type(enum brw_reg_type type)
|
||||
{
|
||||
/* Spilling of a 64-bit register involves emitting 2 32-bit scratch
|
||||
* messages plus the 64b/32b shuffling code.
|
||||
*/
|
||||
return type_sz(type) == 8 ? 2.25f : 1.0f;
|
||||
}
|
||||
|
||||
void
|
||||
vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
|
||||
{
|
||||
float loop_scale = 1.0;
|
||||
|
||||
unsigned *reg_type_size = (unsigned *)
|
||||
ralloc_size(NULL, this->alloc.count * sizeof(unsigned));
|
||||
|
||||
for (unsigned i = 0; i < this->alloc.count; i++) {
|
||||
spill_costs[i] = 0.0;
|
||||
no_spill[i] = alloc.sizes[i] != 1 && alloc.sizes[i] != 2;
|
||||
reg_type_size[i] = 0;
|
||||
}
|
||||
|
||||
/* Calculate costs for spilling nodes. Call it a cost of 1 per
|
||||
* spill/unspill we'll have to do, and guess that the insides of
|
||||
* loops run 10 times.
|
||||
*/
|
||||
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
|
||||
for (unsigned int i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file == VGRF && !no_spill[inst->src[i].nr]) {
|
||||
/* We will only unspill src[i] it it wasn't unspilled for the
|
||||
* previous instruction, in which case we'll just reuse the scratch
|
||||
* reg for this instruction.
|
||||
*/
|
||||
if (!can_use_scratch_for_source(inst, i, inst->src[i].nr)) {
|
||||
spill_costs[inst->src[i].nr] +=
|
||||
loop_scale * spill_cost_for_type(inst->src[i].type);
|
||||
if (inst->src[i].reladdr ||
|
||||
inst->src[i].offset >= REG_SIZE)
|
||||
no_spill[inst->src[i].nr] = true;
|
||||
|
||||
/* We don't support unspills of partial DF reads.
|
||||
*
|
||||
* Our 64-bit unspills are implemented with two 32-bit scratch
|
||||
* messages, each one reading that for both SIMD4x2 threads that
|
||||
* we need to shuffle into correct 64-bit data. Ensure that we
|
||||
* are reading data for both threads.
|
||||
*/
|
||||
if (type_sz(inst->src[i].type) == 8 && inst->exec_size != 8)
|
||||
no_spill[inst->src[i].nr] = true;
|
||||
}
|
||||
|
||||
/* We can't spill registers that mix 32-bit and 64-bit access (that
|
||||
* contain 64-bit data that is operated on via 32-bit instructions)
|
||||
*/
|
||||
unsigned type_size = type_sz(inst->src[i].type);
|
||||
if (reg_type_size[inst->src[i].nr] == 0)
|
||||
reg_type_size[inst->src[i].nr] = type_size;
|
||||
else if (reg_type_size[inst->src[i].nr] != type_size)
|
||||
no_spill[inst->src[i].nr] = true;
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->dst.file == VGRF && !no_spill[inst->dst.nr]) {
|
||||
spill_costs[inst->dst.nr] +=
|
||||
loop_scale * spill_cost_for_type(inst->dst.type);
|
||||
if (inst->dst.reladdr || inst->dst.offset >= REG_SIZE)
|
||||
no_spill[inst->dst.nr] = true;
|
||||
|
||||
/* We don't support spills of partial DF writes.
|
||||
*
|
||||
* Our 64-bit spills are implemented with two 32-bit scratch messages,
|
||||
* each one writing that for both SIMD4x2 threads. Ensure that we
|
||||
* are writing data for both threads.
|
||||
*/
|
||||
if (type_sz(inst->dst.type) == 8 && inst->exec_size != 8)
|
||||
no_spill[inst->dst.nr] = true;
|
||||
|
||||
/* We can't spill registers that mix 32-bit and 64-bit access (that
|
||||
* contain 64-bit data that is operated on via 32-bit instructions)
|
||||
*/
|
||||
unsigned type_size = type_sz(inst->dst.type);
|
||||
if (reg_type_size[inst->dst.nr] == 0)
|
||||
reg_type_size[inst->dst.nr] = type_size;
|
||||
else if (reg_type_size[inst->dst.nr] != type_size)
|
||||
no_spill[inst->dst.nr] = true;
|
||||
}
|
||||
|
||||
switch (inst->opcode) {
|
||||
|
||||
case BRW_OPCODE_DO:
|
||||
loop_scale *= 10;
|
||||
break;
|
||||
|
||||
case BRW_OPCODE_WHILE:
|
||||
loop_scale /= 10;
|
||||
break;
|
||||
|
||||
case SHADER_OPCODE_GFX4_SCRATCH_READ:
|
||||
case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
|
||||
case VEC4_OPCODE_MOV_FOR_SCRATCH:
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file == VGRF)
|
||||
no_spill[inst->src[i].nr] = true;
|
||||
}
|
||||
if (inst->dst.file == VGRF)
|
||||
no_spill[inst->dst.nr] = true;
|
||||
break;
|
||||
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
ralloc_free(reg_type_size);
|
||||
}
|
||||
|
||||
int
|
||||
vec4_visitor::choose_spill_reg(struct ra_graph *g)
|
||||
{
|
||||
float spill_costs[this->alloc.count];
|
||||
bool no_spill[this->alloc.count];
|
||||
|
||||
evaluate_spill_costs(spill_costs, no_spill);
|
||||
|
||||
for (unsigned i = 0; i < this->alloc.count; i++) {
|
||||
if (!no_spill[i])
|
||||
ra_set_node_spill_cost(g, i, spill_costs[i]);
|
||||
}
|
||||
|
||||
return ra_get_best_spill_node(g);
|
||||
}
|
||||
|
||||
void
|
||||
vec4_visitor::spill_reg(unsigned spill_reg_nr)
|
||||
{
|
||||
assert(alloc.sizes[spill_reg_nr] == 1 || alloc.sizes[spill_reg_nr] == 2);
|
||||
unsigned spill_offset = last_scratch;
|
||||
last_scratch += alloc.sizes[spill_reg_nr];
|
||||
|
||||
/* Generate spill/unspill instructions for the objects being spilled. */
|
||||
unsigned scratch_reg = ~0u;
|
||||
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
|
||||
for (unsigned i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file == VGRF && inst->src[i].nr == spill_reg_nr) {
|
||||
if (scratch_reg == ~0u ||
|
||||
!can_use_scratch_for_source(inst, i, scratch_reg)) {
|
||||
/* We need to unspill anyway so make sure we read the full vec4
|
||||
* in any case. This way, the cached register can be reused
|
||||
* for consecutive instructions that read different channels of
|
||||
* the same vec4.
|
||||
*/
|
||||
scratch_reg = alloc.allocate(alloc.sizes[spill_reg_nr]);
|
||||
src_reg temp = inst->src[i];
|
||||
temp.nr = scratch_reg;
|
||||
temp.offset = 0;
|
||||
temp.swizzle = BRW_SWIZZLE_XYZW;
|
||||
emit_scratch_read(block, inst,
|
||||
dst_reg(temp), inst->src[i], spill_offset);
|
||||
temp.offset = inst->src[i].offset;
|
||||
}
|
||||
assert(scratch_reg != ~0u);
|
||||
inst->src[i].nr = scratch_reg;
|
||||
}
|
||||
}
|
||||
|
||||
if (inst->dst.file == VGRF && inst->dst.nr == spill_reg_nr) {
|
||||
emit_scratch_write(block, inst, spill_offset);
|
||||
scratch_reg = inst->dst.nr;
|
||||
}
|
||||
}
|
||||
|
||||
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
|
||||
}
|
||||
|
||||
} /* namespace brw */
|
||||
|
|
@ -1,213 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2013-2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "brw_vec4_surface_builder.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
namespace {
|
||||
namespace array_utils {
|
||||
/**
|
||||
* Copy one every \p src_stride logical components of the argument into
|
||||
* one every \p dst_stride logical components of the result.
|
||||
*/
|
||||
static src_reg
|
||||
emit_stride(const vec4_builder &bld, const src_reg &src, unsigned size,
|
||||
unsigned dst_stride, unsigned src_stride)
|
||||
{
|
||||
if (src_stride == 1 && dst_stride == 1) {
|
||||
return src;
|
||||
} else {
|
||||
const dst_reg dst = bld.vgrf(src.type,
|
||||
DIV_ROUND_UP(size * dst_stride, 4));
|
||||
|
||||
for (unsigned i = 0; i < size; ++i)
|
||||
bld.MOV(writemask(offset(dst, 8, i * dst_stride / 4),
|
||||
1 << (i * dst_stride % 4)),
|
||||
swizzle(offset(src, 8, i * src_stride / 4),
|
||||
brw_swizzle_for_mask(1 << (i * src_stride % 4))));
|
||||
|
||||
return src_reg(dst);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Convert a VEC4 into an array of registers with the layout expected by
|
||||
* the recipient shared unit. If \p has_simd4x2 is true the argument is
|
||||
* left unmodified in SIMD4x2 form, otherwise it will be rearranged into
|
||||
* a SIMD8 vector.
|
||||
*/
|
||||
static src_reg
|
||||
emit_insert(const vec4_builder &bld, const src_reg &src,
|
||||
unsigned n, bool has_simd4x2)
|
||||
{
|
||||
if (src.file == BAD_FILE || n == 0) {
|
||||
return src_reg();
|
||||
|
||||
} else {
|
||||
/* Pad unused components with zeroes. */
|
||||
const unsigned mask = (1 << n) - 1;
|
||||
const dst_reg tmp = bld.vgrf(src.type);
|
||||
|
||||
bld.MOV(writemask(tmp, mask), src);
|
||||
if (n < 4)
|
||||
bld.MOV(writemask(tmp, ~mask), brw_imm_d(0));
|
||||
|
||||
return emit_stride(bld, src_reg(tmp), n, has_simd4x2 ? 1 : 4, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
namespace brw {
|
||||
namespace surface_access {
|
||||
namespace {
|
||||
using namespace array_utils;
|
||||
|
||||
/**
|
||||
* Generate a send opcode for a surface message and return the
|
||||
* result.
|
||||
*/
|
||||
src_reg
|
||||
emit_send(const vec4_builder &bld, enum opcode op,
|
||||
const src_reg &header,
|
||||
const src_reg &addr, unsigned addr_sz,
|
||||
const src_reg &src, unsigned src_sz,
|
||||
const src_reg &surface,
|
||||
unsigned arg, unsigned ret_sz,
|
||||
brw_predicate pred = BRW_PREDICATE_NONE)
|
||||
{
|
||||
/* Calculate the total number of components of the payload. */
|
||||
const unsigned header_sz = (header.file == BAD_FILE ? 0 : 1);
|
||||
const unsigned sz = header_sz + addr_sz + src_sz;
|
||||
|
||||
/* Construct the payload. */
|
||||
const dst_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
|
||||
unsigned n = 0;
|
||||
|
||||
if (header_sz)
|
||||
bld.exec_all().MOV(offset(payload, 8, n++),
|
||||
retype(header, BRW_REGISTER_TYPE_UD));
|
||||
|
||||
for (unsigned i = 0; i < addr_sz; i++)
|
||||
bld.MOV(offset(payload, 8, n++),
|
||||
offset(retype(addr, BRW_REGISTER_TYPE_UD), 8, i));
|
||||
|
||||
for (unsigned i = 0; i < src_sz; i++)
|
||||
bld.MOV(offset(payload, 8, n++),
|
||||
offset(retype(src, BRW_REGISTER_TYPE_UD), 8, i));
|
||||
|
||||
/* Reduce the dynamically uniform surface index to a single
|
||||
* scalar.
|
||||
*/
|
||||
const src_reg usurface = bld.emit_uniformize(surface);
|
||||
|
||||
/* Emit the message send instruction. */
|
||||
const dst_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, ret_sz);
|
||||
vec4_instruction *inst =
|
||||
bld.emit(op, dst, src_reg(payload), usurface, brw_imm_ud(arg));
|
||||
inst->mlen = sz;
|
||||
inst->size_written = ret_sz * REG_SIZE;
|
||||
inst->header_size = header_sz;
|
||||
inst->predicate = pred;
|
||||
|
||||
return src_reg(dst);
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Emit an untyped surface read opcode. \p dims determines the number
|
||||
* of components of the address and \p size the number of components of
|
||||
* the returned value.
|
||||
*/
|
||||
src_reg
|
||||
emit_untyped_read(const vec4_builder &bld,
|
||||
const src_reg &surface, const src_reg &addr,
|
||||
unsigned dims, unsigned size,
|
||||
brw_predicate pred)
|
||||
{
|
||||
return emit_send(bld, VEC4_OPCODE_UNTYPED_SURFACE_READ, src_reg(),
|
||||
emit_insert(bld, addr, dims, true), 1,
|
||||
src_reg(), 0,
|
||||
surface, size, 1, pred);
|
||||
}
|
||||
|
||||
/**
|
||||
* Emit an untyped surface write opcode. \p dims determines the number
|
||||
* of components of the address and \p size the number of components of
|
||||
* the argument.
|
||||
*/
|
||||
void
|
||||
emit_untyped_write(const vec4_builder &bld, const src_reg &surface,
|
||||
const src_reg &addr, const src_reg &src,
|
||||
unsigned dims, unsigned size,
|
||||
brw_predicate pred)
|
||||
{
|
||||
const bool has_simd4x2 = bld.shader->devinfo->verx10 == 75;
|
||||
emit_send(bld, VEC4_OPCODE_UNTYPED_SURFACE_WRITE, src_reg(),
|
||||
emit_insert(bld, addr, dims, has_simd4x2),
|
||||
has_simd4x2 ? 1 : dims,
|
||||
emit_insert(bld, src, size, has_simd4x2),
|
||||
has_simd4x2 ? 1 : size,
|
||||
surface, size, 0, pred);
|
||||
}
|
||||
|
||||
/**
|
||||
* Emit an untyped surface atomic opcode. \p dims determines the number
|
||||
* of components of the address and \p rsize the number of components of
|
||||
* the returned value (either zero or one).
|
||||
*/
|
||||
src_reg
|
||||
emit_untyped_atomic(const vec4_builder &bld,
|
||||
const src_reg &surface, const src_reg &addr,
|
||||
const src_reg &src0, const src_reg &src1,
|
||||
unsigned dims, unsigned rsize, unsigned op,
|
||||
brw_predicate pred)
|
||||
{
|
||||
const bool has_simd4x2 = bld.shader->devinfo->verx10 == 75;
|
||||
|
||||
/* Zip the components of both sources, they are represented as the X
|
||||
* and Y components of the same vector.
|
||||
*/
|
||||
const unsigned size = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
|
||||
const dst_reg srcs = bld.vgrf(BRW_REGISTER_TYPE_UD);
|
||||
|
||||
if (size >= 1) {
|
||||
bld.MOV(writemask(srcs, WRITEMASK_X),
|
||||
swizzle(src0, BRW_SWIZZLE_XXXX));
|
||||
}
|
||||
|
||||
if (size >= 2) {
|
||||
bld.MOV(writemask(srcs, WRITEMASK_Y),
|
||||
swizzle(src1, BRW_SWIZZLE_XXXX));
|
||||
}
|
||||
|
||||
return emit_send(bld, VEC4_OPCODE_UNTYPED_ATOMIC, src_reg(),
|
||||
emit_insert(bld, addr, dims, has_simd4x2),
|
||||
has_simd4x2 ? 1 : dims,
|
||||
emit_insert(bld, src_reg(srcs), size, has_simd4x2),
|
||||
has_simd4x2 && size ? 1 : size,
|
||||
surface, op, rsize, pred);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -1,53 +0,0 @@
|
|||
/* -*- c++ -*- */
|
||||
/*
|
||||
* Copyright © 2013-2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_VEC4_SURFACE_BUILDER_H
|
||||
#define BRW_VEC4_SURFACE_BUILDER_H
|
||||
|
||||
#include "brw_vec4_builder.h"
|
||||
|
||||
namespace brw {
|
||||
namespace surface_access {
|
||||
src_reg
|
||||
emit_untyped_read(const vec4_builder &bld,
|
||||
const src_reg &surface, const src_reg &addr,
|
||||
unsigned dims, unsigned size,
|
||||
brw_predicate pred = BRW_PREDICATE_NONE);
|
||||
|
||||
void
|
||||
emit_untyped_write(const vec4_builder &bld, const src_reg &surface,
|
||||
const src_reg &addr, const src_reg &src,
|
||||
unsigned dims, unsigned size,
|
||||
brw_predicate pred = BRW_PREDICATE_NONE);
|
||||
|
||||
src_reg
|
||||
emit_untyped_atomic(const vec4_builder &bld,
|
||||
const src_reg &surface, const src_reg &addr,
|
||||
const src_reg &src0, const src_reg &src1,
|
||||
unsigned dims, unsigned rsize, unsigned op,
|
||||
brw_predicate pred = BRW_PREDICATE_NONE);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
|
@ -1,320 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2013 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file brw_vec4_tcs.cpp
|
||||
*
|
||||
* Tessellaton control shader specific code derived from the vec4_visitor class.
|
||||
*/
|
||||
|
||||
#include "intel_nir.h"
|
||||
#include "brw_vec4_tcs.h"
|
||||
|
||||
namespace brw {
|
||||
|
||||
vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler,
|
||||
const struct brw_compile_params *params,
|
||||
const struct brw_tcs_prog_key *key,
|
||||
struct brw_tcs_prog_data *prog_data,
|
||||
const nir_shader *nir,
|
||||
bool debug_enabled)
|
||||
: vec4_visitor(compiler, params, &key->base.tex, &prog_data->base,
|
||||
nir, false, debug_enabled),
|
||||
key(key)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
vec4_tcs_visitor::setup_payload()
|
||||
{
|
||||
int reg = 0;
|
||||
|
||||
/* The payload always contains important data in r0, which contains
|
||||
* the URB handles that are passed on to the URB write at the end
|
||||
* of the thread.
|
||||
*/
|
||||
reg++;
|
||||
|
||||
/* r1.0 - r4.7 may contain the input control point URB handles,
|
||||
* which we use to pull vertex data.
|
||||
*/
|
||||
reg += 4;
|
||||
|
||||
/* Push constants may start at r5.0 */
|
||||
reg = setup_uniforms(reg);
|
||||
|
||||
this->first_non_payload_grf = reg;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
vec4_tcs_visitor::emit_prolog()
|
||||
{
|
||||
invocation_id = src_reg(this, glsl_uint_type());
|
||||
emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));
|
||||
|
||||
/* HS threads are dispatched with the dispatch mask set to 0xFF.
|
||||
* If there are an odd number of output vertices, then the final
|
||||
* HS instance dispatched will only have its bottom half doing real
|
||||
* work, and so we need to disable the upper half:
|
||||
*/
|
||||
if (nir->info.tess.tcs_vertices_out % 2) {
|
||||
emit(CMP(dst_null_d(), invocation_id,
|
||||
brw_imm_ud(nir->info.tess.tcs_vertices_out),
|
||||
BRW_CONDITIONAL_L));
|
||||
|
||||
/* Matching ENDIF is in emit_thread_end() */
|
||||
emit(IF(BRW_PREDICATE_NORMAL));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
vec4_tcs_visitor::emit_thread_end()
|
||||
{
|
||||
vec4_instruction *inst;
|
||||
current_annotation = "thread end";
|
||||
|
||||
if (nir->info.tess.tcs_vertices_out % 2) {
|
||||
emit(BRW_OPCODE_ENDIF);
|
||||
}
|
||||
|
||||
if (devinfo->ver == 7) {
|
||||
struct brw_tcs_prog_data *tcs_prog_data =
|
||||
(struct brw_tcs_prog_data *) prog_data;
|
||||
|
||||
current_annotation = "release input vertices";
|
||||
|
||||
/* Synchronize all threads, so we know that no one is still
|
||||
* using the input URB handles.
|
||||
*/
|
||||
if (tcs_prog_data->instances > 1) {
|
||||
dst_reg header = dst_reg(this, glsl_uvec4_type());
|
||||
emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
|
||||
emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
|
||||
}
|
||||
|
||||
/* Make thread 0 (invocations <1, 0>) release pairs of ICP handles.
|
||||
* We want to compare the bottom half of invocation_id with 0, but
|
||||
* use that truth value for the top half as well. Unfortunately,
|
||||
* we don't have stride in the vec4 world, nor UV immediates in
|
||||
* align16, so we need an opcode to get invocation_id<0,4,0>.
|
||||
*/
|
||||
set_condmod(BRW_CONDITIONAL_Z,
|
||||
emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
|
||||
invocation_id));
|
||||
emit(IF(BRW_PREDICATE_NORMAL));
|
||||
for (unsigned i = 0; i < key->input_vertices; i += 2) {
|
||||
/* If we have an odd number of input vertices, the last will be
|
||||
* unpaired. We don't want to use an interleaved URB write in
|
||||
* that case.
|
||||
*/
|
||||
const bool is_unpaired = i == key->input_vertices - 1;
|
||||
|
||||
dst_reg header(this, glsl_uvec4_type());
|
||||
emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i),
|
||||
brw_imm_ud(is_unpaired));
|
||||
}
|
||||
emit(BRW_OPCODE_ENDIF);
|
||||
}
|
||||
|
||||
inst = emit(TCS_OPCODE_THREAD_END);
|
||||
inst->base_mrf = 14;
|
||||
inst->mlen = 2;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
|
||||
const src_reg &vertex_index,
|
||||
unsigned base_offset,
|
||||
unsigned first_component,
|
||||
const src_reg &indirect_offset)
|
||||
{
|
||||
vec4_instruction *inst;
|
||||
dst_reg temp(this, glsl_ivec4_type());
|
||||
temp.type = dst.type;
|
||||
|
||||
/* Set up the message header to reference the proper parts of the URB */
|
||||
dst_reg header = dst_reg(this, glsl_uvec4_type());
|
||||
inst = emit(VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index,
|
||||
indirect_offset);
|
||||
inst->force_writemask_all = true;
|
||||
|
||||
/* Read into a temporary, ignoring writemasking. */
|
||||
inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
|
||||
inst->offset = base_offset;
|
||||
inst->mlen = 1;
|
||||
inst->base_mrf = -1;
|
||||
|
||||
/* Copy the temporary to the destination to deal with writemasking.
|
||||
*
|
||||
* Also attempt to deal with gl_PointSize being in the .w component.
|
||||
*/
|
||||
if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
|
||||
emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW)));
|
||||
} else {
|
||||
src_reg src = src_reg(temp);
|
||||
src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
|
||||
emit(MOV(dst, src));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
|
||||
unsigned base_offset,
|
||||
unsigned first_component,
|
||||
const src_reg &indirect_offset)
|
||||
{
|
||||
vec4_instruction *inst;
|
||||
|
||||
/* Set up the message header to reference the proper parts of the URB */
|
||||
dst_reg header = dst_reg(this, glsl_uvec4_type());
|
||||
inst = emit(VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header,
|
||||
brw_imm_ud(dst.writemask << first_component), indirect_offset);
|
||||
inst->force_writemask_all = true;
|
||||
|
||||
vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header));
|
||||
read->offset = base_offset;
|
||||
read->mlen = 1;
|
||||
read->base_mrf = -1;
|
||||
|
||||
if (first_component) {
|
||||
/* Read into a temporary and copy with a swizzle and writemask. */
|
||||
read->dst = retype(dst_reg(this, glsl_ivec4_type()), dst.type);
|
||||
emit(MOV(dst, swizzle(src_reg(read->dst),
|
||||
BRW_SWZ_COMP_INPUT(first_component))));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
vec4_tcs_visitor::emit_urb_write(const src_reg &value,
|
||||
unsigned writemask,
|
||||
unsigned base_offset,
|
||||
const src_reg &indirect_offset)
|
||||
{
|
||||
if (writemask == 0)
|
||||
return;
|
||||
|
||||
src_reg message(this, glsl_uvec4_type(), 2);
|
||||
vec4_instruction *inst;
|
||||
|
||||
inst = emit(VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message),
|
||||
brw_imm_ud(writemask), indirect_offset);
|
||||
inst->force_writemask_all = true;
|
||||
inst = emit(MOV(byte_offset(dst_reg(retype(message, value.type)), REG_SIZE),
|
||||
value));
|
||||
inst->force_writemask_all = true;
|
||||
|
||||
inst = emit(VEC4_TCS_OPCODE_URB_WRITE, dst_null_f(), message);
|
||||
inst->offset = base_offset;
|
||||
inst->mlen = 2;
|
||||
inst->base_mrf = -1;
|
||||
}
|
||||
|
||||
void
|
||||
vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
|
||||
{
|
||||
switch (instr->intrinsic) {
|
||||
case nir_intrinsic_load_invocation_id:
|
||||
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_UD),
|
||||
invocation_id));
|
||||
break;
|
||||
case nir_intrinsic_load_primitive_id:
|
||||
emit(TCS_OPCODE_GET_PRIMITIVE_ID,
|
||||
get_nir_def(instr->def, BRW_REGISTER_TYPE_UD));
|
||||
break;
|
||||
case nir_intrinsic_load_patch_vertices_in:
|
||||
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_D),
|
||||
brw_imm_d(key->input_vertices)));
|
||||
break;
|
||||
case nir_intrinsic_load_per_vertex_input: {
|
||||
assert(instr->def.bit_size == 32);
|
||||
src_reg indirect_offset = get_indirect_offset(instr);
|
||||
unsigned imm_offset = nir_intrinsic_base(instr);
|
||||
|
||||
src_reg vertex_index = retype(get_nir_src_imm(instr->src[0]),
|
||||
BRW_REGISTER_TYPE_UD);
|
||||
|
||||
unsigned first_component = nir_intrinsic_component(instr);
|
||||
dst_reg dst = get_nir_def(instr->def, BRW_REGISTER_TYPE_D);
|
||||
dst.writemask = brw_writemask_for_size(instr->num_components);
|
||||
emit_input_urb_read(dst, vertex_index, imm_offset,
|
||||
first_component, indirect_offset);
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_load_input:
|
||||
unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
|
||||
break;
|
||||
case nir_intrinsic_load_output:
|
||||
case nir_intrinsic_load_per_vertex_output: {
|
||||
src_reg indirect_offset = get_indirect_offset(instr);
|
||||
unsigned imm_offset = nir_intrinsic_base(instr);
|
||||
|
||||
dst_reg dst = get_nir_def(instr->def, BRW_REGISTER_TYPE_D);
|
||||
dst.writemask = brw_writemask_for_size(instr->num_components);
|
||||
|
||||
emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr),
|
||||
indirect_offset);
|
||||
break;
|
||||
}
|
||||
case nir_intrinsic_store_output:
|
||||
case nir_intrinsic_store_per_vertex_output: {
|
||||
assert(nir_src_bit_size(instr->src[0]) == 32);
|
||||
src_reg value = get_nir_src(instr->src[0]);
|
||||
unsigned mask = nir_intrinsic_write_mask(instr);
|
||||
unsigned swiz = BRW_SWIZZLE_XYZW;
|
||||
|
||||
src_reg indirect_offset = get_indirect_offset(instr);
|
||||
unsigned imm_offset = nir_intrinsic_base(instr);
|
||||
|
||||
unsigned first_component = nir_intrinsic_component(instr);
|
||||
if (first_component) {
|
||||
assert(swiz == BRW_SWIZZLE_XYZW);
|
||||
swiz = BRW_SWZ_COMP_OUTPUT(first_component);
|
||||
mask = mask << first_component;
|
||||
}
|
||||
|
||||
emit_urb_write(swizzle(value, swiz), mask,
|
||||
imm_offset, indirect_offset);
|
||||
break;
|
||||
}
|
||||
|
||||
case nir_intrinsic_barrier:
|
||||
if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
|
||||
vec4_visitor::nir_emit_intrinsic(instr);
|
||||
if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
|
||||
dst_reg header = dst_reg(this, glsl_uvec4_type());
|
||||
emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
|
||||
emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
|
||||
}
|
||||
break;
|
||||
|
||||
default:
|
||||
vec4_visitor::nir_emit_intrinsic(instr);
|
||||
}
|
||||
}
|
||||
|
||||
} /* namespace brw */
|
||||
|
||||
|
|
@ -1,83 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2013 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file brw_vec4_tcs.h
|
||||
*
|
||||
* The vec4-mode tessellation control shader compiler backend.
|
||||
*/
|
||||
|
||||
#ifndef BRW_VEC4_TCS_H
|
||||
#define BRW_VEC4_TCS_H
|
||||
|
||||
#include "brw_compiler.h"
|
||||
#include "brw_eu.h"
|
||||
#include "brw_vec4.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace brw {
|
||||
|
||||
class vec4_tcs_visitor : public vec4_visitor
|
||||
{
|
||||
public:
|
||||
vec4_tcs_visitor(const struct brw_compiler *compiler,
|
||||
const struct brw_compile_params *params,
|
||||
const struct brw_tcs_prog_key *key,
|
||||
struct brw_tcs_prog_data *prog_data,
|
||||
const nir_shader *nir,
|
||||
bool debug_enabled);
|
||||
|
||||
protected:
|
||||
virtual void setup_payload();
|
||||
virtual void emit_prolog();
|
||||
virtual void emit_thread_end();
|
||||
|
||||
virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
|
||||
|
||||
void emit_input_urb_read(const dst_reg &dst,
|
||||
const src_reg &vertex_index,
|
||||
unsigned base_offset,
|
||||
unsigned first_component,
|
||||
const src_reg &indirect_offset);
|
||||
void emit_output_urb_read(const dst_reg &dst,
|
||||
unsigned base_offset,
|
||||
unsigned first_component,
|
||||
const src_reg &indirect_offset);
|
||||
|
||||
void emit_urb_write(const src_reg &value, unsigned writemask,
|
||||
unsigned base_offset, const src_reg &indirect_offset);
|
||||
|
||||
/* we do not use the normal end-of-shader URB write mechanism -- but every
|
||||
* vec4 stage must provide implementations of these:
|
||||
*/
|
||||
virtual void emit_urb_write_header(int /* mrf */) {}
|
||||
virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */) { return NULL; }
|
||||
|
||||
const struct brw_tcs_prog_key *key;
|
||||
src_reg invocation_id;
|
||||
};
|
||||
|
||||
} /* namespace brw */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* BRW_VEC4_TCS_H */
|
||||
|
|
@ -1,223 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2013 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file brw_vec4_tes.cpp
|
||||
*
|
||||
* Tessellaton evaluation shader specific code derived from the vec4_visitor class.
|
||||
*/
|
||||
|
||||
#include "brw_vec4_tes.h"
|
||||
#include "brw_cfg.h"
|
||||
#include "dev/intel_debug.h"
|
||||
|
||||
namespace brw {
|
||||
|
||||
vec4_tes_visitor::vec4_tes_visitor(const struct brw_compiler *compiler,
|
||||
const struct brw_compile_params *params,
|
||||
const struct brw_tes_prog_key *key,
|
||||
struct brw_tes_prog_data *prog_data,
|
||||
const nir_shader *shader,
|
||||
bool debug_enabled)
|
||||
: vec4_visitor(compiler, params, &key->base.tex, &prog_data->base,
|
||||
shader, false, debug_enabled)
|
||||
{
|
||||
}
|
||||
|
||||
void
|
||||
vec4_tes_visitor::setup_payload()
|
||||
{
|
||||
int reg = 0;
|
||||
|
||||
/* The payload always contains important data in r0 and r1, which contains
|
||||
* the URB handles that are passed on to the URB write at the end
|
||||
* of the thread.
|
||||
*/
|
||||
reg += 2;
|
||||
|
||||
reg = setup_uniforms(reg);
|
||||
|
||||
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
|
||||
for (int i = 0; i < 3; i++) {
|
||||
if (inst->src[i].file != ATTR)
|
||||
continue;
|
||||
|
||||
unsigned slot = inst->src[i].nr + inst->src[i].offset / 16;
|
||||
struct brw_reg grf = brw_vec4_grf(reg + slot / 2, 4 * (slot % 2));
|
||||
grf = stride(grf, 0, 4, 1);
|
||||
grf.swizzle = inst->src[i].swizzle;
|
||||
grf.type = inst->src[i].type;
|
||||
grf.abs = inst->src[i].abs;
|
||||
grf.negate = inst->src[i].negate;
|
||||
inst->src[i] = grf;
|
||||
}
|
||||
}
|
||||
|
||||
reg += 8 * prog_data->urb_read_length;
|
||||
|
||||
this->first_non_payload_grf = reg;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
vec4_tes_visitor::emit_prolog()
|
||||
{
|
||||
input_read_header = src_reg(this, glsl_uvec4_type());
|
||||
emit(TES_OPCODE_CREATE_INPUT_READ_HEADER, dst_reg(input_read_header));
|
||||
|
||||
this->current_annotation = NULL;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
vec4_tes_visitor::emit_urb_write_header(int mrf)
|
||||
{
|
||||
/* No need to do anything for DS; an implied write to this MRF will be
|
||||
* performed by VEC4_VS_OPCODE_URB_WRITE.
|
||||
*/
|
||||
(void) mrf;
|
||||
}
|
||||
|
||||
|
||||
vec4_instruction *
|
||||
vec4_tes_visitor::emit_urb_write_opcode(bool complete)
|
||||
{
|
||||
vec4_instruction *inst = emit(VEC4_VS_OPCODE_URB_WRITE);
|
||||
inst->urb_write_flags = complete ?
|
||||
BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS;
|
||||
|
||||
return inst;
|
||||
}
|
||||
|
||||
void
|
||||
vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
|
||||
{
|
||||
const struct brw_tes_prog_data *tes_prog_data =
|
||||
(const struct brw_tes_prog_data *) prog_data;
|
||||
|
||||
switch (instr->intrinsic) {
|
||||
case nir_intrinsic_load_tess_coord:
|
||||
/* gl_TessCoord is part of the payload in g1 channels 0-2 and 4-6. */
|
||||
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F),
|
||||
src_reg(brw_vec8_grf(1, 0))));
|
||||
break;
|
||||
case nir_intrinsic_load_tess_level_outer:
|
||||
if (tes_prog_data->domain == INTEL_TESS_DOMAIN_ISOLINE) {
|
||||
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F),
|
||||
swizzle(src_reg(ATTR, 1, glsl_vec4_type()),
|
||||
BRW_SWIZZLE_ZWZW)));
|
||||
} else {
|
||||
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F),
|
||||
swizzle(src_reg(ATTR, 1, glsl_vec4_type()),
|
||||
BRW_SWIZZLE_WZYX)));
|
||||
}
|
||||
break;
|
||||
case nir_intrinsic_load_tess_level_inner:
|
||||
if (tes_prog_data->domain == INTEL_TESS_DOMAIN_QUAD) {
|
||||
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F),
|
||||
swizzle(src_reg(ATTR, 0, glsl_vec4_type()),
|
||||
BRW_SWIZZLE_WZYX)));
|
||||
} else {
|
||||
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F),
|
||||
src_reg(ATTR, 1, glsl_float_type())));
|
||||
}
|
||||
break;
|
||||
case nir_intrinsic_load_primitive_id:
|
||||
emit(TES_OPCODE_GET_PRIMITIVE_ID,
|
||||
get_nir_def(instr->def, BRW_REGISTER_TYPE_UD));
|
||||
break;
|
||||
|
||||
case nir_intrinsic_load_input:
|
||||
case nir_intrinsic_load_per_vertex_input: {
|
||||
assert(instr->def.bit_size == 32);
|
||||
src_reg indirect_offset = get_indirect_offset(instr);
|
||||
unsigned imm_offset = instr->const_index[0];
|
||||
src_reg header = input_read_header;
|
||||
unsigned first_component = nir_intrinsic_component(instr);
|
||||
|
||||
if (indirect_offset.file != BAD_FILE) {
|
||||
src_reg clamped_indirect_offset = src_reg(this, glsl_uvec4_type());
|
||||
|
||||
/* Page 190 of "Volume 7: 3D Media GPGPU Engine (Haswell)" says the
|
||||
* valid range of the offset is [0, 0FFFFFFFh].
|
||||
*/
|
||||
emit_minmax(BRW_CONDITIONAL_L,
|
||||
dst_reg(clamped_indirect_offset),
|
||||
retype(indirect_offset, BRW_REGISTER_TYPE_UD),
|
||||
brw_imm_ud(0x0fffffffu));
|
||||
|
||||
header = src_reg(this, glsl_uvec4_type());
|
||||
emit(TES_OPCODE_ADD_INDIRECT_URB_OFFSET, dst_reg(header),
|
||||
input_read_header, clamped_indirect_offset);
|
||||
} else {
|
||||
/* Arbitrarily only push up to 24 vec4 slots worth of data,
|
||||
* which is 12 registers (since each holds 2 vec4 slots).
|
||||
*/
|
||||
const unsigned max_push_slots = 24;
|
||||
if (imm_offset < max_push_slots) {
|
||||
src_reg src = src_reg(ATTR, imm_offset, glsl_ivec4_type());
|
||||
src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
|
||||
|
||||
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_D), src));
|
||||
|
||||
prog_data->urb_read_length =
|
||||
MAX2(prog_data->urb_read_length,
|
||||
DIV_ROUND_UP(imm_offset + 1, 2));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
dst_reg temp(this, glsl_ivec4_type());
|
||||
vec4_instruction *read =
|
||||
emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
|
||||
read->offset = imm_offset;
|
||||
read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
|
||||
|
||||
src_reg src = src_reg(temp);
|
||||
src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
|
||||
|
||||
/* Copy to target. We might end up with some funky writemasks landing
|
||||
* in here, but we really don't want them in the above pseudo-ops.
|
||||
*/
|
||||
dst_reg dst = get_nir_def(instr->def, BRW_REGISTER_TYPE_D);
|
||||
dst.writemask = brw_writemask_for_size(instr->num_components);
|
||||
emit(MOV(dst, src));
|
||||
break;
|
||||
}
|
||||
default:
|
||||
vec4_visitor::nir_emit_intrinsic(instr);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
vec4_tes_visitor::emit_thread_end()
|
||||
{
|
||||
/* For DS, we always end the thread by emitting a single vertex.
|
||||
* emit_urb_write_opcode() will take care of setting the eot flag on the
|
||||
* SEND instruction.
|
||||
*/
|
||||
emit_vertex();
|
||||
}
|
||||
|
||||
} /* namespace brw */
|
||||
|
|
@ -1,65 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2013 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file brw_vec4_tes.h
|
||||
*
|
||||
* The vec4 mode tessellation evaluation shader compiler backend.
|
||||
*/
|
||||
|
||||
#ifndef BRW_VEC4_TES_H
|
||||
#define BRW_VEC4_TES_H
|
||||
|
||||
#include "brw_vec4.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
namespace brw {
|
||||
|
||||
class vec4_tes_visitor : public vec4_visitor
|
||||
{
|
||||
public:
|
||||
vec4_tes_visitor(const struct brw_compiler *compiler,
|
||||
const struct brw_compile_params *params,
|
||||
const struct brw_tes_prog_key *key,
|
||||
struct brw_tes_prog_data *prog_data,
|
||||
const nir_shader *nir,
|
||||
bool debug_enabled);
|
||||
|
||||
protected:
|
||||
virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
|
||||
|
||||
virtual void setup_payload();
|
||||
virtual void emit_prolog();
|
||||
virtual void emit_thread_end();
|
||||
|
||||
virtual void emit_urb_write_header(int mrf);
|
||||
virtual vec4_instruction *emit_urb_write_opcode(bool complete);
|
||||
|
||||
private:
|
||||
src_reg input_read_header;
|
||||
};
|
||||
|
||||
} /* namespace brw */
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* BRW_VEC4_TES_H */
|
||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,58 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2006 - 2015 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef BRW_VEC4_VS_VISITOR_H
|
||||
#define BRW_VEC4_VS_VISITOR_H
|
||||
|
||||
#include "brw_vec4.h"
|
||||
|
||||
namespace brw {
|
||||
|
||||
class vec4_vs_visitor : public vec4_visitor
|
||||
{
|
||||
public:
|
||||
vec4_vs_visitor(const struct brw_compiler *compiler,
|
||||
const struct brw_compile_params *params,
|
||||
const struct brw_vs_prog_key *key,
|
||||
struct brw_vs_prog_data *vs_prog_data,
|
||||
const nir_shader *shader,
|
||||
bool debug_enabled);
|
||||
|
||||
protected:
|
||||
virtual void setup_payload();
|
||||
virtual void emit_prolog();
|
||||
virtual void emit_thread_end();
|
||||
virtual void emit_urb_write_header(int mrf);
|
||||
virtual void emit_urb_slot(dst_reg reg, int varying);
|
||||
virtual vec4_instruction *emit_urb_write_opcode(bool complete);
|
||||
|
||||
private:
|
||||
int setup_attributes(int payload_reg);
|
||||
|
||||
const struct brw_vs_prog_key *const key;
|
||||
struct brw_vs_prog_data * const vs_prog_data;
|
||||
};
|
||||
|
||||
} /* namespace brw */
|
||||
|
||||
#endif /* BRW_VEC4_VS_VISITOR_H */
|
||||
|
|
@ -1,108 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2013 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
||||
* DEALINGS IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
#include "brw_vec4_vs.h"
|
||||
#include "dev/intel_debug.h"
|
||||
|
||||
namespace brw {
|
||||
|
||||
void
|
||||
vec4_vs_visitor::emit_prolog()
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
vec4_vs_visitor::emit_urb_write_header(int mrf)
|
||||
{
|
||||
/* No need to do anything for VS; an implied write to this MRF will be
|
||||
* performed by VEC4_VS_OPCODE_URB_WRITE.
|
||||
*/
|
||||
(void) mrf;
|
||||
}
|
||||
|
||||
|
||||
vec4_instruction *
|
||||
vec4_vs_visitor::emit_urb_write_opcode(bool complete)
|
||||
{
|
||||
vec4_instruction *inst = emit(VEC4_VS_OPCODE_URB_WRITE);
|
||||
inst->urb_write_flags = complete ?
|
||||
BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS;
|
||||
|
||||
return inst;
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
vec4_vs_visitor::emit_urb_slot(dst_reg reg, int varying)
|
||||
{
|
||||
reg.type = BRW_REGISTER_TYPE_F;
|
||||
output_reg[varying][0].type = reg.type;
|
||||
|
||||
switch (varying) {
|
||||
case VARYING_SLOT_COL0:
|
||||
case VARYING_SLOT_COL1:
|
||||
case VARYING_SLOT_BFC0:
|
||||
case VARYING_SLOT_BFC1: {
|
||||
/* These built-in varyings are only supported in compatibility mode,
|
||||
* and we only support GS in core profile. So, this must be a vertex
|
||||
* shader.
|
||||
*/
|
||||
vec4_instruction *inst = emit_generic_urb_slot(reg, varying, 0);
|
||||
if (inst && key->clamp_vertex_color)
|
||||
inst->saturate = true;
|
||||
break;
|
||||
}
|
||||
default:
|
||||
return vec4_visitor::emit_urb_slot(reg, varying);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
void
|
||||
vec4_vs_visitor::emit_thread_end()
|
||||
{
|
||||
/* For VS, we always end the thread by emitting a single vertex.
|
||||
* emit_urb_write_opcode() will take care of setting the eot flag on the
|
||||
* SEND instruction.
|
||||
*/
|
||||
emit_vertex();
|
||||
}
|
||||
|
||||
|
||||
vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler,
|
||||
const struct brw_compile_params *params,
|
||||
const struct brw_vs_prog_key *key,
|
||||
struct brw_vs_prog_data *vs_prog_data,
|
||||
const nir_shader *shader,
|
||||
bool debug_enabled)
|
||||
: vec4_visitor(compiler, params, &key->base.tex, &vs_prog_data->base,
|
||||
shader, false /* no_spills */, debug_enabled),
|
||||
key(key),
|
||||
vs_prog_data(vs_prog_data)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
} /* namespace brw */
|
||||
|
|
@ -1,702 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2014 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
* This code is based on original work by Ilia Mirkin.
|
||||
*/
|
||||
|
||||
/**
|
||||
* \file gfx6_gs_visitor.cpp
|
||||
*
|
||||
* Gfx6 geometry shader implementation
|
||||
*/
|
||||
|
||||
#include "gfx6_gs_visitor.h"
|
||||
#include "brw_eu.h"
|
||||
#include "brw_prim.h"
|
||||
|
||||
namespace brw {
|
||||
|
||||
void
|
||||
gfx6_gs_visitor::emit_prolog()
|
||||
{
|
||||
vec4_gs_visitor::emit_prolog();
|
||||
|
||||
/* Gfx6 geometry shaders require to allocate an initial VUE handle via
|
||||
* FF_SYNC message, however the documentation remarks that only one thread
|
||||
* can write to the URB simultaneously and the FF_SYNC message provides the
|
||||
* synchronization mechanism for this, so using this message effectively
|
||||
* stalls the thread until it is its turn to write to the URB. Because of
|
||||
* this, the best way to implement geometry shader algorithms in gfx6 is to
|
||||
* execute the algorithm before the FF_SYNC message to maximize parallelism.
|
||||
*
|
||||
* To achieve this we buffer the geometry shader outputs for each emitted
|
||||
* vertex in vertex_output during operation. Then, when we have processed
|
||||
* the last vertex (that is, at thread end time), we send the FF_SYNC
|
||||
* message to allocate the initial VUE handle and write all buffered vertex
|
||||
* data to the URB in one go.
|
||||
*
|
||||
* For each emitted vertex, vertex_output will hold vue_map.num_slots
|
||||
* data items plus one additional item to hold required flags
|
||||
* (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
|
||||
* which come right after the data items for that vertex. Vertex data and
|
||||
* flags for the next vertex come right after the data items and flags for
|
||||
* the previous vertex.
|
||||
*/
|
||||
this->current_annotation = "gfx6 prolog";
|
||||
this->vertex_output = src_reg(this,
|
||||
glsl_uint_type(),
|
||||
(prog_data->vue_map.num_slots + 1) *
|
||||
nir->info.gs.vertices_out);
|
||||
this->vertex_output_offset = src_reg(this, glsl_uint_type());
|
||||
emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
|
||||
|
||||
/* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
|
||||
* so initialize it once to R0.
|
||||
*/
|
||||
vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
|
||||
retype(brw_vec8_grf(0, 0),
|
||||
BRW_REGISTER_TYPE_UD)));
|
||||
inst->force_writemask_all = true;
|
||||
|
||||
/* This will be used as a temporary to store writeback data of FF_SYNC
|
||||
* and URB_WRITE messages.
|
||||
*/
|
||||
this->temp = src_reg(this, glsl_uint_type());
|
||||
|
||||
/* This will be used to know when we are processing the first vertex of
|
||||
* a primitive. We will set this to URB_WRITE_PRIM_START only when we know
|
||||
* that we are processing the first vertex in the primitive and to zero
|
||||
* otherwise. This way we can use its value directly in the URB write
|
||||
* headers.
|
||||
*/
|
||||
this->first_vertex = src_reg(this, glsl_uint_type());
|
||||
emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START)));
|
||||
|
||||
/* The FF_SYNC message requires to know the number of primitives generated,
|
||||
* so keep a counter for this.
|
||||
*/
|
||||
this->prim_count = src_reg(this, glsl_uint_type());
|
||||
emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u)));
|
||||
|
||||
if (gs_prog_data->num_transform_feedback_bindings) {
|
||||
/* Create a virtual register to hold destination indices in SOL */
|
||||
this->destination_indices = src_reg(this, glsl_uvec4_type());
|
||||
/* Create a virtual register to hold number of written primitives */
|
||||
this->sol_prim_written = src_reg(this, glsl_uint_type());
|
||||
/* Create a virtual register to hold Streamed Vertex Buffer Indices */
|
||||
this->svbi = src_reg(this, glsl_uvec4_type());
|
||||
/* Create a virtual register to hold max values of SVBI */
|
||||
this->max_svbi = src_reg(this, glsl_uvec4_type());
|
||||
emit(MOV(dst_reg(this->max_svbi),
|
||||
src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
|
||||
}
|
||||
|
||||
/* PrimitveID is delivered in r0.1 of the thread payload. If the program
|
||||
* needs it we have to move it to a separate register where we can map
|
||||
* the attribute.
|
||||
*
|
||||
* Notice that we cannot use a virtual register for this, because we need to
|
||||
* map all input attributes to hardware registers in setup_payload(),
|
||||
* which happens before virtual registers are mapped to hardware registers.
|
||||
* We could work around that issue if we were able to compute the first
|
||||
* non-payload register here and move the PrimitiveID information to that
|
||||
* register, but we can't because at this point we don't know the final
|
||||
* number uniforms that will be included in the payload.
|
||||
*
|
||||
* So, what we do is to place PrimitiveID information in r1, which is always
|
||||
* delivered as part of the payload, but its only populated with data
|
||||
* relevant for transform feedback when we set GFX6_GS_SVBI_PAYLOAD_ENABLE
|
||||
* in the 3DSTATE_GS state packet. That information can be obtained by other
|
||||
* means though, so we can safely use r1 for this purpose.
|
||||
*/
|
||||
if (gs_prog_data->include_primitive_id) {
|
||||
this->primitive_id =
|
||||
src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
|
||||
emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
gfx6_gs_visitor::gs_emit_vertex(int stream_id)
|
||||
{
|
||||
this->current_annotation = "gfx6 emit vertex";
|
||||
|
||||
/* Buffer all output slots for this vertex in vertex_output */
|
||||
for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
|
||||
int varying = prog_data->vue_map.slot_to_varying[slot];
|
||||
if (varying != VARYING_SLOT_PSIZ) {
|
||||
dst_reg dst(this->vertex_output);
|
||||
dst.reladdr = ralloc(mem_ctx, src_reg);
|
||||
memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
|
||||
emit_urb_slot(dst, varying);
|
||||
} else {
|
||||
/* The PSIZ slot can pack multiple varyings in different channels
|
||||
* and emit_urb_slot() will produce a MOV instruction for each of
|
||||
* them. Since we are writing to an array, that will translate to
|
||||
* possibly multiple MOV instructions with an array destination and
|
||||
* each will generate a scratch write with the same offset into
|
||||
* scratch space (thus, each one overwriting the previous). This is
|
||||
* not what we want. What we will do instead is emit PSIZ to a
|
||||
* a regular temporary register, then move that register into the
|
||||
* array. This way we only have one instruction with an array
|
||||
* destination and we only produce a single scratch write.
|
||||
*/
|
||||
dst_reg tmp = dst_reg(src_reg(this, glsl_uvec4_type()));
|
||||
emit_urb_slot(tmp, varying);
|
||||
dst_reg dst(this->vertex_output);
|
||||
dst.reladdr = ralloc(mem_ctx, src_reg);
|
||||
memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
|
||||
vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
|
||||
inst->force_writemask_all = true;
|
||||
}
|
||||
|
||||
emit(ADD(dst_reg(this->vertex_output_offset),
|
||||
this->vertex_output_offset, brw_imm_ud(1u)));
|
||||
}
|
||||
|
||||
/* Now buffer flags for this vertex */
|
||||
dst_reg dst(this->vertex_output);
|
||||
dst.reladdr = ralloc(mem_ctx, src_reg);
|
||||
memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
|
||||
if (nir->info.gs.output_primitive == MESA_PRIM_POINTS) {
|
||||
/* If we are outputting points, then every vertex has PrimStart and
|
||||
* PrimEnd set.
|
||||
*/
|
||||
emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
|
||||
URB_WRITE_PRIM_START | URB_WRITE_PRIM_END)));
|
||||
emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
|
||||
} else {
|
||||
/* Otherwise, we can only set the PrimStart flag, which we have stored
|
||||
* in the first_vertex register. We will have to wait until we execute
|
||||
* EndPrimitive() or we end the thread to set the PrimEnd flag on a
|
||||
* vertex.
|
||||
*/
|
||||
emit(OR(dst, this->first_vertex,
|
||||
brw_imm_ud(gs_prog_data->output_topology <<
|
||||
URB_WRITE_PRIM_TYPE_SHIFT)));
|
||||
emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u)));
|
||||
}
|
||||
emit(ADD(dst_reg(this->vertex_output_offset),
|
||||
this->vertex_output_offset, brw_imm_ud(1u)));
|
||||
}
|
||||
|
||||
void
|
||||
gfx6_gs_visitor::gs_end_primitive()
|
||||
{
|
||||
this->current_annotation = "gfx6 end primitive";
|
||||
/* Calling EndPrimitive() is optional for point output. In this case we set
|
||||
* the PrimEnd flag when we process EmitVertex().
|
||||
*/
|
||||
if (nir->info.gs.output_primitive == MESA_PRIM_POINTS)
|
||||
return;
|
||||
|
||||
/* Otherwise we know that the last vertex we have processed was the last
|
||||
* vertex in the primitive and we need to set its PrimEnd flag, so do this
|
||||
* unless we haven't emitted that vertex at all (vertex_count != 0).
|
||||
*
|
||||
* Notice that we have already incremented vertex_count when we processed
|
||||
* the last emit_vertex, so we need to take that into account in the
|
||||
* comparison below (hence the num_output_vertices + 1 in the comparison
|
||||
* below).
|
||||
*/
|
||||
unsigned num_output_vertices = nir->info.gs.vertices_out;
|
||||
emit(CMP(dst_null_ud(), this->vertex_count,
|
||||
brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L));
|
||||
vec4_instruction *inst = emit(CMP(dst_null_ud(),
|
||||
this->vertex_count, brw_imm_ud(0u),
|
||||
BRW_CONDITIONAL_NEQ));
|
||||
inst->predicate = BRW_PREDICATE_NORMAL;
|
||||
emit(IF(BRW_PREDICATE_NORMAL));
|
||||
{
|
||||
/* vertex_output_offset is already pointing at the first entry of the
|
||||
* next vertex. So subtract 1 to modify the flags for the previous
|
||||
* vertex.
|
||||
*/
|
||||
src_reg offset(this, glsl_uint_type());
|
||||
emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
|
||||
|
||||
src_reg dst(this->vertex_output);
|
||||
dst.reladdr = ralloc(mem_ctx, src_reg);
|
||||
memcpy(dst.reladdr, &offset, sizeof(src_reg));
|
||||
|
||||
emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END)));
|
||||
emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
|
||||
|
||||
/* Set the first vertex flag to indicate that the next vertex will start
|
||||
* a primitive.
|
||||
*/
|
||||
emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START)));
|
||||
}
|
||||
emit(BRW_OPCODE_ENDIF);
|
||||
}
|
||||
|
||||
void
|
||||
gfx6_gs_visitor::emit_urb_write_header(int mrf)
|
||||
{
|
||||
this->current_annotation = "gfx6 urb header";
|
||||
/* Compute offset of the flags for the current vertex in vertex_output and
|
||||
* write them in dw2 of the message header.
|
||||
*
|
||||
* Notice that by the time that emit_thread_end() calls here
|
||||
* vertex_output_offset should point to the first data item of the current
|
||||
* vertex in vertex_output, thus we only need to add the number of output
|
||||
* slots per vertex to that offset to obtain the flags data offset.
|
||||
*/
|
||||
src_reg flags_offset(this, glsl_uint_type());
|
||||
emit(ADD(dst_reg(flags_offset),
|
||||
this->vertex_output_offset,
|
||||
brw_imm_d(prog_data->vue_map.num_slots)));
|
||||
|
||||
src_reg flags_data(this->vertex_output);
|
||||
flags_data.reladdr = ralloc(mem_ctx, src_reg);
|
||||
memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
|
||||
|
||||
emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
|
||||
}
|
||||
|
||||
static unsigned
|
||||
align_interleaved_urb_mlen(unsigned mlen)
|
||||
{
|
||||
/* URB data written (does not include the message header reg) must
|
||||
* be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
|
||||
* section 5.4.3.2.2: URB_INTERLEAVED.
|
||||
*/
|
||||
if ((mlen % 2) != 1)
|
||||
mlen++;
|
||||
return mlen;
|
||||
}
|
||||
|
||||
void
|
||||
gfx6_gs_visitor::emit_snb_gs_urb_write_opcode(bool complete, int base_mrf,
|
||||
int last_mrf, int urb_offset)
|
||||
{
|
||||
vec4_instruction *inst = NULL;
|
||||
|
||||
if (!complete) {
|
||||
/* If the vertex is not complete we don't have to do anything special */
|
||||
inst = emit(VEC4_GS_OPCODE_URB_WRITE);
|
||||
inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
|
||||
} else {
|
||||
/* Otherwise we always request to allocate a new VUE handle. If this is
|
||||
* the last write before the EOT message and the new handle never gets
|
||||
* used it will be dereferenced when we send the EOT message. This is
|
||||
* necessary to avoid different setups for the EOT message (one for the
|
||||
* case when there is no output and another for the case when there is)
|
||||
* which would require to end the program with an IF/ELSE/ENDIF block,
|
||||
* something we do not want.
|
||||
*/
|
||||
inst = emit(VEC4_GS_OPCODE_URB_WRITE_ALLOCATE);
|
||||
inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
|
||||
inst->dst = dst_reg(MRF, base_mrf);
|
||||
inst->src[0] = this->temp;
|
||||
}
|
||||
|
||||
inst->base_mrf = base_mrf;
|
||||
inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf);
|
||||
inst->offset = urb_offset;
|
||||
}
|
||||
|
||||
void
|
||||
gfx6_gs_visitor::emit_thread_end()
|
||||
{
|
||||
/* Make sure the current primitive is ended: we know it is not ended when
|
||||
* first_vertex is not zero. This is only relevant for outputs other than
|
||||
* points because in the point case we set PrimEnd on all vertices.
|
||||
*/
|
||||
if (nir->info.gs.output_primitive != MESA_PRIM_POINTS) {
|
||||
emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z));
|
||||
emit(IF(BRW_PREDICATE_NORMAL));
|
||||
gs_end_primitive();
|
||||
emit(BRW_OPCODE_ENDIF);
|
||||
}
|
||||
|
||||
/* Here we have to:
|
||||
* 1) Emit an FF_SYNC message to obtain an initial VUE handle.
|
||||
* 2) Loop over all buffered vertex data and write it to corresponding
|
||||
* URB entries.
|
||||
* 3) Allocate new VUE handles for all vertices other than the first.
|
||||
* 4) Send a final EOT message.
|
||||
*/
|
||||
|
||||
/* MRF 0 is reserved for the debugger, so start with message header
|
||||
* in MRF 1.
|
||||
*/
|
||||
int base_mrf = 1;
|
||||
|
||||
/* In the process of generating our URB write message contents, we
|
||||
* may need to unspill a register or load from an array. Those
|
||||
* reads would use MRFs 21..23
|
||||
*/
|
||||
int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver);
|
||||
|
||||
/* Issue the FF_SYNC message and obtain the initial VUE handle. */
|
||||
this->current_annotation = "gfx6 thread end: ff_sync";
|
||||
|
||||
vec4_instruction *inst = NULL;
|
||||
if (gs_prog_data->num_transform_feedback_bindings) {
|
||||
src_reg sol_temp(this, glsl_uvec4_type());
|
||||
emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
|
||||
dst_reg(this->svbi),
|
||||
this->vertex_count,
|
||||
this->prim_count,
|
||||
sol_temp);
|
||||
inst = emit(GS_OPCODE_FF_SYNC,
|
||||
dst_reg(this->temp), this->prim_count, this->svbi);
|
||||
} else {
|
||||
inst = emit(GS_OPCODE_FF_SYNC,
|
||||
dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
|
||||
}
|
||||
inst->base_mrf = base_mrf;
|
||||
|
||||
emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G));
|
||||
emit(IF(BRW_PREDICATE_NORMAL));
|
||||
{
|
||||
/* Loop over all buffered vertices and emit URB write messages */
|
||||
this->current_annotation = "gfx6 thread end: urb writes init";
|
||||
src_reg vertex(this, glsl_uint_type());
|
||||
emit(MOV(dst_reg(vertex), brw_imm_ud(0u)));
|
||||
emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
|
||||
|
||||
this->current_annotation = "gfx6 thread end: urb writes";
|
||||
emit(BRW_OPCODE_DO);
|
||||
{
|
||||
emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
|
||||
inst = emit(BRW_OPCODE_BREAK);
|
||||
inst->predicate = BRW_PREDICATE_NORMAL;
|
||||
|
||||
/* First we prepare the message header */
|
||||
emit_urb_write_header(base_mrf);
|
||||
|
||||
/* Then add vertex data to the message in interleaved fashion */
|
||||
int slot = 0;
|
||||
bool complete = false;
|
||||
do {
|
||||
int mrf = base_mrf + 1;
|
||||
|
||||
/* URB offset is in URB row increments, and each of our MRFs is half
|
||||
* of one of those, since we're doing interleaved writes.
|
||||
*/
|
||||
int urb_offset = slot / 2;
|
||||
|
||||
for (; slot < prog_data->vue_map.num_slots; ++slot) {
|
||||
int varying = prog_data->vue_map.slot_to_varying[slot];
|
||||
current_annotation = output_reg_annotation[varying];
|
||||
|
||||
/* Compute offset of this slot for the current vertex
|
||||
* in vertex_output
|
||||
*/
|
||||
src_reg data(this->vertex_output);
|
||||
data.reladdr = ralloc(mem_ctx, src_reg);
|
||||
memcpy(data.reladdr, &this->vertex_output_offset,
|
||||
sizeof(src_reg));
|
||||
|
||||
/* Copy this slot to the appropriate message register */
|
||||
dst_reg reg = dst_reg(MRF, mrf);
|
||||
reg.type = output_reg[varying][0].type;
|
||||
data.type = reg.type;
|
||||
inst = emit(MOV(reg, data));
|
||||
inst->force_writemask_all = true;
|
||||
|
||||
mrf++;
|
||||
emit(ADD(dst_reg(this->vertex_output_offset),
|
||||
this->vertex_output_offset, brw_imm_ud(1u)));
|
||||
|
||||
/* If this was max_usable_mrf, we can't fit anything more into
|
||||
* this URB WRITE. Same if we reached the max. message length.
|
||||
*/
|
||||
if (mrf > max_usable_mrf ||
|
||||
align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
|
||||
slot++;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
complete = slot >= prog_data->vue_map.num_slots;
|
||||
emit_snb_gs_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
|
||||
} while (!complete);
|
||||
|
||||
/* Skip over the flags data item so that vertex_output_offset points
|
||||
* to the first data item of the next vertex, so that we can start
|
||||
* writing the next vertex.
|
||||
*/
|
||||
emit(ADD(dst_reg(this->vertex_output_offset),
|
||||
this->vertex_output_offset, brw_imm_ud(1u)));
|
||||
|
||||
emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u)));
|
||||
}
|
||||
emit(BRW_OPCODE_WHILE);
|
||||
|
||||
if (gs_prog_data->num_transform_feedback_bindings)
|
||||
xfb_write();
|
||||
}
|
||||
emit(BRW_OPCODE_ENDIF);
|
||||
|
||||
/* Finally, emit EOT message.
|
||||
*
|
||||
* In gfx6 we need to end the thread differently depending on whether we have
|
||||
* emitted at least one vertex or not. In case we did, the EOT message must
|
||||
* always include the COMPLETE flag or else the GPU hangs. If we have not
|
||||
* produced any output we can't use the COMPLETE flag.
|
||||
*
|
||||
* However, this would lead us to end the program with an ENDIF opcode,
|
||||
* which we want to avoid, so what we do is that we always request a new
|
||||
* VUE handle every time, even if GS produces no output.
|
||||
* With this we make sure that whether we have emitted at least one vertex
|
||||
* or none at all, we have to finish the thread without writing to the URB,
|
||||
* which works for both cases by setting the COMPLETE and UNUSED flags in
|
||||
* the EOT message.
|
||||
*/
|
||||
this->current_annotation = "gfx6 thread end: EOT";
|
||||
|
||||
if (gs_prog_data->num_transform_feedback_bindings) {
|
||||
/* When emitting EOT, set SONumPrimsWritten Increment Value. */
|
||||
src_reg data(this, glsl_uint_type());
|
||||
emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
|
||||
emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
|
||||
emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
|
||||
}
|
||||
|
||||
inst = emit(GS_OPCODE_THREAD_END);
|
||||
inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
|
||||
inst->base_mrf = base_mrf;
|
||||
inst->mlen = 1;
|
||||
}
|
||||
|
||||
void
|
||||
gfx6_gs_visitor::setup_payload()
|
||||
{
|
||||
int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
|
||||
|
||||
/* Attributes are going to be interleaved, so one register contains two
|
||||
* attribute slots.
|
||||
*/
|
||||
int attributes_per_reg = 2;
|
||||
|
||||
/* If a geometry shader tries to read from an input that wasn't written by
|
||||
* the vertex shader, that produces undefined results, but it shouldn't
|
||||
* crash anything. So initialize attribute_map to zeros--that ensures that
|
||||
* these undefined results are read from r0.
|
||||
*/
|
||||
memset(attribute_map, 0, sizeof(attribute_map));
|
||||
|
||||
int reg = 0;
|
||||
|
||||
/* The payload always contains important data in r0. */
|
||||
reg++;
|
||||
|
||||
/* r1 is always part of the payload and it holds information relevant
|
||||
* for transform feedback when we set the GFX6_GS_SVBI_PAYLOAD_ENABLE bit in
|
||||
* the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
|
||||
* information (and move the original value to a virtual register if
|
||||
* necessary).
|
||||
*/
|
||||
if (gs_prog_data->include_primitive_id)
|
||||
attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
|
||||
reg++;
|
||||
|
||||
reg = setup_uniforms(reg);
|
||||
|
||||
reg = setup_varying_inputs(reg, attributes_per_reg);
|
||||
|
||||
this->first_non_payload_grf = reg;
|
||||
}
|
||||
|
||||
void
|
||||
gfx6_gs_visitor::xfb_write()
|
||||
{
|
||||
unsigned num_verts;
|
||||
|
||||
switch (gs_prog_data->output_topology) {
|
||||
case _3DPRIM_POINTLIST:
|
||||
num_verts = 1;
|
||||
break;
|
||||
case _3DPRIM_LINELIST:
|
||||
case _3DPRIM_LINESTRIP:
|
||||
case _3DPRIM_LINELOOP:
|
||||
num_verts = 2;
|
||||
break;
|
||||
case _3DPRIM_TRILIST:
|
||||
case _3DPRIM_TRIFAN:
|
||||
case _3DPRIM_TRISTRIP:
|
||||
case _3DPRIM_RECTLIST:
|
||||
num_verts = 3;
|
||||
break;
|
||||
case _3DPRIM_QUADLIST:
|
||||
case _3DPRIM_QUADSTRIP:
|
||||
case _3DPRIM_POLYGON:
|
||||
num_verts = 3;
|
||||
break;
|
||||
default:
|
||||
unreachable("Unexpected primitive type in Gfx6 SOL program.");
|
||||
}
|
||||
|
||||
this->current_annotation = "gfx6 thread end: svb writes init";
|
||||
|
||||
emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
|
||||
emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u)));
|
||||
|
||||
/* Check that at least one primitive can be written
|
||||
*
|
||||
* Note: since we use the binding table to keep track of buffer offsets
|
||||
* and stride, the GS doesn't need to keep track of a separate pointer
|
||||
* into each buffer; it uses a single pointer which increments by 1 for
|
||||
* each vertex. So we use SVBI0 for this pointer, regardless of whether
|
||||
* transform feedback is in interleaved or separate attribs mode.
|
||||
*/
|
||||
src_reg sol_temp(this, glsl_uvec4_type());
|
||||
emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));
|
||||
|
||||
/* Compare SVBI calculated number with the maximum value, which is
|
||||
* in R1.4 (previously saved in this->max_svbi) for gfx6.
|
||||
*/
|
||||
emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
|
||||
emit(IF(BRW_PREDICATE_NORMAL));
|
||||
{
|
||||
vec4_instruction *inst = emit(MOV(dst_reg(destination_indices),
|
||||
brw_imm_vf4(brw_float_to_vf(0.0),
|
||||
brw_float_to_vf(1.0),
|
||||
brw_float_to_vf(2.0),
|
||||
brw_float_to_vf(0.0))));
|
||||
inst->force_writemask_all = true;
|
||||
|
||||
emit(ADD(dst_reg(this->destination_indices),
|
||||
this->destination_indices,
|
||||
this->svbi));
|
||||
}
|
||||
emit(BRW_OPCODE_ENDIF);
|
||||
|
||||
/* Write transform feedback data for all processed vertices. */
|
||||
for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) {
|
||||
emit(MOV(dst_reg(sol_temp), brw_imm_d(i)));
|
||||
emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
|
||||
BRW_CONDITIONAL_L));
|
||||
emit(IF(BRW_PREDICATE_NORMAL));
|
||||
{
|
||||
xfb_program(i, num_verts);
|
||||
}
|
||||
emit(BRW_OPCODE_ENDIF);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
gfx6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
|
||||
{
|
||||
unsigned binding;
|
||||
unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
|
||||
src_reg sol_temp(this, glsl_uvec4_type());
|
||||
|
||||
/* Check for buffer overflow: we need room to write the complete primitive
|
||||
* (all vertices). Otherwise, avoid writing any vertices for it
|
||||
*/
|
||||
emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u)));
|
||||
emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
|
||||
emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
|
||||
emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
|
||||
emit(IF(BRW_PREDICATE_NORMAL));
|
||||
{
|
||||
/* Avoid overwriting MRF 1 as it is used as URB write message header */
|
||||
dst_reg mrf_reg(MRF, 2);
|
||||
|
||||
this->current_annotation = "gfx6: emit SOL vertex data";
|
||||
/* For each vertex, generate code to output each varying using the
|
||||
* appropriate binding table entry.
|
||||
*/
|
||||
for (binding = 0; binding < num_bindings; ++binding) {
|
||||
unsigned char varying =
|
||||
gs_prog_data->transform_feedback_bindings[binding];
|
||||
|
||||
/* Set up the correct destination index for this vertex */
|
||||
vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
|
||||
mrf_reg,
|
||||
this->destination_indices);
|
||||
inst->sol_vertex = vertex % num_verts;
|
||||
|
||||
/* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
|
||||
*
|
||||
* "Prior to End of Thread with a URB_WRITE, the kernel must
|
||||
* ensure that all writes are complete by sending the final
|
||||
* write as a committed write."
|
||||
*/
|
||||
bool final_write = binding == (unsigned) num_bindings - 1 &&
|
||||
inst->sol_vertex == num_verts - 1;
|
||||
|
||||
/* Compute offset of this varying for the current vertex
|
||||
* in vertex_output
|
||||
*/
|
||||
this->current_annotation = output_reg_annotation[varying];
|
||||
src_reg data(this->vertex_output);
|
||||
data.reladdr = ralloc(mem_ctx, src_reg);
|
||||
int offset = get_vertex_output_offset_for_varying(vertex, varying);
|
||||
emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset)));
|
||||
memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
|
||||
data.type = output_reg[varying][0].type;
|
||||
data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];
|
||||
|
||||
/* Write data */
|
||||
inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
|
||||
inst->sol_binding = binding;
|
||||
inst->sol_final_write = final_write;
|
||||
|
||||
if (final_write) {
|
||||
/* This is the last vertex of the primitive, then increment
|
||||
* SO num primitive counter and destination indices.
|
||||
*/
|
||||
emit(ADD(dst_reg(this->destination_indices),
|
||||
this->destination_indices,
|
||||
brw_imm_ud(num_verts)));
|
||||
emit(ADD(dst_reg(this->sol_prim_written),
|
||||
this->sol_prim_written, brw_imm_ud(1u)));
|
||||
}
|
||||
|
||||
}
|
||||
this->current_annotation = NULL;
|
||||
}
|
||||
emit(BRW_OPCODE_ENDIF);
|
||||
}
|
||||
|
||||
int
|
||||
gfx6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
|
||||
{
|
||||
/* Find the output slot assigned to this varying.
|
||||
*
|
||||
* VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
|
||||
* as VARYING_SLOT_PSIZ.
|
||||
*/
|
||||
if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
|
||||
varying = VARYING_SLOT_PSIZ;
|
||||
int slot = prog_data->vue_map.varying_to_slot[varying];
|
||||
|
||||
if (slot < 0) {
|
||||
/* This varying does not exist in the VUE so we are not writing to it
|
||||
* and its value is undefined. We still want to return a valid offset
|
||||
* into vertex_output though, to prevent any out-of-bound accesses into
|
||||
* the vertex_output array. Since the value for this varying is undefined
|
||||
* we don't really care for the value we assign to it, so any offset
|
||||
* within the limits of vertex_output will do.
|
||||
*/
|
||||
slot = 0;
|
||||
}
|
||||
|
||||
return vertex * (prog_data->vue_map.num_slots + 1) + slot;
|
||||
}
|
||||
|
||||
} /* namespace brw */
|
||||
|
|
@ -1,84 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2014 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
#ifndef GFX6_GS_VISITOR_H
|
||||
#define GFX6_GS_VISITOR_H
|
||||
|
||||
#include "brw_vec4.h"
|
||||
#include "brw_vec4_gs_visitor.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
|
||||
namespace brw {
|
||||
|
||||
class gfx6_gs_visitor : public vec4_gs_visitor
|
||||
{
|
||||
public:
|
||||
gfx6_gs_visitor(const struct brw_compiler *comp,
|
||||
const struct brw_compile_params *params,
|
||||
struct brw_gs_compile *c,
|
||||
struct brw_gs_prog_data *prog_data,
|
||||
const nir_shader *shader,
|
||||
bool no_spills,
|
||||
bool debug_enabled) :
|
||||
vec4_gs_visitor(comp, params, c, prog_data, shader, no_spills, debug_enabled)
|
||||
{
|
||||
}
|
||||
|
||||
protected:
|
||||
virtual void emit_prolog();
|
||||
virtual void emit_thread_end();
|
||||
virtual void gs_emit_vertex(int stream_id);
|
||||
virtual void gs_end_primitive();
|
||||
virtual void emit_urb_write_header(int mrf);
|
||||
virtual void setup_payload();
|
||||
|
||||
private:
|
||||
void xfb_write();
|
||||
void xfb_program(unsigned vertex, unsigned num_verts);
|
||||
int get_vertex_output_offset_for_varying(int vertex, int varying);
|
||||
void emit_snb_gs_urb_write_opcode(bool complete,
|
||||
int base_mrf,
|
||||
int last_mrf,
|
||||
int urb_offset);
|
||||
|
||||
src_reg vertex_output;
|
||||
src_reg vertex_output_offset;
|
||||
src_reg temp;
|
||||
src_reg first_vertex;
|
||||
src_reg prim_count;
|
||||
src_reg primitive_id;
|
||||
|
||||
/* Transform Feedback members */
|
||||
src_reg sol_prim_written;
|
||||
src_reg svbi;
|
||||
src_reg max_svbi;
|
||||
src_reg destination_indices;
|
||||
};
|
||||
|
||||
} /* namespace brw */
|
||||
|
||||
#endif /* __cplusplus */
|
||||
|
||||
#endif /* GFX6_GS_VISITOR_H */
|
||||
|
|
@ -105,7 +105,6 @@ libintel_compiler_brw_files = files(
|
|||
'brw_ir_fs.h',
|
||||
'brw_ir_performance.h',
|
||||
'brw_ir_performance.cpp',
|
||||
'brw_ir_vec4.h',
|
||||
'brw_isa_info.h',
|
||||
'brw_lower_logical_sends.cpp',
|
||||
'brw_mesh.cpp',
|
||||
|
|
@ -137,33 +136,7 @@ libintel_compiler_brw_files = files(
|
|||
'brw_shader.cpp',
|
||||
'brw_shader.h',
|
||||
'brw_simd_selection.cpp',
|
||||
'brw_vec4_builder.h',
|
||||
'brw_vec4_cmod_propagation.cpp',
|
||||
'brw_vec4_copy_propagation.cpp',
|
||||
'brw_vec4.cpp',
|
||||
'brw_vec4_cse.cpp',
|
||||
'brw_vec4_dead_code_eliminate.cpp',
|
||||
'brw_vec4_generator.cpp',
|
||||
'brw_vec4_gs_visitor.cpp',
|
||||
'brw_vec4_gs_visitor.h',
|
||||
'brw_vec4.h',
|
||||
'brw_vec4_live_variables.cpp',
|
||||
'brw_vec4_live_variables.h',
|
||||
'brw_vec4_nir.cpp',
|
||||
'brw_vec4_gs_nir.cpp',
|
||||
'brw_vec4_reg_allocate.cpp',
|
||||
'brw_vec4_surface_builder.cpp',
|
||||
'brw_vec4_surface_builder.h',
|
||||
'brw_vec4_tcs.cpp',
|
||||
'brw_vec4_tcs.h',
|
||||
'brw_vec4_tes.cpp',
|
||||
'brw_vec4_tes.h',
|
||||
'brw_vec4_visitor.cpp',
|
||||
'brw_vec4_vs_visitor.cpp',
|
||||
'brw_vec4_vs.h',
|
||||
'brw_vue_map.c',
|
||||
'gfx6_gs_visitor.cpp',
|
||||
'gfx6_gs_visitor.h',
|
||||
)
|
||||
|
||||
brw_device_sha1_gen_src = custom_target('brw_device_sha1_gen.c',
|
||||
|
|
@ -236,10 +209,6 @@ if with_tests
|
|||
'test_fs_saturate_propagation.cpp',
|
||||
'test_fs_scoreboard.cpp',
|
||||
'test_simd_selection.cpp',
|
||||
'test_vec4_cmod_propagation.cpp',
|
||||
'test_vec4_copy_propagation.cpp',
|
||||
'test_vec4_dead_code_eliminate.cpp',
|
||||
'test_vec4_register_coalesce.cpp',
|
||||
'test_vf_float_conversions.cpp',
|
||||
),
|
||||
ir_expression_operation_h,
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -1,195 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2014 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "brw_vec4.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
class copy_propagation_vec4_test : public ::testing::Test {
|
||||
virtual void SetUp();
|
||||
virtual void TearDown();
|
||||
|
||||
public:
|
||||
struct brw_compiler *compiler;
|
||||
struct brw_compile_params params;
|
||||
struct intel_device_info *devinfo;
|
||||
void *ctx;
|
||||
struct gl_shader_program *shader_prog;
|
||||
struct brw_vue_prog_data *prog_data;
|
||||
vec4_visitor *v;
|
||||
};
|
||||
|
||||
class copy_propagation_vec4_visitor : public vec4_visitor
|
||||
{
|
||||
public:
|
||||
copy_propagation_vec4_visitor(struct brw_compiler *compiler,
|
||||
struct brw_compile_params *params,
|
||||
nir_shader *shader,
|
||||
struct brw_vue_prog_data *prog_data)
|
||||
: vec4_visitor(compiler, params, NULL, prog_data, shader,
|
||||
false /* no_spills */, false)
|
||||
{
|
||||
prog_data->dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
|
||||
}
|
||||
|
||||
protected:
|
||||
virtual dst_reg *make_reg_for_system_value(int /* location */)
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual void setup_payload()
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual void emit_prolog()
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual void emit_thread_end()
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual void emit_urb_write_header(int /* mrf */)
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */)
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
void copy_propagation_vec4_test::SetUp()
|
||||
{
|
||||
ctx = ralloc_context(NULL);
|
||||
compiler = rzalloc(ctx, struct brw_compiler);
|
||||
devinfo = rzalloc(ctx, struct intel_device_info);
|
||||
compiler->devinfo = devinfo;
|
||||
|
||||
params = {};
|
||||
params.mem_ctx = ctx;
|
||||
|
||||
prog_data = ralloc(ctx, struct brw_vue_prog_data);
|
||||
nir_shader *shader =
|
||||
nir_shader_create(ctx, MESA_SHADER_VERTEX, NULL, NULL);
|
||||
|
||||
v = new copy_propagation_vec4_visitor(compiler, ¶ms, shader, prog_data);
|
||||
|
||||
devinfo->ver = 4;
|
||||
devinfo->verx10 = devinfo->ver * 10;
|
||||
}
|
||||
|
||||
void copy_propagation_vec4_test::TearDown()
|
||||
{
|
||||
delete v;
|
||||
v = NULL;
|
||||
|
||||
ralloc_free(ctx);
|
||||
ctx = NULL;
|
||||
}
|
||||
|
||||
|
||||
static void
|
||||
copy_propagation(vec4_visitor *v)
|
||||
{
|
||||
const bool print = getenv("TEST_DEBUG");
|
||||
|
||||
if (print) {
|
||||
fprintf(stderr, "instructions before:\n");
|
||||
v->dump_instructions();
|
||||
}
|
||||
|
||||
v->calculate_cfg();
|
||||
v->opt_copy_propagation();
|
||||
|
||||
if (print) {
|
||||
fprintf(stderr, "instructions after:\n");
|
||||
v->dump_instructions();
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(copy_propagation_vec4_test, test_swizzle_swizzle)
|
||||
{
|
||||
dst_reg a = dst_reg(v, glsl_vec4_type());
|
||||
dst_reg b = dst_reg(v, glsl_vec4_type());
|
||||
dst_reg c = dst_reg(v, glsl_vec4_type());
|
||||
|
||||
v->emit(v->ADD(a, src_reg(a), src_reg(a)));
|
||||
|
||||
v->emit(v->MOV(b, swizzle(src_reg(a), BRW_SWIZZLE4(BRW_SWIZZLE_Y,
|
||||
BRW_SWIZZLE_Z,
|
||||
BRW_SWIZZLE_W,
|
||||
BRW_SWIZZLE_X))));
|
||||
|
||||
vec4_instruction *test_mov =
|
||||
v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(BRW_SWIZZLE_Y,
|
||||
BRW_SWIZZLE_Z,
|
||||
BRW_SWIZZLE_W,
|
||||
BRW_SWIZZLE_X)));
|
||||
v->emit(test_mov);
|
||||
|
||||
copy_propagation(v);
|
||||
|
||||
EXPECT_EQ(test_mov->src[0].nr, a.nr);
|
||||
EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(BRW_SWIZZLE_Z,
|
||||
BRW_SWIZZLE_W,
|
||||
BRW_SWIZZLE_X,
|
||||
BRW_SWIZZLE_Y));
|
||||
}
|
||||
|
||||
TEST_F(copy_propagation_vec4_test, test_swizzle_writemask)
|
||||
{
|
||||
dst_reg a = dst_reg(v, glsl_vec4_type());
|
||||
dst_reg b = dst_reg(v, glsl_vec4_type());
|
||||
dst_reg c = dst_reg(v, glsl_vec4_type());
|
||||
|
||||
v->emit(v->MOV(b, swizzle(src_reg(a), BRW_SWIZZLE4(BRW_SWIZZLE_X,
|
||||
BRW_SWIZZLE_Y,
|
||||
BRW_SWIZZLE_X,
|
||||
BRW_SWIZZLE_Z))));
|
||||
|
||||
v->emit(v->MOV(writemask(a, WRITEMASK_XYZ), brw_imm_f(1.0f)));
|
||||
|
||||
vec4_instruction *test_mov =
|
||||
v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(BRW_SWIZZLE_W,
|
||||
BRW_SWIZZLE_W,
|
||||
BRW_SWIZZLE_W,
|
||||
BRW_SWIZZLE_W)));
|
||||
v->emit(test_mov);
|
||||
|
||||
copy_propagation(v);
|
||||
|
||||
/* should not copy propagate */
|
||||
EXPECT_EQ(test_mov->src[0].nr, b.nr);
|
||||
EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(BRW_SWIZZLE_W,
|
||||
BRW_SWIZZLE_W,
|
||||
BRW_SWIZZLE_W,
|
||||
BRW_SWIZZLE_W));
|
||||
}
|
||||
|
|
@ -1,178 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2018 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "brw_vec4.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
class dead_code_eliminate_vec4_test : public ::testing::Test {
|
||||
virtual void SetUp();
|
||||
virtual void TearDown();
|
||||
|
||||
public:
|
||||
struct brw_compiler *compiler;
|
||||
struct brw_compile_params params;
|
||||
struct intel_device_info *devinfo;
|
||||
void *ctx;
|
||||
struct gl_shader_program *shader_prog;
|
||||
struct brw_vue_prog_data *prog_data;
|
||||
vec4_visitor *v;
|
||||
};
|
||||
|
||||
class dead_code_eliminate_vec4_visitor : public vec4_visitor
|
||||
{
|
||||
public:
|
||||
dead_code_eliminate_vec4_visitor(struct brw_compiler *compiler,
|
||||
struct brw_compile_params *params,
|
||||
nir_shader *shader,
|
||||
struct brw_vue_prog_data *prog_data)
|
||||
: vec4_visitor(compiler, params, NULL, prog_data, shader,
|
||||
false /* no_spills */, false)
|
||||
{
|
||||
prog_data->dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
|
||||
}
|
||||
|
||||
protected:
|
||||
virtual dst_reg *make_reg_for_system_value(int /* location */)
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual void setup_payload()
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual void emit_prolog()
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual void emit_thread_end()
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual void emit_urb_write_header(int /* mrf */)
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */)
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
void dead_code_eliminate_vec4_test::SetUp()
|
||||
{
|
||||
ctx = ralloc_context(NULL);
|
||||
compiler = rzalloc(ctx, struct brw_compiler);
|
||||
devinfo = rzalloc(ctx, struct intel_device_info);
|
||||
compiler->devinfo = devinfo;
|
||||
|
||||
params = {};
|
||||
params.mem_ctx = ctx;
|
||||
|
||||
prog_data = ralloc(ctx, struct brw_vue_prog_data);
|
||||
nir_shader *shader =
|
||||
nir_shader_create(ctx, MESA_SHADER_VERTEX, NULL, NULL);
|
||||
|
||||
v = new dead_code_eliminate_vec4_visitor(compiler, ¶ms, shader, prog_data);
|
||||
|
||||
devinfo->ver = 4;
|
||||
devinfo->verx10 = devinfo->ver * 10;
|
||||
}
|
||||
|
||||
void dead_code_eliminate_vec4_test::TearDown()
|
||||
{
|
||||
delete v;
|
||||
v = NULL;
|
||||
|
||||
ralloc_free(ctx);
|
||||
ctx = NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
dead_code_eliminate(vec4_visitor *v)
|
||||
{
|
||||
const bool print = getenv("TEST_DEBUG");
|
||||
|
||||
if (print) {
|
||||
fprintf(stderr, "instructions before:\n");
|
||||
v->dump_instructions();
|
||||
}
|
||||
|
||||
v->calculate_cfg();
|
||||
v->dead_code_eliminate();
|
||||
|
||||
if (print) {
|
||||
fprintf(stderr, "instructions after:\n");
|
||||
v->dump_instructions();
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(dead_code_eliminate_vec4_test, some_dead_channels_all_flags_used)
|
||||
{
|
||||
const vec4_builder bld = vec4_builder(v).at_end();
|
||||
src_reg r1 = src_reg(v, glsl_vec4_type());
|
||||
src_reg r2 = src_reg(v, glsl_vec4_type());
|
||||
src_reg r3 = src_reg(v, glsl_vec4_type());
|
||||
src_reg r4 = src_reg(v, glsl_vec4_type());
|
||||
src_reg r5 = src_reg(v, glsl_vec4_type());
|
||||
src_reg r6 = src_reg(v, glsl_vec4_type());
|
||||
|
||||
/* Sequence like the following should not be modified by DCE.
|
||||
*
|
||||
* cmp.l.f0(8) g4<1>F g2<4,4,1>.wF g1<4,4,1>.xF
|
||||
* mov(8) g5<1>.xF g4<4,4,1>.xF
|
||||
* (+f0.x) sel(8) g6<1>UD g3<4>UD g6<4>UD
|
||||
*/
|
||||
vec4_instruction *test_cmp =
|
||||
bld.CMP(dst_reg(r4), r2, r1, BRW_CONDITIONAL_L);
|
||||
|
||||
test_cmp->src[0].swizzle = BRW_SWIZZLE_WWWW;
|
||||
test_cmp->src[1].swizzle = BRW_SWIZZLE_XXXX;
|
||||
|
||||
vec4_instruction *test_mov =
|
||||
bld.MOV(dst_reg(r5), r4);
|
||||
|
||||
test_mov->dst.writemask = WRITEMASK_X;
|
||||
test_mov->src[0].swizzle = BRW_SWIZZLE_XXXX;
|
||||
|
||||
vec4_instruction *test_sel =
|
||||
bld.SEL(dst_reg(r6), r3, r6);
|
||||
|
||||
set_predicate(BRW_PREDICATE_NORMAL, test_sel);
|
||||
|
||||
/* The scratch write is here just to make r5 and r6 be live so that the
|
||||
* whole program doesn't get eliminated by DCE.
|
||||
*/
|
||||
v->emit(v->SCRATCH_WRITE(dst_reg(r4), r6, r5));
|
||||
|
||||
dead_code_eliminate(v);
|
||||
|
||||
EXPECT_EQ(test_cmp->dst.writemask, WRITEMASK_XYZW);
|
||||
}
|
||||
|
|
@ -1,256 +0,0 @@
|
|||
/*
|
||||
* Copyright © 2012 Intel Corporation
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a
|
||||
* copy of this software and associated documentation files (the "Software"),
|
||||
* to deal in the Software without restriction, including without limitation
|
||||
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
* and/or sell copies of the Software, and to permit persons to whom the
|
||||
* Software is furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice (including the next
|
||||
* paragraph) shall be included in all copies or substantial portions of the
|
||||
* Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
|
||||
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
|
||||
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
|
||||
* IN THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include "brw_vec4.h"
|
||||
|
||||
using namespace brw;
|
||||
|
||||
#define register_coalesce(v) _register_coalesce(v, __func__)
|
||||
|
||||
class register_coalesce_vec4_test : public ::testing::Test {
|
||||
virtual void SetUp();
|
||||
virtual void TearDown();
|
||||
|
||||
public:
|
||||
struct brw_compiler *compiler;
|
||||
struct brw_compile_params params;
|
||||
struct intel_device_info *devinfo;
|
||||
void *ctx;
|
||||
struct gl_shader_program *shader_prog;
|
||||
struct brw_vue_prog_data *prog_data;
|
||||
vec4_visitor *v;
|
||||
};
|
||||
|
||||
|
||||
class register_coalesce_vec4_visitor : public vec4_visitor
|
||||
{
|
||||
public:
|
||||
register_coalesce_vec4_visitor(struct brw_compiler *compiler,
|
||||
struct brw_compile_params *params,
|
||||
nir_shader *shader,
|
||||
struct brw_vue_prog_data *prog_data)
|
||||
: vec4_visitor(compiler, params, NULL, prog_data, shader,
|
||||
false /* no_spills */, false)
|
||||
{
|
||||
prog_data->dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
|
||||
}
|
||||
|
||||
protected:
|
||||
virtual dst_reg *make_reg_for_system_value(int /* location */)
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual void setup_payload()
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual void emit_prolog()
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual void emit_thread_end()
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual void emit_urb_write_header(int /* mrf */)
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
|
||||
virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */)
|
||||
{
|
||||
unreachable("Not reached");
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
void register_coalesce_vec4_test::SetUp()
|
||||
{
|
||||
ctx = ralloc_context(NULL);
|
||||
compiler = rzalloc(ctx, struct brw_compiler);
|
||||
devinfo = rzalloc(ctx, struct intel_device_info);
|
||||
compiler->devinfo = devinfo;
|
||||
|
||||
prog_data = ralloc(ctx, struct brw_vue_prog_data);
|
||||
|
||||
params = {};
|
||||
params.mem_ctx = ctx;
|
||||
|
||||
nir_shader *shader =
|
||||
nir_shader_create(ctx, MESA_SHADER_VERTEX, NULL, NULL);
|
||||
|
||||
v = new register_coalesce_vec4_visitor(compiler, ¶ms, shader, prog_data);
|
||||
|
||||
devinfo->ver = 4;
|
||||
devinfo->verx10 = devinfo->ver * 10;
|
||||
}
|
||||
|
||||
void register_coalesce_vec4_test::TearDown()
|
||||
{
|
||||
delete v;
|
||||
v = NULL;
|
||||
|
||||
ralloc_free(ctx);
|
||||
ctx = NULL;
|
||||
}
|
||||
|
||||
static void
|
||||
_register_coalesce(vec4_visitor *v, const char *func)
|
||||
{
|
||||
const bool print = getenv("TEST_DEBUG");
|
||||
|
||||
if (print) {
|
||||
printf("%s: instructions before:\n", func);
|
||||
v->dump_instructions();
|
||||
}
|
||||
|
||||
v->calculate_cfg();
|
||||
v->opt_register_coalesce();
|
||||
|
||||
if (print) {
|
||||
printf("%s: instructions after:\n", func);
|
||||
v->dump_instructions();
|
||||
}
|
||||
}
|
||||
|
||||
TEST_F(register_coalesce_vec4_test, test_compute_to_mrf)
|
||||
{
|
||||
src_reg something = src_reg(v, glsl_float_type());
|
||||
dst_reg temp = dst_reg(v, glsl_float_type());
|
||||
dst_reg init;
|
||||
|
||||
dst_reg m0 = dst_reg(MRF, 0);
|
||||
m0.writemask = WRITEMASK_X;
|
||||
m0.type = BRW_REGISTER_TYPE_F;
|
||||
|
||||
vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f)));
|
||||
v->emit(v->MOV(m0, src_reg(temp)));
|
||||
|
||||
register_coalesce(v);
|
||||
|
||||
EXPECT_EQ(mul->dst.file, MRF);
|
||||
}
|
||||
|
||||
|
||||
TEST_F(register_coalesce_vec4_test, test_multiple_use)
|
||||
{
|
||||
src_reg something = src_reg(v, glsl_float_type());
|
||||
dst_reg temp = dst_reg(v, glsl_vec4_type());
|
||||
dst_reg init;
|
||||
|
||||
dst_reg m0 = dst_reg(MRF, 0);
|
||||
m0.writemask = WRITEMASK_X;
|
||||
m0.type = BRW_REGISTER_TYPE_F;
|
||||
|
||||
dst_reg m1 = dst_reg(MRF, 1);
|
||||
m1.writemask = WRITEMASK_XYZW;
|
||||
m1.type = BRW_REGISTER_TYPE_F;
|
||||
|
||||
src_reg src = src_reg(temp);
|
||||
vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f)));
|
||||
src.swizzle = BRW_SWIZZLE_XXXX;
|
||||
v->emit(v->MOV(m0, src));
|
||||
src.swizzle = BRW_SWIZZLE_XYZW;
|
||||
v->emit(v->MOV(m1, src));
|
||||
|
||||
register_coalesce(v);
|
||||
|
||||
EXPECT_NE(mul->dst.file, MRF);
|
||||
}
|
||||
|
||||
TEST_F(register_coalesce_vec4_test, test_dp4_mrf)
|
||||
{
|
||||
src_reg some_src_1 = src_reg(v, glsl_vec4_type());
|
||||
src_reg some_src_2 = src_reg(v, glsl_vec4_type());
|
||||
dst_reg init;
|
||||
|
||||
dst_reg m0 = dst_reg(MRF, 0);
|
||||
m0.writemask = WRITEMASK_Y;
|
||||
m0.type = BRW_REGISTER_TYPE_F;
|
||||
|
||||
dst_reg temp = dst_reg(v, glsl_float_type());
|
||||
|
||||
vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2));
|
||||
v->emit(v->MOV(m0, src_reg(temp)));
|
||||
|
||||
register_coalesce(v);
|
||||
|
||||
EXPECT_EQ(dp4->dst.file, MRF);
|
||||
EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y);
|
||||
}
|
||||
|
||||
TEST_F(register_coalesce_vec4_test, test_dp4_grf)
|
||||
{
|
||||
src_reg some_src_1 = src_reg(v, glsl_vec4_type());
|
||||
src_reg some_src_2 = src_reg(v, glsl_vec4_type());
|
||||
dst_reg init;
|
||||
|
||||
dst_reg to = dst_reg(v, glsl_vec4_type());
|
||||
dst_reg temp = dst_reg(v, glsl_float_type());
|
||||
|
||||
vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2));
|
||||
to.writemask = WRITEMASK_Y;
|
||||
v->emit(v->MOV(to, src_reg(temp)));
|
||||
|
||||
/* if we don't do something with the result, the automatic dead code
|
||||
* elimination will remove all our instructions.
|
||||
*/
|
||||
src_reg src = src_reg(to);
|
||||
src.negate = true;
|
||||
v->emit(v->MOV(dst_reg(MRF, 0), src));
|
||||
|
||||
register_coalesce(v);
|
||||
|
||||
EXPECT_EQ(dp4->dst.nr, to.nr);
|
||||
EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y);
|
||||
}
|
||||
|
||||
TEST_F(register_coalesce_vec4_test, test_channel_mul_grf)
|
||||
{
|
||||
src_reg some_src_1 = src_reg(v, glsl_vec4_type());
|
||||
src_reg some_src_2 = src_reg(v, glsl_vec4_type());
|
||||
dst_reg init;
|
||||
|
||||
dst_reg to = dst_reg(v, glsl_vec4_type());
|
||||
dst_reg temp = dst_reg(v, glsl_float_type());
|
||||
|
||||
vec4_instruction *mul = v->emit(v->MUL(temp, some_src_1, some_src_2));
|
||||
to.writemask = WRITEMASK_Y;
|
||||
v->emit(v->MOV(to, src_reg(temp)));
|
||||
|
||||
/* if we don't do something with the result, the automatic dead code
|
||||
* elimination will remove all our instructions.
|
||||
*/
|
||||
src_reg src = src_reg(to);
|
||||
src.negate = true;
|
||||
v->emit(v->MOV(dst_reg(MRF, 0), src));
|
||||
|
||||
register_coalesce(v);
|
||||
|
||||
EXPECT_EQ(mul->dst.nr, to.nr);
|
||||
}
|
||||
Loading…
Add table
Reference in a new issue