intel/brw: Remove vec4 backend

It still exists as part of ELK for older gfx versions.

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27691>
Caio Oliveira authored 2024-02-14 22:57:40 -08:00; committed by Marge Bot
parent 7c23b90537
commit a641aa294e
39 changed files with 0 additions and 17138 deletions


@@ -87,8 +87,6 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo)
brw_init_isa_info(&compiler->isa, devinfo);
brw_fs_alloc_reg_sets(compiler);
if (devinfo->ver < 8)
brw_vec4_alloc_reg_set(compiler);
compiler->precise_trig = debug_get_bool_option("INTEL_PRECISE_TRIG", false);


@@ -57,16 +57,6 @@ struct brw_compiler {
struct brw_isa_info isa;
struct {
struct ra_regs *regs;
/**
* Array of the ra classes for the unaligned contiguous register
* block sizes used.
*/
struct ra_class **classes;
} vec4_reg_set;
struct {
struct ra_regs *regs;


@@ -33,7 +33,6 @@
#include "brw_fs_builder.h"
#include "brw_fs_live_variables.h"
#include "brw_nir.h"
#include "brw_vec4_gs_visitor.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "brw_private.h"


@@ -23,7 +23,6 @@
#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_vec4.h"
#include "brw_cfg.h"
using namespace brw;
@@ -152,29 +151,6 @@ namespace {
rcount = inst->opcode == BRW_OPCODE_DPAS ? inst->rcount : 0;
}
instruction_info(const struct brw_isa_info *isa,
const vec4_instruction *inst) :
isa(isa), devinfo(isa->devinfo), op(inst->opcode),
td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)),
tx(get_exec_type(inst)), sx(0), ss(0), sc(0),
desc(inst->desc), sfid(inst->sfid), rcount(0)
{
/* Compute the maximum source size. */
for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++)
ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE));
/* Convert the execution size to GRF units. */
sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE);
/* 32x32 integer multiplication has half the usual ALU throughput.
* Treat it as double-precision.
*/
if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) &&
!brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 &&
type_sz(inst->src[0].type) == type_sz(inst->src[1].type))
tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D);
}
/** ISA encoding information */
const struct brw_isa_info *isa;
/** Device information. */
@@ -1505,102 +1481,6 @@ namespace {
}
}
/**
* Model the performance behavior of a VEC4 back-end instruction.
*/
void
issue_vec4_instruction(state &st, const struct brw_isa_info *isa,
const backend_instruction *be_inst)
{
const struct intel_device_info *devinfo = isa->devinfo;
const vec4_instruction *inst =
static_cast<const vec4_instruction *>(be_inst);
const instruction_info info(isa, inst);
const perf_desc perf = instruction_desc(info);
/* Stall on any source dependencies. */
for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
for (unsigned j = 0; j < regs_read(inst, i); j++)
stall_on_dependency(
st, reg_dependency_id(devinfo, inst->src[i], j));
}
if (inst->reads_accumulator_implicitly()) {
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
j <= accum_reg_of_channel(devinfo, inst, info.tx,
inst->exec_size - 1); j++)
stall_on_dependency(
st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
}
if (inst->base_mrf != -1) {
for (unsigned j = 0; j < inst->mlen; j++)
stall_on_dependency(
st, reg_dependency_id(
devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
}
if (inst->reads_flag())
stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
/* Stall on any write dependencies. */
if (!inst->no_dd_check) {
if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
for (unsigned j = 0; j < regs_written(inst); j++)
stall_on_dependency(
st, reg_dependency_id(devinfo, inst->dst, j));
}
if (inst->writes_accumulator_implicitly(devinfo)) {
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
j <= accum_reg_of_channel(devinfo, inst, info.tx,
inst->exec_size - 1); j++)
stall_on_dependency(
st, reg_dependency_id(devinfo, brw_acc_reg(8), j));
}
if (inst->writes_flag(devinfo))
stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0);
}
/* Execute the instruction. */
execute_instruction(st, perf);
/* Mark any source dependencies. */
if (inst->is_send_from_grf()) {
for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) {
for (unsigned j = 0; j < regs_read(inst, i); j++)
mark_read_dependency(
st, perf, reg_dependency_id(devinfo, inst->src[i], j));
}
}
if (inst->base_mrf != -1) {
for (unsigned j = 0; j < inst->mlen; j++)
mark_read_dependency(st, perf,
reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j));
}
/* Mark any destination dependencies. */
if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) {
for (unsigned j = 0; j < regs_written(inst); j++) {
mark_write_dependency(st, perf,
reg_dependency_id(devinfo, inst->dst, j));
}
}
if (inst->writes_accumulator_implicitly(devinfo)) {
for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0);
j <= accum_reg_of_channel(devinfo, inst, info.tx,
inst->exec_size - 1); j++)
mark_write_dependency(st, perf,
reg_dependency_id(devinfo, brw_acc_reg(8), j));
}
if (inst->writes_flag(devinfo))
mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0);
}
/**
* Calculate the maximum possible throughput of the program compatible with
* the cycle-count utilization estimated for each asynchronous unit, in
@@ -1692,12 +1572,6 @@ brw::performance::performance(const fs_visitor *v) :
calculate_performance(*this, v, issue_fs_inst, v->dispatch_width);
}
brw::performance::performance(const vec4_visitor *v) :
block_latency(new unsigned[v->cfg->num_blocks])
{
calculate_performance(*this, v, issue_vec4_instruction, 8);
}
brw::performance::~performance()
{
delete[] block_latency;


@@ -28,15 +28,12 @@
class fs_visitor;
namespace brw {
class vec4_visitor;
/**
* Various estimates of the performance of a shader based on static
* analysis.
*/
struct performance {
performance(const fs_visitor *v);
performance(const vec4_visitor *v);
~performance();
analysis_dependency_class


@@ -1,475 +0,0 @@
/* -*- c++ -*- */
/*
* Copyright © 2011-2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_IR_VEC4_H
#define BRW_IR_VEC4_H
#include "brw_shader.h"
namespace brw {
class dst_reg;
class src_reg : public backend_reg
{
public:
DECLARE_RALLOC_CXX_OPERATORS(src_reg)
void init();
src_reg(enum brw_reg_file file, int nr, const glsl_type *type);
src_reg();
src_reg(struct ::brw_reg reg);
bool equals(const src_reg &r) const;
bool negative_equals(const src_reg &r) const;
src_reg(class vec4_visitor *v, const struct glsl_type *type);
src_reg(class vec4_visitor *v, const struct glsl_type *type, int size);
explicit src_reg(const dst_reg &reg);
src_reg *reladdr;
};
static inline src_reg
retype(src_reg reg, enum brw_reg_type type)
{
reg.type = type;
return reg;
}
namespace detail {
static inline void
add_byte_offset(backend_reg *reg, unsigned bytes)
{
switch (reg->file) {
case BAD_FILE:
break;
case VGRF:
case ATTR:
case UNIFORM:
reg->offset += bytes;
assert(reg->offset % 16 == 0);
break;
case MRF: {
const unsigned suboffset = reg->offset + bytes;
reg->nr += suboffset / REG_SIZE;
reg->offset = suboffset % REG_SIZE;
assert(reg->offset % 16 == 0);
break;
}
case ARF:
case FIXED_GRF: {
const unsigned suboffset = reg->subnr + bytes;
reg->nr += suboffset / REG_SIZE;
reg->subnr = suboffset % REG_SIZE;
assert(reg->subnr % 16 == 0);
break;
}
default:
assert(bytes == 0);
}
}
} /* namespace detail */
static inline src_reg
byte_offset(src_reg reg, unsigned bytes)
{
detail::add_byte_offset(&reg, bytes);
return reg;
}
static inline src_reg
offset(src_reg reg, unsigned width, unsigned delta)
{
const unsigned stride = (reg.file == UNIFORM ? 0 : 4);
const unsigned num_components = MAX2(width / 4 * stride, 4);
return byte_offset(reg, num_components * type_sz(reg.type) * delta);
}
static inline src_reg
horiz_offset(src_reg reg, unsigned delta)
{
return byte_offset(reg, delta * type_sz(reg.type));
}
/**
* Reswizzle a given source register.
* \sa brw_swizzle().
*/
static inline src_reg
swizzle(src_reg reg, unsigned swizzle)
{
if (reg.file == IMM)
reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swizzle);
else
reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle);
return reg;
}
static inline src_reg
negate(src_reg reg)
{
assert(reg.file != IMM);
reg.negate = !reg.negate;
return reg;
}
static inline bool
is_uniform(const src_reg &reg)
{
return (reg.file == IMM || reg.file == UNIFORM || reg.is_null()) &&
(!reg.reladdr || is_uniform(*reg.reladdr));
}
class dst_reg : public backend_reg
{
public:
DECLARE_RALLOC_CXX_OPERATORS(dst_reg)
void init();
dst_reg();
dst_reg(enum brw_reg_file file, int nr);
dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
unsigned writemask);
dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
unsigned writemask);
dst_reg(struct ::brw_reg reg);
dst_reg(class vec4_visitor *v, const struct glsl_type *type);
explicit dst_reg(const src_reg &reg);
bool equals(const dst_reg &r) const;
src_reg *reladdr;
};
static inline dst_reg
retype(dst_reg reg, enum brw_reg_type type)
{
reg.type = type;
return reg;
}
static inline dst_reg
byte_offset(dst_reg reg, unsigned bytes)
{
detail::add_byte_offset(&reg, bytes);
return reg;
}
static inline dst_reg
offset(dst_reg reg, unsigned width, unsigned delta)
{
const unsigned stride = (reg.file == UNIFORM ? 0 : 4);
const unsigned num_components = MAX2(width / 4 * stride, 4);
return byte_offset(reg, num_components * type_sz(reg.type) * delta);
}
static inline dst_reg
horiz_offset(const dst_reg &reg, unsigned delta)
{
if (is_uniform(src_reg(reg)))
return reg;
else
return byte_offset(reg, delta * type_sz(reg.type));
}
static inline dst_reg
writemask(dst_reg reg, unsigned mask)
{
assert(reg.file != IMM);
assert((reg.writemask & mask) != 0);
reg.writemask &= mask;
return reg;
}
/**
* Return an integer identifying the discrete address space a register is
* contained in. A register is by definition fully contained in the single
* reg_space it belongs to, so two registers with different reg_space ids are
* guaranteed not to overlap. Most register files are a single reg_space of
* its own, only the VGRF file is composed of multiple discrete address
* spaces, one for each VGRF allocation.
*/
static inline uint32_t
reg_space(const backend_reg &r)
{
return r.file << 16 | (r.file == VGRF ? r.nr : 0);
}
/**
* Return the base offset in bytes of a register relative to the start of its
* reg_space().
*/
static inline unsigned
reg_offset(const backend_reg &r)
{
return (r.file == VGRF || r.file == IMM ? 0 : r.nr) *
(r.file == UNIFORM ? 16 : REG_SIZE) + r.offset +
(r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
}
/**
* Return whether the register region starting at \p r and spanning \p dr
* bytes could potentially overlap the register region starting at \p s and
* spanning \p ds bytes.
*/
static inline bool
regions_overlap(const backend_reg &r, unsigned dr,
const backend_reg &s, unsigned ds)
{
if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
/* COMPR4 regions are translated by the hardware during decompression
* into two separate half-regions 4 MRFs apart from each other.
*/
backend_reg t0 = r;
t0.nr &= ~BRW_MRF_COMPR4;
backend_reg t1 = t0;
t1.offset += 4 * REG_SIZE;
return regions_overlap(t0, dr / 2, s, ds) ||
regions_overlap(t1, dr / 2, s, ds);
} else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
return regions_overlap(s, ds, r, dr);
} else {
return reg_space(r) == reg_space(s) &&
!(reg_offset(r) + dr <= reg_offset(s) ||
reg_offset(s) + ds <= reg_offset(r));
}
}
class vec4_instruction : public backend_instruction {
public:
DECLARE_RALLOC_CXX_OPERATORS(vec4_instruction)
vec4_instruction(enum opcode opcode,
const dst_reg &dst = dst_reg(),
const src_reg &src0 = src_reg(),
const src_reg &src1 = src_reg(),
const src_reg &src2 = src_reg());
dst_reg dst;
src_reg src[3];
enum brw_urb_write_flags urb_write_flags;
unsigned sol_binding; /**< gfx6: SOL binding table index */
bool sol_final_write; /**< gfx6: send commit message */
unsigned sol_vertex; /**< gfx6: used for setting dst index in SVB header */
bool is_send_from_grf() const;
unsigned size_read(unsigned arg) const;
bool can_reswizzle(const struct intel_device_info *devinfo,
int dst_writemask,
int swizzle, int swizzle_mask);
void reswizzle(int dst_writemask, int swizzle);
bool can_do_source_mods(const struct intel_device_info *devinfo);
bool can_do_cmod();
bool can_do_writemask(const struct intel_device_info *devinfo);
bool can_change_types() const;
bool has_source_and_destination_hazard() const;
unsigned implied_mrf_writes() const;
bool is_align1_partial_write()
{
return opcode == VEC4_OPCODE_SET_LOW_32BIT ||
opcode == VEC4_OPCODE_SET_HIGH_32BIT;
}
bool reads_flag() const
{
return predicate || opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2;
}
bool reads_flag(unsigned c)
{
if (opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2)
return true;
switch (predicate) {
case BRW_PREDICATE_NONE:
return false;
case BRW_PREDICATE_ALIGN16_REPLICATE_X:
return c == 0;
case BRW_PREDICATE_ALIGN16_REPLICATE_Y:
return c == 1;
case BRW_PREDICATE_ALIGN16_REPLICATE_Z:
return c == 2;
case BRW_PREDICATE_ALIGN16_REPLICATE_W:
return c == 3;
default:
return true;
}
}
bool writes_flag(const intel_device_info *devinfo) const
{
return (conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) &&
opcode != BRW_OPCODE_CSEL &&
opcode != BRW_OPCODE_IF &&
opcode != BRW_OPCODE_WHILE));
}
bool reads_g0_implicitly() const
{
switch (opcode) {
case SHADER_OPCODE_TEX:
case SHADER_OPCODE_TXL:
case SHADER_OPCODE_TXD:
case SHADER_OPCODE_TXF:
case SHADER_OPCODE_TXF_CMS_W:
case SHADER_OPCODE_TXF_CMS:
case SHADER_OPCODE_TXF_MCS:
case SHADER_OPCODE_TXS:
case SHADER_OPCODE_TG4:
case SHADER_OPCODE_TG4_OFFSET:
case SHADER_OPCODE_SAMPLEINFO:
case VS_OPCODE_PULL_CONSTANT_LOAD:
case GS_OPCODE_SET_PRIMITIVE_ID:
case GS_OPCODE_GET_INSTANCE_ID:
case SHADER_OPCODE_GFX4_SCRATCH_READ:
case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
return true;
default:
return false;
}
}
};
/**
* Make the execution of \p inst dependent on the evaluation of a possibly
* inverted predicate.
*/
inline vec4_instruction *
set_predicate_inv(enum brw_predicate pred, bool inverse,
vec4_instruction *inst)
{
inst->predicate = pred;
inst->predicate_inverse = inverse;
return inst;
}
/**
* Make the execution of \p inst dependent on the evaluation of a predicate.
*/
inline vec4_instruction *
set_predicate(enum brw_predicate pred, vec4_instruction *inst)
{
return set_predicate_inv(pred, false, inst);
}
/**
* Write the result of evaluating the condition given by \p mod to a flag
* register.
*/
inline vec4_instruction *
set_condmod(enum brw_conditional_mod mod, vec4_instruction *inst)
{
inst->conditional_mod = mod;
return inst;
}
/**
* Clamp the result of \p inst to the saturation range of its destination
* datatype.
*/
inline vec4_instruction *
set_saturate(bool saturate, vec4_instruction *inst)
{
inst->saturate = saturate;
return inst;
}
/**
* Return the number of dataflow registers written by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->dst) /
* register_size)'. The somewhat arbitrary register size unit is 16B for the
* UNIFORM and IMM files and 32B for all other files.
*/
inline unsigned
regs_written(const vec4_instruction *inst)
{
assert(inst->dst.file != UNIFORM && inst->dst.file != IMM);
return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + inst->size_written,
REG_SIZE);
}
/**
* Return the number of dataflow registers read by the instruction (either
* fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
* register_size)'. The somewhat arbitrary register size unit is 16B for the
* UNIFORM and IMM files and 32B for all other files.
*/
inline unsigned
regs_read(const vec4_instruction *inst, unsigned i)
{
const unsigned reg_size =
inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 16 : REG_SIZE;
return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + inst->size_read(i),
reg_size);
}
static inline enum brw_reg_type
get_exec_type(const vec4_instruction *inst)
{
enum brw_reg_type exec_type = BRW_REGISTER_TYPE_B;
for (int i = 0; i < 3; i++) {
if (inst->src[i].file != BAD_FILE) {
const brw_reg_type t = get_exec_type(brw_reg_type(inst->src[i].type));
if (type_sz(t) > type_sz(exec_type))
exec_type = t;
else if (type_sz(t) == type_sz(exec_type) &&
brw_reg_type_is_floating_point(t))
exec_type = t;
}
}
if (exec_type == BRW_REGISTER_TYPE_B)
exec_type = inst->dst.type;
/* TODO: We need to handle half-float conversions. */
assert(exec_type != BRW_REGISTER_TYPE_HF ||
inst->dst.type == BRW_REGISTER_TYPE_HF);
assert(exec_type != BRW_REGISTER_TYPE_B);
return exec_type;
}
static inline unsigned
get_exec_type_size(const vec4_instruction *inst)
{
return type_sz(get_exec_type(inst));
}
} /* namespace brw */
#endif


@@ -28,7 +28,6 @@
#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_fs_live_variables.h"
#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_shader.h"
#include <new>
@@ -1027,25 +1026,6 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
return benefit;
}
class vec4_instruction_scheduler : public instruction_scheduler
{
public:
vec4_instruction_scheduler(void *mem_ctx, const vec4_visitor *v, int grf_count);
void calculate_deps();
schedule_node *choose_instruction_to_schedule();
const vec4_visitor *v;
void run();
};
vec4_instruction_scheduler::vec4_instruction_scheduler(void *mem_ctx, const vec4_visitor *v,
int grf_count)
: instruction_scheduler(mem_ctx, v, grf_count, /* grf_write_scale */ 1,
/* post_reg_alloc */ true),
v(v)
{
}
void
instruction_scheduler::set_current_block(bblock_t *block)
{
@@ -1534,179 +1514,6 @@ fs_instruction_scheduler::calculate_deps()
clear_last_grf_write();
}
void
vec4_instruction_scheduler::calculate_deps()
{
schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->ver)];
schedule_node *last_conditional_mod = NULL;
schedule_node *last_accumulator_write = NULL;
/* Fixed HW registers are assumed to be separate from the virtual
* GRFs, so they can be tracked separately. We don't really write
* to fixed GRFs much, so don't bother tracking them on a more
* granular level.
*/
schedule_node *last_fixed_grf_write = NULL;
memset(last_grf_write, 0, grf_count * sizeof(*last_grf_write));
memset(last_mrf_write, 0, sizeof(last_mrf_write));
/* top-to-bottom dependencies: RAW and WAW. */
for (schedule_node *n = current.start; n < current.end; n++) {
vec4_instruction *inst = (vec4_instruction *)n->inst;
if (is_scheduling_barrier(inst))
add_barrier_deps(n);
/* read-after-write deps. */
for (int i = 0; i < 3; i++) {
if (inst->src[i].file == VGRF) {
for (unsigned j = 0; j < regs_read(inst, i); ++j)
add_dep(last_grf_write[inst->src[i].nr + j], n);
} else if (inst->src[i].file == FIXED_GRF) {
add_dep(last_fixed_grf_write, n);
} else if (inst->src[i].is_accumulator()) {
assert(last_accumulator_write);
add_dep(last_accumulator_write, n);
} else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
add_barrier_deps(n);
}
}
if (inst->reads_g0_implicitly())
add_dep(last_fixed_grf_write, n);
if (!inst->is_send_from_grf()) {
for (int i = 0; i < inst->mlen; i++) {
/* It looks like the MRF regs are released in the send
* instruction once it's sent, not when the result comes
* back.
*/
add_dep(last_mrf_write[inst->base_mrf + i], n);
}
}
if (inst->reads_flag()) {
assert(last_conditional_mod);
add_dep(last_conditional_mod, n);
}
if (inst->reads_accumulator_implicitly()) {
assert(last_accumulator_write);
add_dep(last_accumulator_write, n);
}
/* write-after-write deps. */
if (inst->dst.file == VGRF) {
for (unsigned j = 0; j < regs_written(inst); ++j) {
add_dep(last_grf_write[inst->dst.nr + j], n);
last_grf_write[inst->dst.nr + j] = n;
}
} else if (inst->dst.file == MRF) {
add_dep(last_mrf_write[inst->dst.nr], n);
last_mrf_write[inst->dst.nr] = n;
} else if (inst->dst.file == FIXED_GRF) {
add_dep(last_fixed_grf_write, n);
last_fixed_grf_write = n;
} else if (inst->dst.is_accumulator()) {
add_dep(last_accumulator_write, n);
last_accumulator_write = n;
} else if (inst->dst.file == ARF && !inst->dst.is_null()) {
add_barrier_deps(n);
}
if (inst->mlen > 0 && !inst->is_send_from_grf()) {
for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
add_dep(last_mrf_write[inst->base_mrf + i], n);
last_mrf_write[inst->base_mrf + i] = n;
}
}
if (inst->writes_flag(v->devinfo)) {
add_dep(last_conditional_mod, n, 0);
last_conditional_mod = n;
}
if (inst->writes_accumulator_implicitly(v->devinfo) &&
!inst->dst.is_accumulator()) {
add_dep(last_accumulator_write, n);
last_accumulator_write = n;
}
}
/* bottom-to-top dependencies: WAR */
memset(last_grf_write, 0, grf_count * sizeof(*last_grf_write));
memset(last_mrf_write, 0, sizeof(last_mrf_write));
last_conditional_mod = NULL;
last_accumulator_write = NULL;
last_fixed_grf_write = NULL;
for (schedule_node *n = current.end - 1; n >= current.start; n--) {
vec4_instruction *inst = (vec4_instruction *)n->inst;
/* write-after-read deps. */
for (int i = 0; i < 3; i++) {
if (inst->src[i].file == VGRF) {
for (unsigned j = 0; j < regs_read(inst, i); ++j)
add_dep(n, last_grf_write[inst->src[i].nr + j]);
} else if (inst->src[i].file == FIXED_GRF) {
add_dep(n, last_fixed_grf_write);
} else if (inst->src[i].is_accumulator()) {
add_dep(n, last_accumulator_write);
} else if (inst->src[i].file == ARF && !inst->src[i].is_null()) {
add_barrier_deps(n);
}
}
if (!inst->is_send_from_grf()) {
for (int i = 0; i < inst->mlen; i++) {
/* It looks like the MRF regs are released in the send
* instruction once it's sent, not when the result comes
* back.
*/
add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
}
}
if (inst->reads_flag()) {
add_dep(n, last_conditional_mod);
}
if (inst->reads_accumulator_implicitly()) {
add_dep(n, last_accumulator_write);
}
/* Update the things this instruction wrote, so earlier reads
* can mark this as WAR dependency.
*/
if (inst->dst.file == VGRF) {
for (unsigned j = 0; j < regs_written(inst); ++j)
last_grf_write[inst->dst.nr + j] = n;
} else if (inst->dst.file == MRF) {
last_mrf_write[inst->dst.nr] = n;
} else if (inst->dst.file == FIXED_GRF) {
last_fixed_grf_write = n;
} else if (inst->dst.is_accumulator()) {
last_accumulator_write = n;
} else if (inst->dst.file == ARF && !inst->dst.is_null()) {
add_barrier_deps(n);
}
if (inst->mlen > 0 && !inst->is_send_from_grf()) {
for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) {
last_mrf_write[inst->base_mrf + i] = n;
}
}
if (inst->writes_flag(v->devinfo)) {
last_conditional_mod = n;
}
if (inst->writes_accumulator_implicitly(v->devinfo)) {
last_accumulator_write = n;
}
}
}
schedule_node *
fs_instruction_scheduler::choose_instruction_to_schedule()
{
@@ -1837,25 +1644,6 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
return chosen;
}
schedule_node *
vec4_instruction_scheduler::choose_instruction_to_schedule()
{
schedule_node *chosen = NULL;
int chosen_time = 0;
/* Of the instructions ready to execute or the closest to being ready,
* choose the oldest one.
*/
foreach_in_list(schedule_node, n, &current.available) {
if (!chosen || n->tmp.unblocked_time < chosen_time) {
chosen = n;
chosen_time = n->tmp.unblocked_time;
}
}
return chosen;
}
int
fs_instruction_scheduler::calculate_issue_time(backend_instruction *inst0)
{
@@ -2009,41 +1797,6 @@ fs_instruction_scheduler::run(instruction_scheduler_mode mode)
}
}
void
vec4_instruction_scheduler::run()
{
foreach_block(block, v->cfg) {
set_current_block(block);
for (schedule_node *n = current.start; n < current.end; n++) {
/* We always execute as two vec4s in parallel. */
n->issue_time = 2;
}
calculate_deps();
compute_delays();
compute_exits();
assert(current.available.is_empty());
for (schedule_node *n = current.start; n < current.end; n++) {
reset_node_tmp(n);
/* Add DAG heads to the list of available instructions. */
if (n->tmp.parent_count == 0)
current.available.push_tail(n);
}
current.block->instructions.make_empty();
while (!current.available.is_empty()) {
schedule_node *chosen = choose_instruction_to_schedule();
schedule(chosen);
update_children(chosen);
}
}
}
fs_instruction_scheduler *
fs_visitor::prepare_scheduler(void *mem_ctx)
{
@@ -2082,16 +1835,3 @@ fs_visitor::schedule_instructions_post_ra()
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}
void
vec4_visitor::opt_schedule_instructions()
{
void *mem_ctx = ralloc_context(NULL);
vec4_instruction_scheduler sched(mem_ctx, this, prog_data->total_grf);
sched.run();
ralloc_free(mem_ctx);
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
}


@@ -114,9 +114,6 @@ extern "C" {
/* brw_fs_reg_allocate.cpp */
void brw_fs_alloc_reg_sets(struct brw_compiler *compiler);
/* brw_vec4_reg_allocate.cpp */
void brw_vec4_alloc_reg_set(struct brw_compiler *compiler);
/* brw_disasm.c */
extern const char *const conditional_modifier[16];
extern const char *const pred_ctrl_align16[16];

File diff suppressed because it is too large


@@ -1,350 +0,0 @@
/*
* Copyright © 2011 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_VEC4_H
#define BRW_VEC4_H
#include "brw_shader.h"
#ifdef __cplusplus
#include "brw_ir_vec4.h"
#include "brw_ir_performance.h"
#include "brw_vec4_builder.h"
#include "brw_vec4_live_variables.h"
#endif
#include "compiler/glsl/ir.h"
#include "compiler/nir/nir.h"
#ifdef __cplusplus
extern "C" {
#endif
const unsigned *
brw_vec4_generate_assembly(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const nir_shader *nir,
struct brw_vue_prog_data *prog_data,
const struct cfg_t *cfg,
const brw::performance &perf,
bool debug_enabled);
#ifdef __cplusplus
} /* extern "C" */
namespace brw {
/**
* The vertex shader front-end.
*
* Translates either GLSL IR or Mesa IR (for ARB_vertex_program and
* fixed-function) into VS IR.
*/
class vec4_visitor : public backend_shader
{
public:
vec4_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const struct brw_sampler_prog_key_data *key,
struct brw_vue_prog_data *prog_data,
const nir_shader *shader,
bool no_spills,
bool debug_enabled);
dst_reg dst_null_f()
{
return dst_reg(brw_null_reg());
}
dst_reg dst_null_df()
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
}
dst_reg dst_null_d()
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
}
dst_reg dst_null_ud()
{
return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
}
const struct brw_sampler_prog_key_data * const key_tex;
struct brw_vue_prog_data * const prog_data;
char *fail_msg;
bool failed;
/**
* GLSL IR currently being processed, which is associated with our
* driver IR instructions for debugging purposes.
*/
const void *base_ir;
const char *current_annotation;
int first_non_payload_grf;
unsigned ubo_push_start[4];
unsigned push_length;
unsigned int max_grf;
brw_analysis<brw::vec4_live_variables, backend_shader> live_analysis;
brw_analysis<brw::performance, vec4_visitor> performance_analysis;
/* Regs for vertex results. Generated at ir_variable visiting time
* for the ir->location's used.
*/
dst_reg output_reg[VARYING_SLOT_TESS_MAX][4];
unsigned output_num_components[VARYING_SLOT_TESS_MAX][4];
const char *output_reg_annotation[VARYING_SLOT_TESS_MAX];
int uniforms;
bool run();
void fail(const char *msg, ...);
int setup_uniforms(int payload_reg);
bool reg_allocate_trivial();
bool reg_allocate();
void evaluate_spill_costs(float *spill_costs, bool *no_spill);
int choose_spill_reg(struct ra_graph *g);
void spill_reg(unsigned spill_reg);
void move_grf_array_access_to_scratch();
void split_uniform_registers();
void setup_push_ranges();
virtual void invalidate_analysis(brw::analysis_dependency_class c);
void split_virtual_grfs();
bool opt_vector_float();
bool opt_reduce_swizzle();
bool dead_code_eliminate();
bool opt_cmod_propagation();
bool opt_copy_propagation(bool do_constant_prop = true);
bool opt_cse_local(bblock_t *block, const vec4_live_variables &live);
bool opt_cse();
bool opt_algebraic();
bool opt_register_coalesce();
bool eliminate_find_live_channel();
bool is_dep_ctrl_unsafe(const vec4_instruction *inst);
void opt_set_dependency_control();
void opt_schedule_instructions();
void convert_to_hw_regs();
void fixup_3src_null_dest();
bool is_supported_64bit_region(vec4_instruction *inst, unsigned arg);
bool lower_simd_width();
bool scalarize_df();
bool lower_64bit_mad_to_mul_add();
void apply_logical_swizzle(struct brw_reg *hw_reg,
vec4_instruction *inst, int arg);
vec4_instruction *emit(vec4_instruction *inst);
vec4_instruction *emit(enum opcode opcode);
vec4_instruction *emit(enum opcode opcode, const dst_reg &dst);
vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
const src_reg &src0);
vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
const src_reg &src0, const src_reg &src1);
vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
const src_reg &src0, const src_reg &src1,
const src_reg &src2);
vec4_instruction *emit_before(bblock_t *block,
vec4_instruction *inst,
vec4_instruction *new_inst);
#define EMIT1(op) vec4_instruction *op(const dst_reg &, const src_reg &);
#define EMIT2(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &);
#define EMIT3(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &, const src_reg &);
EMIT1(MOV)
EMIT1(NOT)
EMIT1(RNDD)
EMIT1(RNDE)
EMIT1(RNDZ)
EMIT1(FRC)
EMIT1(F32TO16)
EMIT1(F16TO32)
EMIT2(ADD)
EMIT2(MUL)
EMIT2(MACH)
EMIT2(MAC)
EMIT2(AND)
EMIT2(OR)
EMIT2(XOR)
EMIT2(DP3)
EMIT2(DP4)
EMIT2(DPH)
EMIT2(SHL)
EMIT2(SHR)
EMIT2(ASR)
vec4_instruction *CMP(dst_reg dst, src_reg src0, src_reg src1,
enum brw_conditional_mod condition);
vec4_instruction *IF(src_reg src0, src_reg src1,
enum brw_conditional_mod condition);
vec4_instruction *IF(enum brw_predicate predicate);
EMIT1(SCRATCH_READ)
EMIT2(SCRATCH_WRITE)
EMIT3(LRP)
EMIT1(BFREV)
EMIT3(BFE)
EMIT2(BFI1)
EMIT3(BFI2)
EMIT1(FBH)
EMIT1(FBL)
EMIT1(CBIT)
EMIT1(LZD)
EMIT3(MAD)
EMIT2(ADDC)
EMIT2(SUBB)
EMIT1(DIM)
#undef EMIT1
#undef EMIT2
#undef EMIT3
vec4_instruction *emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
src_reg src0, src_reg src1);
/**
* Copy any live channel from \p src to the first channel of the
* result.
*/
src_reg emit_uniformize(const src_reg &src);
/** Fix all float operands of a 3-source instruction. */
void fix_float_operands(src_reg op[3], nir_alu_instr *instr);
src_reg fix_3src_operand(const src_reg &src);
vec4_instruction *emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
const src_reg &src1 = src_reg());
src_reg fix_math_operand(const src_reg &src);
void emit_pack_half_2x16(dst_reg dst, src_reg src0);
void emit_unpack_half_2x16(dst_reg dst, src_reg src0);
void emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0);
void emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0);
void emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0);
void emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0);
src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
src_reg surface);
void emit_ndc_computation();
void emit_psiz_and_flags(dst_reg reg);
vec4_instruction *emit_generic_urb_slot(dst_reg reg, int varying, int comp);
virtual void emit_urb_slot(dst_reg reg, int varying);
src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst,
src_reg *reladdr, int reg_offset);
void emit_scratch_read(bblock_t *block, vec4_instruction *inst,
dst_reg dst,
src_reg orig_src,
int base_offset);
void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
int base_offset);
void emit_pull_constant_load_reg(dst_reg dst,
src_reg surf_index,
src_reg offset,
bblock_t *before_block,
vec4_instruction *before_inst);
src_reg emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
vec4_instruction *inst, src_reg src);
void resolve_ud_negate(src_reg *reg);
void emit_shader_float_controls_execution_mode();
bool lower_minmax();
src_reg get_timestamp();
virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const;
bool optimize_predicate(nir_alu_instr *instr, enum brw_predicate *predicate);
void emit_conversion_from_double(dst_reg dst, src_reg src);
void emit_conversion_to_double(dst_reg dst, src_reg src);
vec4_instruction *shuffle_64bit_data(dst_reg dst, src_reg src,
bool for_write,
bool for_scratch = false,
bblock_t *block = NULL,
vec4_instruction *ref = NULL);
virtual void emit_nir_code();
virtual void nir_setup_uniforms();
virtual void nir_emit_impl(nir_function_impl *impl);
virtual void nir_emit_cf_list(exec_list *list);
virtual void nir_emit_if(nir_if *if_stmt);
virtual void nir_emit_loop(nir_loop *loop);
virtual void nir_emit_block(nir_block *block);
virtual void nir_emit_instr(nir_instr *instr);
virtual void nir_emit_load_const(nir_load_const_instr *instr);
src_reg get_nir_ssbo_intrinsic_index(nir_intrinsic_instr *instr);
virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
virtual void nir_emit_alu(nir_alu_instr *instr);
virtual void nir_emit_jump(nir_jump_instr *instr);
virtual void nir_emit_texture(nir_tex_instr *instr);
virtual void nir_emit_undef(nir_undef_instr *instr);
virtual void nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr);
dst_reg get_nir_def(const nir_def &def, enum brw_reg_type type);
dst_reg get_nir_def(const nir_def &def, nir_alu_type type);
dst_reg get_nir_def(const nir_def &def);
src_reg get_nir_src(const nir_src &src, enum brw_reg_type type,
unsigned num_components = 4);
src_reg get_nir_src(const nir_src &src, nir_alu_type type,
unsigned num_components = 4);
src_reg get_nir_src(const nir_src &src,
unsigned num_components = 4);
src_reg get_nir_src_imm(const nir_src &src);
src_reg get_indirect_offset(nir_intrinsic_instr *instr);
dst_reg *nir_ssa_values;
protected:
void emit_vertex();
void setup_payload_interference(struct ra_graph *g, int first_payload_node,
int reg_node_count);
virtual void setup_payload() = 0;
virtual void emit_prolog() = 0;
virtual void emit_thread_end() = 0;
virtual void emit_urb_write_header(int mrf) = 0;
virtual vec4_instruction *emit_urb_write_opcode(bool complete) = 0;
virtual void gs_emit_vertex(int stream_id);
virtual void gs_end_primitive();
private:
/**
* If true, then register allocation should fail instead of spilling.
*/
const bool no_spills;
unsigned last_scratch; /**< measured in 32-byte (register size) units */
};
} /* namespace brw */
#endif /* __cplusplus */
#endif /* BRW_VEC4_H */


@@ -1,646 +0,0 @@
/* -*- c++ -*- */
/*
* Copyright © 2010-2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_VEC4_BUILDER_H
#define BRW_VEC4_BUILDER_H
#include "brw_ir_vec4.h"
#include "brw_ir_allocator.h"
namespace brw {
/**
* Toolbox to assemble a VEC4 IR program out of individual instructions.
*
* This object is meant to have an interface consistent with
* brw::fs_builder. They cannot be fully interchangeable because
* brw::fs_builder generates scalar code while brw::vec4_builder generates
* vector code.
*/
class vec4_builder {
public:
/** Type used in this IR to represent a source of an instruction. */
typedef brw::src_reg src_reg;
/** Type used in this IR to represent the destination of an instruction. */
typedef brw::dst_reg dst_reg;
/** Type used in this IR to represent an instruction. */
typedef vec4_instruction instruction;
/**
* Construct a vec4_builder that inserts instructions into \p shader.
*/
vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) :
shader(shader), block(NULL), cursor(NULL),
_dispatch_width(dispatch_width), _group(0),
force_writemask_all(false),
annotation()
{
}
/**
* Construct a vec4_builder that inserts instructions into \p shader
* before instruction \p inst in basic block \p block. The default
* execution controls and debug annotation are initialized from the
* instruction passed as argument.
*/
vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
shader(shader), block(block), cursor(inst),
_dispatch_width(inst->exec_size), _group(inst->group),
force_writemask_all(inst->force_writemask_all)
{
annotation.str = inst->annotation;
annotation.ir = inst->ir;
}
/**
* Construct a vec4_builder that inserts instructions before \p cursor
* in basic block \p block, inheriting other code generation parameters
* from this.
*/
vec4_builder
at(bblock_t *block, exec_node *cursor) const
{
vec4_builder bld = *this;
bld.block = block;
bld.cursor = cursor;
return bld;
}
/**
* Construct a vec4_builder appending instructions at the end of the
* instruction list of the shader, inheriting other code generation
* parameters from this.
*/
vec4_builder
at_end() const
{
return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
}
/**
* Construct a builder specifying the default SIMD width and group of
* channel enable signals, inheriting other code generation parameters
* from this.
*
* \p n gives the default SIMD width, \p i gives the slot group used for
* predication and control flow masking in multiples of \p n channels.
*/
vec4_builder
group(unsigned n, unsigned i) const
{
assert(force_writemask_all ||
(n <= dispatch_width() && i < dispatch_width() / n));
vec4_builder bld = *this;
bld._dispatch_width = n;
bld._group += i * n;
return bld;
}
/**
* Construct a builder with per-channel control flow execution masking
* disabled if \p b is true. If control flow execution masking is
* already disabled this has no effect.
*/
vec4_builder
exec_all(bool b = true) const
{
vec4_builder bld = *this;
if (b)
bld.force_writemask_all = true;
return bld;
}
/**
* Construct a builder with the given debug annotation info.
*/
vec4_builder
annotate(const char *str, const void *ir = NULL) const
{
vec4_builder bld = *this;
bld.annotation.str = str;
bld.annotation.ir = ir;
return bld;
}
/**
* Get the SIMD width in use.
*/
unsigned
dispatch_width() const
{
return _dispatch_width;
}
/**
* Get the channel group in use.
*/
unsigned
group() const
{
return _group;
}
/**
* Allocate a virtual register of natural vector size (four for this IR)
* and SIMD width. \p n gives the amount of space to allocate in
* dispatch_width units (which is just enough space for four logical
* components in this IR).
*/
dst_reg
vgrf(enum brw_reg_type type, unsigned n = 1) const
{
assert(dispatch_width() <= 32);
if (n > 0)
return retype(dst_reg(VGRF, shader->alloc.allocate(
n * DIV_ROUND_UP(type_sz(type), 4))),
type);
else
return retype(null_reg_ud(), type);
}
/**
* Create a null register of floating type.
*/
dst_reg
null_reg_f() const
{
return dst_reg(retype(brw_null_vec(dispatch_width()),
BRW_REGISTER_TYPE_F));
}
/**
* Create a null register of signed integer type.
*/
dst_reg
null_reg_d() const
{
return dst_reg(retype(brw_null_vec(dispatch_width()),
BRW_REGISTER_TYPE_D));
}
/**
* Create a null register of unsigned integer type.
*/
dst_reg
null_reg_ud() const
{
return dst_reg(retype(brw_null_vec(dispatch_width()),
BRW_REGISTER_TYPE_UD));
}
/**
* Insert an instruction into the program.
*/
instruction *
emit(const instruction &inst) const
{
return emit(new(shader->mem_ctx) instruction(inst));
}
/**
* Create and insert a nullary control instruction into the program.
*/
instruction *
emit(enum opcode opcode) const
{
return emit(instruction(opcode));
}
/**
* Create and insert a nullary instruction into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst) const
{
return emit(instruction(opcode, dst));
}
/**
* Create and insert a unary instruction into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
{
switch (opcode) {
case SHADER_OPCODE_RCP:
case SHADER_OPCODE_RSQ:
case SHADER_OPCODE_SQRT:
case SHADER_OPCODE_EXP2:
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_SIN:
case SHADER_OPCODE_COS:
return fix_math_instruction(
emit(instruction(opcode, dst,
fix_math_operand(src0))));
default:
return emit(instruction(opcode, dst, src0));
}
}
/**
* Create and insert a binary instruction into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
const src_reg &src1) const
{
switch (opcode) {
case SHADER_OPCODE_POW:
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
return fix_math_instruction(
emit(instruction(opcode, dst,
fix_math_operand(src0),
fix_math_operand(src1))));
default:
return emit(instruction(opcode, dst, src0, src1));
}
}
/**
* Create and insert a ternary instruction into the program.
*/
instruction *
emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
const src_reg &src1, const src_reg &src2) const
{
switch (opcode) {
case BRW_OPCODE_BFE:
case BRW_OPCODE_BFI2:
case BRW_OPCODE_MAD:
case BRW_OPCODE_LRP:
return emit(instruction(opcode, dst,
fix_3src_operand(src0),
fix_3src_operand(src1),
fix_3src_operand(src2)));
default:
return emit(instruction(opcode, dst, src0, src1, src2));
}
}
/**
* Insert a preallocated instruction into the program.
*/
instruction *
emit(instruction *inst) const
{
inst->exec_size = dispatch_width();
inst->group = group();
inst->force_writemask_all = force_writemask_all;
inst->size_written = inst->exec_size * type_sz(inst->dst.type);
inst->annotation = annotation.str;
inst->ir = annotation.ir;
if (block)
static_cast<instruction *>(cursor)->insert_before(block, inst);
else
cursor->insert_before(inst);
return inst;
}
/**
* Select \p src0 if the comparison of both sources with the given
* conditional mod evaluates to true, otherwise select \p src1.
*
* Generally useful to get the minimum or maximum of two values.
*/
instruction *
emit_minmax(const dst_reg &dst, const src_reg &src0,
const src_reg &src1, brw_conditional_mod mod) const
{
assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
fix_unsigned_negate(src1)));
}
/**
* Copy any live channel from \p src to the first channel of the result.
*/
src_reg
emit_uniformize(const src_reg &src) const
{
const vec4_builder ubld = exec_all();
const dst_reg chan_index =
writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
const dst_reg dst = vgrf(src.type);
ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));
return src_reg(dst);
}
/**
* Assorted arithmetic ops.
* @{
*/
#define ALU1(op) \
instruction * \
op(const dst_reg &dst, const src_reg &src0) const \
{ \
return emit(BRW_OPCODE_##op, dst, src0); \
}
#define ALU2(op) \
instruction * \
op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
{ \
return emit(BRW_OPCODE_##op, dst, src0, src1); \
}
#define ALU2_ACC(op) \
instruction * \
op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
{ \
instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \
inst->writes_accumulator = true; \
return inst; \
}
#define ALU3(op) \
instruction * \
op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \
const src_reg &src2) const \
{ \
return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \
}
ALU2(ADD)
ALU2_ACC(ADDC)
ALU2(AND)
ALU2(ASR)
ALU2(AVG)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(BFREV)
ALU1(CBIT)
ALU3(CSEL)
ALU1(DIM)
ALU2(DP2)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU1(F16TO32)
ALU1(F32TO16)
ALU1(FBH)
ALU1(FBL)
ALU1(FRC)
ALU2(LINE)
ALU1(LZD)
ALU2(MAC)
ALU2_ACC(MACH)
ALU3(MAD)
ALU1(MOV)
ALU2(MUL)
ALU1(NOT)
ALU2(OR)
ALU2(PLN)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDU)
ALU1(RNDZ)
ALU2(SAD2)
ALU2_ACC(SADA2)
ALU2(SEL)
ALU2(SHL)
ALU2(SHR)
ALU2_ACC(SUBB)
ALU2(XOR)
#undef ALU3
#undef ALU2_ACC
#undef ALU2
#undef ALU1
/** @} */
/**
* CMP: Sets the low bit of the destination channels with the result
* of the comparison, while the upper bits are undefined, and updates
* the flag register with the packed 16 bits of the result.
*/
instruction *
CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
brw_conditional_mod condition) const
{
/* Take the instruction:
*
* CMP null<d> src0<f> src1<f>
*
* Original gfx4 does type conversion to the destination type
* before comparison, producing garbage results for floating
* point comparisons.
*
* The destination type doesn't matter on newer generations,
* so we set the type to match src0 so we can compact the
* instruction.
*/
return set_condmod(condition,
emit(BRW_OPCODE_CMP, retype(dst, src0.type),
fix_unsigned_negate(src0),
fix_unsigned_negate(src1)));
}
/**
* CMPN: Behaves like CMP, but produces true if src1 is NaN.
*/
instruction *
CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
brw_conditional_mod condition) const
{
/* Take the instruction:
*
* CMPN null<d> src0<f> src1<f>
*
* Original gfx4 does type conversion to the destination type
* before comparison, producing garbage results for floating
* point comparisons.
*
* The destination type doesn't matter on newer generations,
* so we set the type to match src0 so we can compact the
* instruction.
*/
return set_condmod(condition,
emit(BRW_OPCODE_CMPN, retype(dst, src0.type),
fix_unsigned_negate(src0),
fix_unsigned_negate(src1)));
}
/**
* Gfx4 predicated IF.
*/
instruction *
IF(brw_predicate predicate) const
{
return set_predicate(predicate, emit(BRW_OPCODE_IF));
}
/**
* Gfx6 IF with embedded comparison.
*/
instruction *
IF(const src_reg &src0, const src_reg &src1,
brw_conditional_mod condition) const
{
assert(shader->devinfo->ver == 6);
return set_condmod(condition,
emit(BRW_OPCODE_IF,
null_reg_d(),
fix_unsigned_negate(src0),
fix_unsigned_negate(src1)));
}
/**
* Emit a linear interpolation instruction.
*/
instruction *
LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
const src_reg &a) const
{
/* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
* we need to reorder the operands.
*/
assert(shader->devinfo->ver >= 6 && shader->devinfo->ver <= 9);
return emit(BRW_OPCODE_LRP, dst, a, y, x);
}
backend_shader *shader;
protected:
/**
* Workaround for negation of UD registers. See comment in
* fs_generator::generate_code() for the details.
*/
src_reg
fix_unsigned_negate(const src_reg &src) const
{
if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
MOV(temp, src);
return src_reg(temp);
} else {
return src;
}
}
/**
* Workaround for register access modes not supported by the ternary
* instruction encoding.
*/
src_reg
fix_3src_operand(const src_reg &src) const
{
/* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
* able to use vertical stride of zero to replicate the vec4 uniform, like
*
* g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
*
* But you can't, since vertical stride is always four in three-source
* instructions. Instead, insert a MOV instruction to do the replication so
* that the three-source instruction can consume it.
*/
/* The MOV is only needed if the source is a uniform or immediate. */
if (src.file != UNIFORM && src.file != IMM)
return src;
if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
return src;
const dst_reg expanded = vgrf(src.type);
emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
return src_reg(expanded);
}
/**
* Workaround for register access modes not supported by the math
* instruction.
*/
src_reg
fix_math_operand(const src_reg &src) const
{
/* The gfx6 math instruction ignores the source modifiers --
* swizzle, abs, negate, and at least some parts of the register
* region description.
*
* Rather than trying to enumerate all these cases, *always* expand the
* operand to a temp GRF for gfx6.
*
* For gfx7, keep the operand as-is, except if immediate, which gfx7 still
* can't use.
*/
if (shader->devinfo->ver == 6 ||
(shader->devinfo->ver == 7 && src.file == IMM)) {
const dst_reg tmp = vgrf(src.type);
MOV(tmp, src);
return src_reg(tmp);
} else {
return src;
}
}
/**
* Workaround other weirdness of the math instruction.
*/
instruction *
fix_math_instruction(instruction *inst) const
{
if (shader->devinfo->ver == 6 &&
inst->dst.writemask != WRITEMASK_XYZW) {
const dst_reg tmp = vgrf(inst->dst.type);
MOV(inst->dst, src_reg(tmp));
inst->dst = tmp;
} else if (shader->devinfo->ver < 6) {
const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
inst->base_mrf = 1;
inst->mlen = sources;
}
return inst;
}
bblock_t *block;
exec_node *cursor;
unsigned _dispatch_width;
unsigned _group;
bool force_writemask_all;
/** Debug annotation info. */
struct {
const char *str;
const void *ir;
} annotation;
};
}
#endif


@@ -1,365 +0,0 @@
/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
*/
/** @file brw_vec4_cmod_propagation.cpp
*
* Really similar to brw_fs_cmod_propagation but adapted to vec4 needs. Check
* brw_fs_cmod_propagation for further details on the rationale behind this
* optimization.
*/
#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_eu.h"
namespace brw {
static bool
writemasks_incompatible(const vec4_instruction *earlier,
const vec4_instruction *later)
{
return (earlier->dst.writemask != WRITEMASK_X &&
earlier->dst.writemask != WRITEMASK_XYZW) ||
(earlier->dst.writemask == WRITEMASK_XYZW &&
later->src[0].swizzle != BRW_SWIZZLE_XYZW) ||
(later->dst.writemask & ~earlier->dst.writemask) != 0;
}
static bool
opt_cmod_propagation_local(bblock_t *block, vec4_visitor *v)
{
bool progress = false;
UNUSED int ip = block->end_ip + 1;
foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) {
ip--;
if ((inst->opcode != BRW_OPCODE_AND &&
inst->opcode != BRW_OPCODE_CMP &&
inst->opcode != BRW_OPCODE_MOV) ||
inst->predicate != BRW_PREDICATE_NONE ||
!inst->dst.is_null() ||
(inst->src[0].file != VGRF && inst->src[0].file != ATTR &&
inst->src[0].file != UNIFORM))
continue;
/* An ABS source modifier can only be handled when processing a compare
* with a value other than zero.
*/
if (inst->src[0].abs &&
(inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero()))
continue;
if (inst->opcode == BRW_OPCODE_AND &&
!(inst->src[1].is_one() &&
inst->conditional_mod == BRW_CONDITIONAL_NZ &&
!inst->src[0].negate))
continue;
if (inst->opcode == BRW_OPCODE_MOV &&
inst->conditional_mod != BRW_CONDITIONAL_NZ)
continue;
bool read_flag = false;
foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) {
/* A CMP with a second source of zero can match with anything. A CMP
* with a second source that is not zero can only match with an ADD
* instruction.
*/
if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) {
bool negate;
if (scan_inst->opcode != BRW_OPCODE_ADD)
goto not_match;
if (writemasks_incompatible(scan_inst, inst))
goto not_match;
/* A CMP is basically a subtraction. The result of the
* subtraction must be the same as the result of the addition.
* This means that one of the operands must be negated. So (a +
* b) vs (a == -b) or (a + -b) vs (a == b).
*/
if ((inst->src[0].equals(scan_inst->src[0]) &&
inst->src[1].negative_equals(scan_inst->src[1])) ||
(inst->src[0].equals(scan_inst->src[1]) &&
inst->src[1].negative_equals(scan_inst->src[0]))) {
negate = false;
} else if ((inst->src[0].negative_equals(scan_inst->src[0]) &&
inst->src[1].equals(scan_inst->src[1])) ||
(inst->src[0].negative_equals(scan_inst->src[1]) &&
inst->src[1].equals(scan_inst->src[0]))) {
negate = true;
} else {
goto not_match;
}
if (scan_inst->exec_size != inst->exec_size ||
scan_inst->group != inst->group)
goto not_match;
/* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
*
* * Note that the [post condition signal] bits generated at
* the output of a compute are before the .sat.
*
* So we don't have to bail if scan_inst has saturate.
*/
/* Otherwise, try propagating the conditional. */
const enum brw_conditional_mod cond =
negate ? brw_swap_cmod(inst->conditional_mod)
: inst->conditional_mod;
if (scan_inst->can_do_cmod() &&
((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
scan_inst->conditional_mod == cond)) {
scan_inst->conditional_mod = cond;
inst->remove(block);
progress = true;
}
break;
}
if (regions_overlap(inst->src[0], inst->size_read(0),
scan_inst->dst, scan_inst->size_written)) {
if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) ||
scan_inst->dst.offset != inst->src[0].offset ||
scan_inst->exec_size != inst->exec_size ||
scan_inst->group != inst->group) {
break;
}
/* If scan_inst is a CMP that produces a single value and inst is
* a CMP.NZ that consumes only that value, remove inst.
*/
if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
(inst->src[0].type == BRW_REGISTER_TYPE_D ||
inst->src[0].type == BRW_REGISTER_TYPE_UD) &&
(inst->opcode == BRW_OPCODE_CMP ||
inst->opcode == BRW_OPCODE_MOV) &&
scan_inst->opcode == BRW_OPCODE_CMP &&
((inst->src[0].swizzle == BRW_SWIZZLE_XXXX &&
scan_inst->dst.writemask == WRITEMASK_X) ||
(inst->src[0].swizzle == BRW_SWIZZLE_YYYY &&
scan_inst->dst.writemask == WRITEMASK_Y) ||
(inst->src[0].swizzle == BRW_SWIZZLE_ZZZZ &&
scan_inst->dst.writemask == WRITEMASK_Z) ||
(inst->src[0].swizzle == BRW_SWIZZLE_WWWW &&
scan_inst->dst.writemask == WRITEMASK_W))) {
if (inst->dst.writemask != scan_inst->dst.writemask) {
src_reg temp(v, glsl_vec4_type(), 1);
/* Given a sequence like:
*
* cmp.ge.f0(8) g21<1>.zF g20<4>.xF g18<4>.xF
* ...
* cmp.nz.f0(8) null<1>D g21<4>.zD 0D
*
* Replace it with something like:
*
* cmp.ge.f0(8) g22<1>.zF g20<4>.xF g18<4>.xF
* mov(8) g21<1>.xF g22<1>.zzzzF
*
* The added MOV will most likely be removed later. In the
* worst case, it should be cheaper to schedule.
*/
temp.swizzle = brw_swizzle_for_mask(inst->dst.writemask);
temp.type = scan_inst->src[0].type;
vec4_instruction *mov = v->MOV(scan_inst->dst, temp);
/* Modify the source swizzles on scan_inst. If scan_inst
* was
*
* cmp.ge.f0(8) g21<1>.zF g20<4>.wzyxF g18<4>.yxwzF
*
* replace it with
*
* cmp.ge.f0(8) g21<1>.zF g20<4>.yyyyF g18<4>.wwwwF
*/
unsigned src0_chan;
unsigned src1_chan;
switch (scan_inst->dst.writemask) {
case WRITEMASK_X:
src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 0);
src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 0);
break;
case WRITEMASK_Y:
src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 1);
src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 1);
break;
case WRITEMASK_Z:
src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 2);
src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 2);
break;
case WRITEMASK_W:
src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 3);
src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 3);
break;
default:
unreachable("Impossible writemask");
}
scan_inst->src[0].swizzle = BRW_SWIZZLE4(src0_chan,
src0_chan,
src0_chan,
src0_chan);
/* There's no swizzle on immediate value sources. */
if (scan_inst->src[1].file != IMM) {
scan_inst->src[1].swizzle = BRW_SWIZZLE4(src1_chan,
src1_chan,
src1_chan,
src1_chan);
}
scan_inst->dst = dst_reg(temp);
scan_inst->dst.writemask = inst->dst.writemask;
scan_inst->insert_after(block, mov);
}
inst->remove(block);
progress = true;
break;
}
if (writemasks_incompatible(scan_inst, inst))
break;
/* CMP's result is the same regardless of dest type. */
if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
scan_inst->opcode == BRW_OPCODE_CMP &&
(inst->dst.type == BRW_REGISTER_TYPE_D ||
inst->dst.type == BRW_REGISTER_TYPE_UD)) {
inst->remove(block);
progress = true;
break;
}
/* If the AND wasn't handled by the previous case, it isn't safe
* to remove it.
*/
if (inst->opcode == BRW_OPCODE_AND)
break;
/* Comparisons operate differently for ints and floats */
if (scan_inst->dst.type != inst->dst.type &&
(scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
inst->dst.type == BRW_REGISTER_TYPE_F))
break;
/* If the instruction generating inst's source also wrote the
* flag, and inst is doing a simple .nz comparison, then inst
* is redundant - the appropriate value is already in the flag
* register. Delete inst.
*/
if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
!inst->src[0].negate &&
scan_inst->writes_flag(v->devinfo)) {
inst->remove(block);
progress = true;
break;
}
/* The conditional mod of the CMP/CMPN instructions behaves
* specially because the flag output is not calculated from the
* result of the instruction, but the other way around, which
* means that even if the condmod to propagate and the condmod
* from the CMP instruction are the same they will in general give
* different results because they are evaluated based on different
* inputs.
*/
if (scan_inst->opcode == BRW_OPCODE_CMP ||
scan_inst->opcode == BRW_OPCODE_CMPN)
break;
/* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods":
*
* * Note that the [post condition signal] bits generated at
* the output of a compute are before the .sat.
*/
if (scan_inst->saturate)
break;
/* From the Sky Lake PRM, Vol 2a, "Multiply":
*
* "When multiplying integer data types, if one of the sources
* is a DW, the resulting full precision data is stored in
* the accumulator. However, if the destination data type is
* either W or DW, the low bits of the result are written to
* the destination register and the remaining high bits are
* discarded. This results in undefined Overflow and Sign
* flags. Therefore, conditional modifiers and saturation
* (.sat) cannot be used in this case.
*
* We just disallow cmod propagation on all integer multiplies.
*/
if (!brw_reg_type_is_floating_point(scan_inst->dst.type) &&
scan_inst->opcode == BRW_OPCODE_MUL)
break;
/* Otherwise, try propagating the conditional. */
enum brw_conditional_mod cond =
inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
: inst->conditional_mod;
if (scan_inst->can_do_cmod() &&
((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
scan_inst->conditional_mod == cond)) {
scan_inst->conditional_mod = cond;
inst->remove(block);
progress = true;
}
break;
}
not_match:
if (scan_inst->writes_flag(v->devinfo))
break;
read_flag = read_flag || scan_inst->reads_flag();
}
}
return progress;
}
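/* An illustrative sketch of the ADD/CMP case handled above (register
* numbers invented):
*
*    add(8)       g10<1>F  g2<4>F   g4<4>F
*    cmp.ge.f0(8) null<1>F g2<4>F  -g4<4>F
*
* becomes
*
*    add.ge.f0(8) g10<1>F  g2<4>F   g4<4>F
*
* since the CMP tests g2 - (-g4) >= 0, which is exactly the condition the
* ADD can produce by writing the flag register itself.
*/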
bool
vec4_visitor::opt_cmod_propagation()
{
bool progress = false;
foreach_block_reverse(block, cfg) {
progress = opt_cmod_propagation_local(block, this) || progress;
}
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
return progress;
}
} /* namespace brw */

View file

@ -1,556 +0,0 @@
/*
* Copyright © 2011 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
/**
* @file brw_vec4_copy_propagation.cpp
*
* Implements tracking of values copied between registers, and
* optimizations based on that: copy propagation and constant
* propagation.
*/
#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_eu.h"
namespace brw {
struct copy_entry {
src_reg *value[4];
int saturatemask;
};
static bool
is_direct_copy(vec4_instruction *inst)
{
return (inst->opcode == BRW_OPCODE_MOV &&
!inst->predicate &&
inst->dst.file == VGRF &&
inst->dst.offset % REG_SIZE == 0 &&
!inst->dst.reladdr &&
!inst->src[0].reladdr &&
(inst->dst.type == inst->src[0].type ||
(inst->dst.type == BRW_REGISTER_TYPE_F &&
inst->src[0].type == BRW_REGISTER_TYPE_VF)));
}
static bool
is_dominated_by_previous_instruction(vec4_instruction *inst)
{
return (inst->opcode != BRW_OPCODE_DO &&
inst->opcode != BRW_OPCODE_WHILE &&
inst->opcode != BRW_OPCODE_ELSE &&
inst->opcode != BRW_OPCODE_ENDIF);
}
static bool
is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch)
{
const src_reg *src = values[ch];
/* consider GRF only */
assert(inst->dst.file == VGRF);
if (!src || src->file != VGRF)
return false;
return regions_overlap(*src, REG_SIZE, inst->dst, inst->size_written) &&
(inst->dst.offset != src->offset ||
inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch)));
}
/**
* Get the origin of a copy as a single register if all components present in
* the given readmask originate from the same register and have compatible
* regions, otherwise return a BAD_FILE register.
*/
static src_reg
get_copy_value(const copy_entry &entry, unsigned readmask)
{
unsigned swz[4] = {};
src_reg value;
for (unsigned i = 0; i < 4; i++) {
if (readmask & (1 << i)) {
if (entry.value[i]) {
src_reg src = *entry.value[i];
if (src.file == IMM) {
swz[i] = i;
} else {
swz[i] = BRW_GET_SWZ(src.swizzle, i);
/* Overwrite the original swizzle so the src_reg::equals call
* below doesn't care about it, the correct swizzle will be
* calculated once the swizzles of all components are known.
*/
src.swizzle = BRW_SWIZZLE_XYZW;
}
if (value.file == BAD_FILE) {
value = src;
} else if (!value.equals(src)) {
return src_reg();
}
} else {
return src_reg();
}
}
}
return swizzle(value,
brw_compose_swizzle(brw_swizzle_for_mask(readmask),
BRW_SWIZZLE4(swz[0], swz[1],
swz[2], swz[3])));
}
static bool
try_constant_propagate(vec4_instruction *inst,
int arg, const copy_entry *entry)
{
/* For constant propagation, we only handle the same constant
* across all 4 channels. Some day, we should handle the 8-bit
* float vector format, which would let us constant propagate
* vectors better.
* We could be more aggressive here -- some channels might not get used
* based on the destination writemask.
*/
src_reg value =
get_copy_value(*entry,
brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
WRITEMASK_XYZW));
if (value.file != IMM)
return false;
/* 64-bit types can't be used except for one-source instructions, which
* higher levels should have constant folded away, so there's no point in
* propagating immediates here.
*/
if (type_sz(value.type) == 8 || type_sz(inst->src[arg].type) == 8)
return false;
if (value.type == BRW_REGISTER_TYPE_VF) {
/* The result of bit-casting the component values of a vector float
* cannot in general be represented as an immediate.
*/
if (inst->src[arg].type != BRW_REGISTER_TYPE_F)
return false;
} else {
value.type = inst->src[arg].type;
}
if (inst->src[arg].abs) {
if (!brw_abs_immediate(value.type, &value.as_brw_reg()))
return false;
}
if (inst->src[arg].negate) {
if (!brw_negate_immediate(value.type, &value.as_brw_reg()))
return false;
}
value = swizzle(value, inst->src[arg].swizzle);
switch (inst->opcode) {
case BRW_OPCODE_MOV:
case SHADER_OPCODE_BROADCAST:
inst->src[arg] = value;
return true;
case VEC4_OPCODE_UNTYPED_ATOMIC:
if (arg == 1) {
inst->src[arg] = value;
return true;
}
break;
case SHADER_OPCODE_POW:
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
break;
case BRW_OPCODE_DP2:
case BRW_OPCODE_DP3:
case BRW_OPCODE_DP4:
case BRW_OPCODE_DPH:
case BRW_OPCODE_BFI1:
case BRW_OPCODE_ASR:
case BRW_OPCODE_SHL:
case BRW_OPCODE_SHR:
case BRW_OPCODE_SUBB:
if (arg == 1) {
inst->src[arg] = value;
return true;
}
break;
case BRW_OPCODE_MACH:
case BRW_OPCODE_MUL:
case SHADER_OPCODE_MULH:
case BRW_OPCODE_ADD:
case BRW_OPCODE_OR:
case BRW_OPCODE_AND:
case BRW_OPCODE_XOR:
case BRW_OPCODE_ADDC:
if (arg == 1) {
inst->src[arg] = value;
return true;
} else if (arg == 0 && inst->src[1].file != IMM) {
/* Fit this constant in by commuting the operands. Exception: we
* can't do this for 32-bit integer MUL/MACH because it's asymmetric.
*/
if ((inst->opcode == BRW_OPCODE_MUL ||
inst->opcode == BRW_OPCODE_MACH) &&
(inst->src[1].type == BRW_REGISTER_TYPE_D ||
inst->src[1].type == BRW_REGISTER_TYPE_UD))
break;
inst->src[0] = inst->src[1];
inst->src[1] = value;
return true;
}
break;
case GS_OPCODE_SET_WRITE_OFFSET:
/* This is just a multiply by a constant with special strides.
* The generator will handle immediates in both arguments (generating
* a single MOV of the product). So feel free to propagate in src0.
*/
inst->src[arg] = value;
return true;
case BRW_OPCODE_CMP:
if (arg == 1) {
inst->src[arg] = value;
return true;
} else if (arg == 0 && inst->src[1].file != IMM) {
enum brw_conditional_mod new_cmod;
new_cmod = brw_swap_cmod(inst->conditional_mod);
if (new_cmod != BRW_CONDITIONAL_NONE) {
/* Fit this constant in by swapping the operands and
* flipping the test.
*/
inst->src[0] = inst->src[1];
inst->src[1] = value;
inst->conditional_mod = new_cmod;
return true;
}
}
break;
case BRW_OPCODE_SEL:
if (arg == 1) {
inst->src[arg] = value;
return true;
} else if (arg == 0 && inst->src[1].file != IMM) {
inst->src[0] = inst->src[1];
inst->src[1] = value;
/* If this was predicated, flipping operands means
* we also need to flip the predicate.
*/
if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
inst->predicate_inverse = !inst->predicate_inverse;
}
return true;
}
break;
default:
break;
}
return false;
}
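/* A hypothetical example of the CMP operand swap above (registers
* invented): if g5 is known to hold the immediate 7.0F, then
*
*    cmp.l.f0(8) null<1>F g5<4>.xF g7<4>.xF
*
* is rewritten as
*
*    cmp.g.f0(8) null<1>F g7<4>.xF 7.0F
*
* i.e. the immediate moves to src1 and brw_swap_cmod() flips the test so
* that "7.0 < g7.x" still means the same thing as "g7.x > 7.0".
*/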
static bool
is_align1_opcode(unsigned opcode)
{
switch (opcode) {
case VEC4_OPCODE_DOUBLE_TO_F32:
case VEC4_OPCODE_DOUBLE_TO_D32:
case VEC4_OPCODE_DOUBLE_TO_U32:
case VEC4_OPCODE_TO_DOUBLE:
case VEC4_OPCODE_PICK_LOW_32BIT:
case VEC4_OPCODE_PICK_HIGH_32BIT:
case VEC4_OPCODE_SET_LOW_32BIT:
case VEC4_OPCODE_SET_HIGH_32BIT:
return true;
default:
return false;
}
}
static bool
try_copy_propagate(const struct brw_compiler *compiler,
vec4_instruction *inst, int arg,
const copy_entry *entry, int attributes_per_reg)
{
const struct intel_device_info *devinfo = compiler->devinfo;
/* Build up the value we are propagating as if it were the source of a
* single MOV
*/
src_reg value =
get_copy_value(*entry,
brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
WRITEMASK_XYZW));
/* Check that we can propagate that value */
if (value.file != UNIFORM &&
value.file != VGRF &&
value.file != ATTR)
return false;
/* Instructions that write 2 registers also need to read 2 registers. Make
* sure we don't break that restriction by copy propagating from a uniform.
*/
if (inst->size_written > REG_SIZE && is_uniform(value))
return false;
/* There is a regioning restriction such that if execsize == width
* and hstride != 0 then the vstride can't be 0. When we split instructions
* that take a single-precision source (like F->DF conversions) we end up
* with a 4-wide source on an instruction with an execution size of 4.
* If we then copy-propagate the source from a uniform we also end up with a
* vstride of 0 and we violate the restriction.
*/
if (inst->exec_size == 4 && value.file == UNIFORM &&
type_sz(value.type) == 4)
return false;
/* If the type of the copy value is different from the type of the
* instruction then the swizzles and writemasks involved don't have the same
* meaning and simply replacing the source would produce different semantics.
*/
if (type_sz(value.type) != type_sz(inst->src[arg].type))
return false;
if (inst->src[arg].offset % REG_SIZE || value.offset % REG_SIZE)
return false;
bool has_source_modifiers = value.negate || value.abs;
/* gfx6 math and gfx7+ SENDs from GRFs ignore source modifiers on
* instructions.
*/
if (has_source_modifiers && !inst->can_do_source_mods(devinfo))
return false;
/* Reject cases that would violate register regioning restrictions. */
if ((value.file == UNIFORM || value.swizzle != BRW_SWIZZLE_XYZW) &&
((devinfo->ver == 6 && inst->is_math()) ||
inst->is_send_from_grf() ||
inst->uses_indirect_addressing())) {
return false;
}
if (has_source_modifiers &&
value.type != inst->src[arg].type &&
!inst->can_change_types())
return false;
if (has_source_modifiers &&
(inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_WRITE ||
inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT))
return false;
unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle,
value.swizzle);
/* Instructions that operate on vectors in ALIGN1 mode will ignore swizzles
* so copy-propagation won't be safe if the composed swizzle is anything
* other than the identity.
*/
if (is_align1_opcode(inst->opcode) && composed_swizzle != BRW_SWIZZLE_XYZW)
return false;
if (inst->is_3src(compiler) &&
(value.file == UNIFORM ||
(value.file == ATTR && attributes_per_reg != 1)) &&
!brw_is_single_value_swizzle(composed_swizzle))
return false;
if (inst->is_send_from_grf())
return false;
/* we can't generally copy-propagate UD negations because we
* end up accessing the resulting values as signed integers
* instead. See also resolve_ud_negate().
*/
if (value.negate &&
value.type == BRW_REGISTER_TYPE_UD)
return false;
/* Don't report progress if this is a noop. */
if (value.equals(inst->src[arg]))
return false;
const unsigned dst_saturate_mask = inst->dst.writemask &
brw_apply_swizzle_to_mask(inst->src[arg].swizzle, entry->saturatemask);
if (dst_saturate_mask) {
/* We either saturate all or nothing. */
if (dst_saturate_mask != inst->dst.writemask)
return false;
/* Limit saturate propagation only to SEL with src1 bounded within 0.0
* and 1.0, otherwise skip copy propagate altogether.
*/
switch(inst->opcode) {
case BRW_OPCODE_SEL:
if (arg != 0 ||
inst->src[0].type != BRW_REGISTER_TYPE_F ||
inst->src[1].file != IMM ||
inst->src[1].type != BRW_REGISTER_TYPE_F ||
inst->src[1].f < 0.0 ||
inst->src[1].f > 1.0) {
return false;
}
if (!inst->saturate)
inst->saturate = true;
break;
default:
return false;
}
}
/* Build the final value */
if (inst->src[arg].abs) {
value.negate = false;
value.abs = true;
}
if (inst->src[arg].negate)
value.negate = !value.negate;
value.swizzle = composed_swizzle;
if (has_source_modifiers &&
value.type != inst->src[arg].type) {
assert(inst->can_change_types());
for (int i = 0; i < 3; i++) {
inst->src[i].type = value.type;
}
inst->dst.type = value.type;
} else {
value.type = inst->src[arg].type;
}
inst->src[arg] = value;
return true;
}
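/* A hypothetical example of the swizzle composition above (registers
* invented): given the copy
*
*    mov(8) g6<1>F g3<4>.wzyxF
*
* a later read of g6<4>.yyyyF is rewritten to read g3<4>.zzzzF, because
* composing .yyyy with .wzyx selects component z of g3 for every channel.
*/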
bool
vec4_visitor::opt_copy_propagation(bool do_constant_prop)
{
/* If we are in dual instanced or single mode, then attributes are going
* to be interleaved, so one register contains two attribute slots.
*/
const int attributes_per_reg =
prog_data->dispatch_mode == INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
bool progress = false;
struct copy_entry entries[alloc.total_size];
memset(&entries, 0, sizeof(entries));
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
/* This pass only works on basic blocks. If there's flow
* control, throw out all our information and start from
* scratch.
*
* This should really be fixed by using a structure like in
* src/glsl/opt_copy_propagation.cpp to track available copies.
*/
if (!is_dominated_by_previous_instruction(inst)) {
memset(&entries, 0, sizeof(entries));
continue;
}
/* For each source arg, see if each component comes from a copy
* from the same type file (IMM, VGRF, UNIFORM), and try
* optimizing out access to the copy result
*/
for (int i = 2; i >= 0; i--) {
/* Copied values end up in GRFs, and we don't track reladdr
* accesses.
*/
if (inst->src[i].file != VGRF ||
inst->src[i].reladdr)
continue;
/* We only handle register-aligned single GRF copies. */
if (inst->size_read(i) != REG_SIZE ||
inst->src[i].offset % REG_SIZE)
continue;
const unsigned reg = (alloc.offsets[inst->src[i].nr] +
inst->src[i].offset / REG_SIZE);
const copy_entry &entry = entries[reg];
if (do_constant_prop && try_constant_propagate(inst, i, &entry))
progress = true;
else if (try_copy_propagate(compiler, inst, i, &entry, attributes_per_reg))
progress = true;
}
/* Track available source registers. */
if (inst->dst.file == VGRF) {
const int reg =
alloc.offsets[inst->dst.nr] + inst->dst.offset / REG_SIZE;
/* Update our destination's current channel values. For a direct copy,
* the value is the newly propagated source. Otherwise, we don't know
* the new value, so clear it.
*/
bool direct_copy = is_direct_copy(inst);
entries[reg].saturatemask &= ~inst->dst.writemask;
for (int i = 0; i < 4; i++) {
if (inst->dst.writemask & (1 << i)) {
entries[reg].value[i] = direct_copy ? &inst->src[0] : NULL;
entries[reg].saturatemask |=
inst->saturate && direct_copy ? 1 << i : 0;
}
}
/* Clear the records for any registers whose current value came from
* our destination's updated channels, as the two are no longer equal.
*/
if (inst->dst.reladdr)
memset(&entries, 0, sizeof(entries));
else {
for (unsigned i = 0; i < alloc.total_size; i++) {
for (int j = 0; j < 4; j++) {
if (is_channel_updated(inst, entries[i].value, j)) {
entries[i].value[j] = NULL;
entries[i].saturatemask &= ~(1 << j);
}
}
}
}
}
}
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
DEPENDENCY_INSTRUCTION_DETAIL);
return progress;
}
} /* namespace brw */

View file

@ -1,322 +0,0 @@
/*
* Copyright © 2012, 2013, 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_vec4.h"
#include "brw_vec4_live_variables.h"
#include "brw_cfg.h"
using namespace brw;
/** @file brw_vec4_cse.cpp
*
* Support for local common subexpression elimination.
*
* See Muchnick's Advanced Compiler Design and Implementation, section
* 13.1 (p378).
*/
namespace {
struct aeb_entry : public exec_node {
/** The instruction that generates the expression value. */
vec4_instruction *generator;
/** The temporary where the value is stored. */
src_reg tmp;
};
}
static bool
is_expression(const vec4_instruction *const inst)
{
switch (inst->opcode) {
case BRW_OPCODE_MOV:
case BRW_OPCODE_SEL:
case BRW_OPCODE_NOT:
case BRW_OPCODE_AND:
case BRW_OPCODE_OR:
case BRW_OPCODE_XOR:
case BRW_OPCODE_SHR:
case BRW_OPCODE_SHL:
case BRW_OPCODE_ASR:
case BRW_OPCODE_CMP:
case BRW_OPCODE_CMPN:
case BRW_OPCODE_ADD:
case BRW_OPCODE_MUL:
case SHADER_OPCODE_MULH:
case BRW_OPCODE_FRC:
case BRW_OPCODE_RNDU:
case BRW_OPCODE_RNDD:
case BRW_OPCODE_RNDE:
case BRW_OPCODE_RNDZ:
case BRW_OPCODE_LINE:
case BRW_OPCODE_PLN:
case BRW_OPCODE_MAD:
case BRW_OPCODE_LRP:
case VEC4_OPCODE_UNPACK_UNIFORM:
case SHADER_OPCODE_FIND_LIVE_CHANNEL:
case SHADER_OPCODE_BROADCAST:
case VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS:
case VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
return true;
case SHADER_OPCODE_RCP:
case SHADER_OPCODE_RSQ:
case SHADER_OPCODE_SQRT:
case SHADER_OPCODE_EXP2:
case SHADER_OPCODE_LOG2:
case SHADER_OPCODE_POW:
case SHADER_OPCODE_INT_QUOTIENT:
case SHADER_OPCODE_INT_REMAINDER:
case SHADER_OPCODE_SIN:
case SHADER_OPCODE_COS:
return inst->mlen == 0;
default:
return false;
}
}
static bool
operands_match(const vec4_instruction *a, const vec4_instruction *b)
{
const src_reg *xs = a->src;
const src_reg *ys = b->src;
if (a->opcode == BRW_OPCODE_MAD) {
return xs[0].equals(ys[0]) &&
((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
(xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
} else if (a->opcode == BRW_OPCODE_MOV &&
xs[0].file == IMM &&
xs[0].type == BRW_REGISTER_TYPE_VF) {
src_reg tmp_x = xs[0];
src_reg tmp_y = ys[0];
/* Smash out the values that are not part of the writemask. Otherwise
* the equals operator will fail due to mismatches in unused components.
*/
const unsigned ab_writemask = a->dst.writemask & b->dst.writemask;
const uint32_t mask = ((ab_writemask & WRITEMASK_X) ? 0x000000ff : 0) |
((ab_writemask & WRITEMASK_Y) ? 0x0000ff00 : 0) |
((ab_writemask & WRITEMASK_Z) ? 0x00ff0000 : 0) |
((ab_writemask & WRITEMASK_W) ? 0xff000000 : 0);
tmp_x.ud &= mask;
tmp_y.ud &= mask;
return tmp_x.equals(tmp_y);
} else if (!a->is_commutative()) {
return xs[0].equals(ys[0]) && xs[1].equals(ys[1]) && xs[2].equals(ys[2]);
} else {
return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
(xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
}
}
/**
* Checks if instructions match, exactly for sources, but loosely for
* destination writemasks.
*
* \param 'a' is the generating expression from the AEB entry.
* \param 'b' is the second occurrence of the expression that we're
* considering eliminating.
*/
static bool
instructions_match(vec4_instruction *a, vec4_instruction *b)
{
return a->opcode == b->opcode &&
a->saturate == b->saturate &&
a->predicate == b->predicate &&
a->predicate_inverse == b->predicate_inverse &&
a->conditional_mod == b->conditional_mod &&
a->flag_subreg == b->flag_subreg &&
a->dst.type == b->dst.type &&
a->offset == b->offset &&
a->mlen == b->mlen &&
a->base_mrf == b->base_mrf &&
a->header_size == b->header_size &&
a->shadow_compare == b->shadow_compare &&
((a->dst.writemask & b->dst.writemask) == a->dst.writemask) &&
a->force_writemask_all == b->force_writemask_all &&
a->size_written == b->size_written &&
a->exec_size == b->exec_size &&
a->group == b->group &&
operands_match(a, b);
}
bool
vec4_visitor::opt_cse_local(bblock_t *block, const vec4_live_variables &live)
{
bool progress = false;
exec_list aeb;
void *cse_ctx = ralloc_context(NULL);
int ip = block->start_ip;
foreach_inst_in_block (vec4_instruction, inst, block) {
/* Skip some cases. */
if (is_expression(inst) && !inst->predicate && inst->mlen == 0 &&
((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
inst->dst.is_null()))
{
bool found = false;
foreach_in_list_use_after(aeb_entry, entry, &aeb) {
/* Match current instruction's expression against those in AEB. */
if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
instructions_match(inst, entry->generator)) {
found = true;
progress = true;
break;
}
}
if (!found) {
if (inst->opcode != BRW_OPCODE_MOV ||
(inst->opcode == BRW_OPCODE_MOV &&
inst->src[0].file == IMM &&
inst->src[0].type == BRW_REGISTER_TYPE_VF)) {
/* Our first sighting of this expression. Create an entry. */
aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
entry->tmp = src_reg(); /* file will be BAD_FILE */
entry->generator = inst;
aeb.push_tail(entry);
}
} else {
/* This is at least our second sighting of this expression.
* If we don't have a temporary already, make one.
*/
bool no_existing_temp = entry->tmp.file == BAD_FILE;
if (no_existing_temp && !entry->generator->dst.is_null()) {
entry->tmp = retype(src_reg(VGRF, alloc.allocate(
regs_written(entry->generator)),
NULL), inst->dst.type);
const unsigned width = entry->generator->exec_size;
unsigned component_size = width * type_sz(entry->tmp.type);
unsigned num_copy_movs =
DIV_ROUND_UP(entry->generator->size_written, component_size);
for (unsigned i = 0; i < num_copy_movs; ++i) {
vec4_instruction *copy =
MOV(offset(entry->generator->dst, width, i),
offset(entry->tmp, width, i));
copy->exec_size = width;
copy->group = entry->generator->group;
copy->force_writemask_all =
entry->generator->force_writemask_all;
entry->generator->insert_after(block, copy);
}
entry->generator->dst = dst_reg(entry->tmp);
}
/* dest <- temp */
if (!inst->dst.is_null()) {
assert(inst->dst.type == entry->tmp.type);
const unsigned width = inst->exec_size;
unsigned component_size = width * type_sz(inst->dst.type);
unsigned num_copy_movs =
DIV_ROUND_UP(inst->size_written, component_size);
for (unsigned i = 0; i < num_copy_movs; ++i) {
vec4_instruction *copy =
MOV(offset(inst->dst, width, i),
offset(entry->tmp, width, i));
copy->exec_size = inst->exec_size;
copy->group = inst->group;
copy->force_writemask_all = inst->force_writemask_all;
inst->insert_before(block, copy);
}
}
/* Set our iterator so that next time through the loop inst->next
* will get the instruction in the basic block after the one we've
* removed.
*/
vec4_instruction *prev = (vec4_instruction *)inst->prev;
inst->remove(block);
inst = prev;
}
}
foreach_in_list_safe(aeb_entry, entry, &aeb) {
/* Kill all AEB entries that write a different value to or read from
* the flag register if we just wrote it.
*/
if (inst->writes_flag(devinfo)) {
if (entry->generator->reads_flag() ||
(entry->generator->writes_flag(devinfo) &&
!instructions_match(inst, entry->generator))) {
entry->remove();
ralloc_free(entry);
continue;
}
}
for (int i = 0; i < 3; i++) {
src_reg *src = &entry->generator->src[i];
/* Kill all AEB entries that use the destination we just
* overwrote.
*/
if (inst->dst.file == entry->generator->src[i].file &&
inst->dst.nr == entry->generator->src[i].nr) {
entry->remove();
ralloc_free(entry);
break;
}
/* Kill any AEB entries using registers that don't get reused any
* more -- a sure sign they'll fail operands_match().
*/
if (src->file == VGRF) {
if (live.var_range_end(var_from_reg(alloc, dst_reg(*src)), 8) < ip) {
entry->remove();
ralloc_free(entry);
break;
}
}
}
}
ip++;
}
ralloc_free(cse_ctx);
return progress;
}
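/* A sketch of the rewrite performed above (registers invented): for two
* matching expressions
*
*    mul(8) g10<1>F g2<4>F g3<4>F
*    ...
*    mul(8) g12<1>F g2<4>F g3<4>F
*
* the generator is retargeted to a fresh temporary g14, a copy
* "mov(8) g10<1>F g14<4>F" is inserted right after it, and the second MUL
* is replaced by "mov(8) g12<1>F g14<4>F".
*/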
bool
vec4_visitor::opt_cse()
{
bool progress = false;
const vec4_live_variables &live = live_analysis.require();
foreach_block (block, cfg) {
progress = opt_cse_local(block, live) || progress;
}
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
return progress;
}

View file

@ -1,188 +0,0 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_vec4.h"
#include "brw_vec4_live_variables.h"
#include "brw_cfg.h"
/** @file brw_vec4_dead_code_eliminate.cpp
*
* Dataflow-aware dead code elimination.
*
* Walks the instruction list from the bottom, removing instructions that
* have results that both aren't used in later blocks and haven't been read
* yet in the tail end of this block.
*/
using namespace brw;
bool
vec4_visitor::dead_code_eliminate()
{
bool progress = false;
const vec4_live_variables &live_vars = live_analysis.require();
int num_vars = live_vars.num_vars;
BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars));
BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1);
foreach_block_reverse_safe(block, cfg) {
memcpy(live, live_vars.block_data[block->num].liveout,
sizeof(BITSET_WORD) * BITSET_WORDS(num_vars));
memcpy(flag_live, live_vars.block_data[block->num].flag_liveout,
sizeof(BITSET_WORD));
foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) {
if ((inst->dst.file == VGRF && !inst->has_side_effects()) ||
(inst->dst.is_null() && inst->writes_flag(devinfo))){
bool result_live[4] = { false };
if (inst->dst.file == VGRF) {
for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) {
for (int c = 0; c < 4; c++) {
const unsigned v = var_from_reg(alloc, inst->dst, c, i);
result_live[c] |= BITSET_TEST(live, v);
}
}
} else {
for (unsigned c = 0; c < 4; c++)
result_live[c] = BITSET_TEST(flag_live, c);
}
/* If the instruction can't do writemasking, then it's all or
* nothing.
*/
if (!inst->can_do_writemask(devinfo)) {
bool result = result_live[0] | result_live[1] |
result_live[2] | result_live[3];
result_live[0] = result;
result_live[1] = result;
result_live[2] = result;
result_live[3] = result;
}
if (inst->writes_flag(devinfo)) {
/* Independently calculate the usage of the flag components and
* the destination value components.
*/
uint8_t flag_mask = inst->dst.writemask;
uint8_t dest_mask = inst->dst.writemask;
for (int c = 0; c < 4; c++) {
if (!result_live[c] && dest_mask & (1 << c))
dest_mask &= ~(1 << c);
if (!BITSET_TEST(flag_live, c))
flag_mask &= ~(1 << c);
}
if (inst->dst.writemask != (flag_mask | dest_mask)) {
progress = true;
inst->dst.writemask = flag_mask | dest_mask;
}
/* If none of the destination components are read, replace the
* destination register with the NULL register.
*/
if (dest_mask == 0) {
progress = true;
inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type));
}
} else {
for (int c = 0; c < 4; c++) {
if (!result_live[c] && inst->dst.writemask & (1 << c)) {
inst->dst.writemask &= ~(1 << c);
progress = true;
if (inst->dst.writemask == 0) {
if (inst->writes_accumulator) {
inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type));
} else {
inst->opcode = BRW_OPCODE_NOP;
break;
}
}
}
}
}
}
if (inst->dst.is_null() && inst->writes_flag(devinfo)) {
bool combined_live = false;
for (unsigned c = 0; c < 4; c++)
combined_live |= BITSET_TEST(flag_live, c);
if (!combined_live) {
inst->opcode = BRW_OPCODE_NOP;
progress = true;
}
}
if (inst->dst.file == VGRF && !inst->predicate &&
!inst->is_align1_partial_write()) {
for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) {
for (int c = 0; c < 4; c++) {
if (inst->dst.writemask & (1 << c)) {
const unsigned v = var_from_reg(alloc, inst->dst, c, i);
BITSET_CLEAR(live, v);
}
}
}
}
if (inst->writes_flag(devinfo) && !inst->predicate && inst->exec_size == 8) {
for (unsigned c = 0; c < 4; c++)
BITSET_CLEAR(flag_live, c);
}
if (inst->opcode == BRW_OPCODE_NOP) {
inst->remove(block);
continue;
}
for (int i = 0; i < 3; i++) {
if (inst->src[i].file == VGRF) {
for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) {
for (int c = 0; c < 4; c++) {
const unsigned v = var_from_reg(alloc, inst->src[i], c, j);
BITSET_SET(live, v);
}
}
}
}
for (unsigned c = 0; c < 4; c++) {
if (inst->reads_flag(c)) {
BITSET_SET(flag_live, c);
}
}
}
}
ralloc_free(live);
ralloc_free(flag_live);
if (progress)
invalidate_analysis(DEPENDENCY_INSTRUCTIONS);
return progress;
}
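/* A sketch of the writemask narrowing above (registers invented): if only
* the .xy components of g10 are live downstream,
*
*    add(8) g10<1>.xyzwF g2<4>F g3<4>F
*
* is narrowed to write only .xy, and an instruction whose writemask drops
* to zero (and that writes neither the flag nor the accumulator) is turned
* into a NOP and removed.
*/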

File diff suppressed because it is too large

View file

@ -1,98 +0,0 @@
/*
* Copyright © 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_vec4_gs_visitor.h"
namespace brw {
void
vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
dst_reg dest;
src_reg src;
switch (instr->intrinsic) {
case nir_intrinsic_load_per_vertex_input: {
assert(instr->def.bit_size == 32);
/* The EmitNoIndirectInput flag guarantees our vertex index will
* be constant. We should handle indirects someday.
*/
const unsigned vertex = nir_src_as_uint(instr->src[0]);
const unsigned offset_reg = nir_src_as_uint(instr->src[1]);
const unsigned input_array_stride = prog_data->urb_read_length * 2;
/* Make up a type...we have no way of knowing... */
const glsl_type *const type = glsl_ivec_type(instr->num_components);
src = src_reg(ATTR, input_array_stride * vertex +
nir_intrinsic_base(instr) + offset_reg,
type);
src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr));
dest = get_nir_def(instr->def, src.type);
dest.writemask = brw_writemask_for_size(instr->num_components);
emit(MOV(dest, src));
break;
}
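/* A worked example (values invented): with urb_read_length == 2 the
* input_array_stride is 4, so vertex == 1, base == 2 and offset_reg == 0
* read from ATTR slot 4 * 1 + 2 + 0 == 6.
*/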
case nir_intrinsic_load_input:
unreachable("nir_lower_io should have produced per_vertex intrinsics");
case nir_intrinsic_emit_vertex_with_counter:
this->vertex_count =
retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
gs_emit_vertex(nir_intrinsic_stream_id(instr));
break;
case nir_intrinsic_end_primitive_with_counter:
this->vertex_count =
retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
gs_end_primitive();
break;
case nir_intrinsic_set_vertex_and_primitive_count:
this->vertex_count =
retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
break;
case nir_intrinsic_load_primitive_id:
assert(gs_prog_data->include_primitive_id);
dest = get_nir_def(instr->def, BRW_REGISTER_TYPE_D);
emit(MOV(dest, retype(brw_vec4_grf(1, 0), BRW_REGISTER_TYPE_D)));
break;
case nir_intrinsic_load_invocation_id: {
dest = get_nir_def(instr->def, BRW_REGISTER_TYPE_D);
if (gs_prog_data->invocations > 1)
emit(GS_OPCODE_GET_INSTANCE_ID, dest);
else
emit(MOV(dest, brw_imm_ud(0)));
break;
}
default:
vec4_visitor::nir_emit_intrinsic(instr);
}
}
}

View file

@ -1,560 +0,0 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* \file brw_vec4_gs_visitor.cpp
*
* Geometry-shader-specific code derived from the vec4_visitor class.
*/
#include "brw_vec4_gs_visitor.h"
#include "brw_cfg.h"
#include "brw_fs.h"
namespace brw {
vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
struct brw_gs_compile *c,
struct brw_gs_prog_data *prog_data,
const nir_shader *shader,
bool no_spills,
bool debug_enabled)
: vec4_visitor(compiler, params, &c->key.base.tex,
&prog_data->base, shader,
no_spills, debug_enabled),
c(c),
gs_prog_data(prog_data)
{
}
static inline struct brw_reg
attribute_to_hw_reg(int attr, brw_reg_type type, bool interleaved)
{
struct brw_reg reg;
unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(type));
if (interleaved) {
reg = stride(brw_vecn_grf(width, attr / 2, (attr % 2) * 4), 0, width, 1);
} else {
reg = brw_vecn_grf(width, attr, 0);
}
reg.type = type;
return reg;
}
/**
* Replace each register of type ATTR in this->instructions with a reference
* to a fixed HW register.
*
* If interleaved is true, then each attribute takes up half a register, with
* register N containing attribute 2*N in its first half and attribute 2*N+1
* in its second half (this corresponds to the payload setup used by geometry
* shaders in "single" or "dual instanced" dispatch mode). If interleaved is
* false, then each attribute takes up a whole register, with register N
* containing attribute N (this corresponds to the payload setup used by
* vertex shaders, and by geometry shaders in "dual object" dispatch mode).
*/
int
vec4_gs_visitor::setup_varying_inputs(int payload_reg,
int attributes_per_reg)
{
/* For geometry shaders there are N copies of the input attributes, where N
* is the number of input vertices. attribute_map[BRW_VARYING_SLOT_COUNT *
* i + j] represents attribute j for vertex i.
*
* Note that GS inputs are read from the VUE 256 bits (2 vec4's) at a time,
* so the total number of input slots that will be delivered to the GS (and
* thus the stride of the input arrays) is urb_read_length * 2.
*/
const unsigned num_input_vertices = nir->info.gs.vertices_in;
assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
unsigned input_array_stride = prog_data->urb_read_length * 2;
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
for (int i = 0; i < 3; i++) {
if (inst->src[i].file != ATTR)
continue;
assert(inst->src[i].offset % REG_SIZE == 0);
int grf = payload_reg * attributes_per_reg +
inst->src[i].nr + inst->src[i].offset / REG_SIZE;
struct brw_reg reg =
attribute_to_hw_reg(grf, inst->src[i].type, attributes_per_reg > 1);
reg.swizzle = inst->src[i].swizzle;
if (inst->src[i].abs)
reg = brw_abs(reg);
if (inst->src[i].negate)
reg = negate(reg);
inst->src[i] = reg;
}
}
int regs_used = ALIGN(input_array_stride * num_input_vertices,
attributes_per_reg) / attributes_per_reg;
return payload_reg + regs_used;
}
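/* A worked example (values invented): in dual-instanced mode
* (attributes_per_reg == 2), payload_reg == 2 and inst->src[i].nr == 5
* give grf == 2 * 2 + 5 == 9, which attribute_to_hw_reg() maps to a
* four-wide region in the second (upper) half of g4.
*/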
void
vec4_gs_visitor::setup_payload()
{
/* If we are in dual instanced or single mode, then attributes are going
* to be interleaved, so one register contains two attribute slots.
*/
int attributes_per_reg =
prog_data->dispatch_mode == INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
int reg = 0;
/* The payload always contains important data in r0, which contains
* the URB handles that are passed on to the URB write at the end
* of the thread.
*/
reg++;
/* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
if (gs_prog_data->include_primitive_id)
reg++;
reg = setup_uniforms(reg);
reg = setup_varying_inputs(reg, attributes_per_reg);
this->first_non_payload_grf = reg;
}
void
vec4_gs_visitor::emit_prolog()
{
/* In vertex shaders, r0.2 is guaranteed to be initialized to zero. In
* geometry shaders, it isn't (it contains a bunch of information we don't
* need, like the input primitive type). We need r0.2 to be zero in order
* to build scratch read/write messages correctly (otherwise this value
* will be interpreted as a global offset, causing us to do our scratch
* reads/writes to garbage memory). So just set it to zero at the top of
* the shader.
*/
this->current_annotation = "clear r0.2";
dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, brw_imm_ud(0u));
inst->force_writemask_all = true;
/* Create a virtual register to hold the vertex count */
this->vertex_count = src_reg(this, glsl_uint_type());
/* Initialize the vertex_count register to 0 */
this->current_annotation = "initialize vertex_count";
inst = emit(MOV(dst_reg(this->vertex_count), brw_imm_ud(0u)));
inst->force_writemask_all = true;
if (c->control_data_header_size_bits > 0) {
/* Create a virtual register to hold the current set of control data
* bits.
*/
this->control_data_bits = src_reg(this, glsl_uint_type());
/* If we're outputting more than 32 control data bits, then EmitVertex()
* will set control_data_bits to 0 after emitting the first vertex.
* Otherwise, we need to initialize it to 0 here.
*/
if (c->control_data_header_size_bits <= 32) {
this->current_annotation = "initialize control data bits";
inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u)));
inst->force_writemask_all = true;
}
}
this->current_annotation = NULL;
}
void
vec4_gs_visitor::emit_thread_end()
{
if (c->control_data_header_size_bits > 0) {
/* During shader execution, we only ever call emit_control_data_bits()
* just prior to outputting a vertex. Therefore, the control data bits
* corresponding to the most recently output vertex still need to be
* emitted.
*/
current_annotation = "thread end: emit control data bits";
emit_control_data_bits();
}
/* MRF 0 is reserved for the debugger, so start with message header
* in MRF 1.
*/
int base_mrf = 1;
current_annotation = "thread end";
dst_reg mrf_reg(MRF, base_mrf);
src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
vec4_instruction *inst = emit(MOV(mrf_reg, r0));
inst->force_writemask_all = true;
emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count);
inst = emit(GS_OPCODE_THREAD_END);
inst->base_mrf = base_mrf;
inst->mlen = 1;
}
void
vec4_gs_visitor::emit_urb_write_header(int mrf)
{
/* The SEND instruction that writes the vertex data to the VUE will use
* per_slot_offset=true, which means that DWORDs 3 and 4 of the message
* header specify an offset (in multiples of 256 bits) into the URB entry
* at which the write should take place.
*
* So we have to prepare a message header with the appropriate offset
* values.
*/
dst_reg mrf_reg(MRF, mrf);
src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
this->current_annotation = "URB write header";
vec4_instruction *inst = emit(MOV(mrf_reg, r0));
inst->force_writemask_all = true;
emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count,
brw_imm_ud(gs_prog_data->output_vertex_size_hwords));
}
vec4_instruction *
vec4_gs_visitor::emit_urb_write_opcode(bool complete)
{
/* We don't care whether the vertex is complete, because in general
* geometry shaders output multiple vertices, and we don't terminate the
* thread until all vertices are complete.
*/
(void) complete;
vec4_instruction *inst = emit(VEC4_GS_OPCODE_URB_WRITE);
inst->offset = gs_prog_data->control_data_header_size_hwords;
inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
return inst;
}
/**
* Write out a batch of 32 control data bits from the control_data_bits
* register to the URB.
*
* The current value of the vertex_count register determines which DWORD in
* the URB receives the control data bits. The control_data_bits register is
* assumed to contain the correct data for the vertex that was most recently
* output, and all previous vertices that share the same DWORD.
*
* This function takes care of ensuring that if no vertices have been output
* yet, no control bits are emitted.
*/
void
vec4_gs_visitor::emit_control_data_bits()
{
assert(c->control_data_bits_per_vertex != 0);
/* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
* granularity, we need to use two tricks to ensure that the batch of 32
* control data bits is written to the appropriate DWORD in the URB. To
* select which vec4 we are writing to, we use the "slot {0,1} offset"
* fields of the message header. To select which DWORD in the vec4 we are
* writing to, we use the channel mask fields of the message header. To
* avoid penalizing geometry shaders that emit a small number of vertices
* with extra bookkeeping, we only do each of these tricks when
* c->prog_data.control_data_header_size_bits is large enough to make it
* necessary.
*
* Note: this means that if we're outputting just a single DWORD of control
* data bits, we'll actually replicate it four times since we won't do any
* channel masking. But that's not a problem since in this case the
* hardware only pays attention to the first DWORD.
*/
enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
if (c->control_data_header_size_bits > 32)
urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
if (c->control_data_header_size_bits > 128)
urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;
/* If we are using either channel masks or a per-slot offset, then we
* need to figure out which DWORD we are trying to write to, using the
* formula:
*
* dword_index = (vertex_count - 1) * bits_per_vertex / 32
*
* Since bits_per_vertex is a power of two, and is known at compile
* time, this can be optimized to:
*
* dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex))
*/
src_reg dword_index(this, glsl_uint_type());
if (urb_write_flags) {
src_reg prev_count(this, glsl_uint_type());
emit(ADD(dst_reg(prev_count), this->vertex_count,
brw_imm_ud(0xffffffffu)));
unsigned log2_bits_per_vertex =
util_last_bit(c->control_data_bits_per_vertex);
emit(SHR(dst_reg(dword_index), prev_count,
brw_imm_ud(6 - log2_bits_per_vertex)));
}
/* Start building the URB write message. The first MRF gets a copy of
* R0.
*/
int base_mrf = 1;
dst_reg mrf_reg(MRF, base_mrf);
src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
vec4_instruction *inst = emit(MOV(mrf_reg, r0));
inst->force_writemask_all = true;
if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
/* Set the per-slot offset to dword_index / 4, so that we'll write to
* the appropriate OWORD within the control data header.
*/
src_reg per_slot_offset(this, glsl_uint_type());
emit(SHR(dst_reg(per_slot_offset), dword_index, brw_imm_ud(2u)));
emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset,
brw_imm_ud(1u));
}
if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
/* Set the channel masks to 1 << (dword_index % 4), so that we'll
* write to the appropriate DWORD within the OWORD. We need to do
* this computation with force_writemask_all, otherwise garbage data
* from invocation 0 might clobber the mask for invocation 1 when
* GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
* together.
*/
src_reg channel(this, glsl_uint_type());
inst = emit(AND(dst_reg(channel), dword_index, brw_imm_ud(3u)));
inst->force_writemask_all = true;
src_reg one(this, glsl_uint_type());
inst = emit(MOV(dst_reg(one), brw_imm_ud(1u)));
inst->force_writemask_all = true;
src_reg channel_mask(this, glsl_uint_type());
inst = emit(SHL(dst_reg(channel_mask), one, channel));
inst->force_writemask_all = true;
emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
channel_mask);
emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
}
/* Store the control data bits in the message payload and send it. */
dst_reg mrf_reg2(MRF, base_mrf + 1);
inst = emit(MOV(mrf_reg2, this->control_data_bits));
inst->force_writemask_all = true;
inst = emit(VEC4_GS_OPCODE_URB_WRITE);
inst->urb_write_flags = urb_write_flags;
inst->base_mrf = base_mrf;
inst->mlen = 2;
}
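/* A worked example (values invented): with control_data_bits_per_vertex
* == 2 and vertex_count == 40, prev_count == 39 and dword_index == 39 >> 4
* == 2, matching (40 - 1) * 2 / 32.  When both tricks are in use, the
* per-slot offset is 2 >> 2 == 0 and the channel mask is 1 << (2 & 3) ==
* 0x4, so DWORD 2 of the control data header receives this batch of bits.
*/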
void
vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id)
{
/* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
/* Note: we are calling this *before* increasing vertex_count, so
* this->vertex_count == vertex_count - 1 in the formula above.
*/
/* Stream mode uses 2 bits per vertex */
assert(c->control_data_bits_per_vertex == 2);
/* Must be a valid stream */
assert(stream_id < 4); /* MAX_VERTEX_STREAMS */
/* Control data bits are initialized to 0 so we don't have to set any
* bits when sending vertices to stream 0.
*/
if (stream_id == 0)
return;
/* reg::sid = stream_id */
src_reg sid(this, glsl_uint_type());
emit(MOV(dst_reg(sid), brw_imm_ud(stream_id)));
/* reg:shift_count = 2 * (vertex_count - 1) */
src_reg shift_count(this, glsl_uint_type());
emit(SHL(dst_reg(shift_count), this->vertex_count, brw_imm_ud(1u)));
/* Note: we're relying on the fact that the GEN SHL instruction only pays
* attention to the lower 5 bits of its second source argument, so on this
* architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
* stream_id << ((2 * (vertex_count - 1)) % 32).
*/
src_reg mask(this, glsl_uint_type());
emit(SHL(dst_reg(mask), sid, shift_count));
emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
}
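/* A worked example (values invented): emitting the fourth vertex
* (this->vertex_count == 3) to stream 2 gives shift_count == 6 and
* mask == 2 << 6 == 0x80, so bits 7:6 of control_data_bits record the
* stream for that vertex.
*/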
void
vec4_gs_visitor::gs_emit_vertex(int stream_id)
{
this->current_annotation = "emit vertex: safety check";
/* Haswell and later hardware ignores the "Render Stream Select" bits
* from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
* and instead sends all primitives down the pipeline for rasterization.
* If the SOL stage is enabled, "Render Stream Select" is honored and
* primitives bound to non-zero streams are discarded after stream output.
*
* Since the only purpose of primitives sent to non-zero streams is to
* be recorded by transform feedback, we can simply discard all geometry
* bound to these streams when transform feedback is disabled.
*/
if (stream_id > 0 && !nir->info.has_transform_feedback_varyings)
return;
/* If we're outputting 32 control data bits or less, then we can wait
* until the shader is over to output them all. Otherwise we need to
* output them as we go. Now is the time to do it, since we're about to
* output the vertex_count'th vertex, so it's guaranteed that the
* control data bits associated with the (vertex_count - 1)th vertex are
* correct.
*/
if (c->control_data_header_size_bits > 32) {
this->current_annotation = "emit vertex: emit control data bits";
/* Only emit control data bits if we've finished accumulating a batch
* of 32 bits. This is the case when:
*
* (vertex_count * bits_per_vertex) % 32 == 0
*
* (in other words, when the last 5 bits of vertex_count *
* bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some
* integer n (which is always the case, since bits_per_vertex is
* always 1 or 2), this is equivalent to requiring that the last 5-n
* bits of vertex_count are 0:
*
* vertex_count & (2^(5-n) - 1) == 0
*
* 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
* equivalent to:
*
* vertex_count & (32 / bits_per_vertex - 1) == 0
*/
vec4_instruction *inst =
emit(AND(dst_null_ud(), this->vertex_count,
brw_imm_ud(32 / c->control_data_bits_per_vertex - 1)));
inst->conditional_mod = BRW_CONDITIONAL_Z;
emit(IF(BRW_PREDICATE_NORMAL));
{
/* If vertex_count is 0, then no control data bits have been
* accumulated yet, so we skip emitting them.
*/
emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u),
BRW_CONDITIONAL_NEQ));
emit(IF(BRW_PREDICATE_NORMAL));
emit_control_data_bits();
emit(BRW_OPCODE_ENDIF);
/* Reset control_data_bits to 0 so we can start accumulating a new
* batch.
*
* Note: in the case where vertex_count == 0, this neutralizes the
* effect of any call to EndPrimitive() that the shader may have
* made before outputting its first vertex.
*/
inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u)));
inst->force_writemask_all = true;
}
emit(BRW_OPCODE_ENDIF);
}
this->current_annotation = "emit vertex: vertex data";
emit_vertex();
/* In stream mode we have to set control data bits for all vertices
* unless we have disabled control data bits completely (which we do
* do for MESA_PRIM_POINTS outputs that don't use streams).
*/
if (c->control_data_header_size_bits > 0 &&
gs_prog_data->control_data_format ==
GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
this->current_annotation = "emit vertex: Stream control data bits";
set_stream_control_data_bits(stream_id);
}
this->current_annotation = NULL;
}
void
vec4_gs_visitor::gs_end_primitive()
{
/* We can only do EndPrimitive() functionality when the control data
* consists of cut bits. Fortunately, the only time it isn't is when the
* output type is points, in which case EndPrimitive() is a no-op.
*/
if (gs_prog_data->control_data_format !=
GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
return;
}
if (c->control_data_header_size_bits == 0)
return;
/* Cut bits use one bit per vertex. */
assert(c->control_data_bits_per_vertex == 1);
/* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
* vertex n, 0 otherwise. So all we need to do here is mark bit
* (vertex_count - 1) % 32 in the cut_bits register to indicate that
* EndPrimitive() was called after emitting vertex (vertex_count - 1);
* vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
*
* Note that if EndPrimitive() is called before emitting any vertices, this
* will cause us to set bit 31 of the control_data_bits register to 1.
* That's fine because:
*
* - If max_vertices < 32, then vertex number 31 (zero-based) will never be
* output, so the hardware will ignore cut bit 31.
*
* - If max_vertices == 32, then vertex number 31 is guaranteed to be the
* last vertex, so setting cut bit 31 has no effect (since the primitive
* is automatically ended when the GS terminates).
*
* - If max_vertices > 32, then the ir_emit_vertex visitor will reset the
* control_data_bits register to 0 when the first vertex is emitted.
*/
/* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
src_reg one(this, glsl_uint_type());
emit(MOV(dst_reg(one), brw_imm_ud(1u)));
src_reg prev_count(this, glsl_uint_type());
emit(ADD(dst_reg(prev_count), this->vertex_count, brw_imm_ud(0xffffffffu)));
src_reg mask(this, glsl_uint_type());
/* Note: we're relying on the fact that the GEN SHL instruction only pays
* attention to the lower 5 bits of its second source argument, so on this
* architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
* ((vertex_count - 1) % 32).
*/
emit(SHL(dst_reg(mask), one, prev_count));
emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
}
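/* A worked example (values invented): calling EndPrimitive() right after
* emitting the fifth vertex (vertex_count == 5) computes prev_count == 4
* and mask == 1 << 4, so cut bit 4 is set for that vertex.
*/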
} /* namespace brw */

View file

@ -1,75 +0,0 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* \file brw_vec4_gs_visitor.h
*
* Geometry-shader-specific code derived from the vec4_visitor class.
*/
#ifndef BRW_VEC4_GS_VISITOR_H
#define BRW_VEC4_GS_VISITOR_H
#include "brw_vec4.h"
#define MAX_GS_INPUT_VERTICES 6
#ifdef __cplusplus
namespace brw {
class vec4_gs_visitor : public vec4_visitor
{
public:
vec4_gs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
struct brw_gs_compile *c,
struct brw_gs_prog_data *prog_data,
const nir_shader *shader,
bool no_spills,
bool debug_enabled);
protected:
virtual void setup_payload();
virtual void emit_prolog();
virtual void emit_thread_end();
virtual void emit_urb_write_header(int mrf);
virtual vec4_instruction *emit_urb_write_opcode(bool complete);
virtual void gs_emit_vertex(int stream_id);
virtual void gs_end_primitive();
virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
protected:
int setup_varying_inputs(int payload_reg, int attributes_per_reg);
void emit_control_data_bits();
void set_stream_control_data_bits(unsigned stream_id);
src_reg vertex_count;
src_reg control_data_bits;
const struct brw_gs_compile * const c;
struct brw_gs_prog_data * const gs_prog_data;
};
} /* namespace brw */
#endif /* __cplusplus */
#endif /* BRW_VEC4_GS_VISITOR_H */

View file

@ -1,331 +0,0 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#include "brw_vec4.h"
#include "brw_vec4_live_variables.h"
using namespace brw;
#define MAX_INSTRUCTION (1 << 30)
/** @file brw_vec4_live_variables.cpp
*
* Support for computing at the basic block level which variables
* (virtual GRFs in our case) are live at entry and exit.
*
* See Muchnick's Advanced Compiler Design and Implementation, section
* 14.1 (p444).
*/
/**
* Sets up the use/def arrays and block-local approximation of the live ranges.
*
* The basic-block-level live variable analysis needs to know which
* variables get used before they're completely defined, and which
* variables are completely defined before they're used.
*
* We independently track each channel of a vec4. This is because we need to
* be able to recognize a sequence like:
*
* ...
* DP4 tmp.x a b;
* DP4 tmp.y c d;
* MUL result.xy tmp.xy e.xy
* ...
*
* as having tmp live only across that sequence (assuming it's used nowhere
* else), because it's a common pattern. A more conservative approach that
* doesn't get tmp marked as defined in this block will tend to result in
* spilling.
*/
void
vec4_live_variables::setup_def_use()
{
int ip = 0;
foreach_block (block, cfg) {
assert(ip == block->start_ip);
if (block->num > 0)
assert(cfg->blocks[block->num - 1]->end_ip == ip - 1);
foreach_inst_in_block(vec4_instruction, inst, block) {
struct block_data *bd = &block_data[block->num];
/* Set up the instruction uses. */
for (unsigned int i = 0; i < 3; i++) {
if (inst->src[i].file == VGRF) {
for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) {
for (int c = 0; c < 4; c++) {
const unsigned v = var_from_reg(alloc, inst->src[i], c, j);
start[v] = MIN2(start[v], ip);
end[v] = ip;
if (!BITSET_TEST(bd->def, v))
BITSET_SET(bd->use, v);
}
}
}
}
for (unsigned c = 0; c < 4; c++) {
if (inst->reads_flag(c) &&
!BITSET_TEST(bd->flag_def, c)) {
BITSET_SET(bd->flag_use, c);
}
}
/* Set up the instruction defs. */
if (inst->dst.file == VGRF) {
for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) {
for (int c = 0; c < 4; c++) {
if (inst->dst.writemask & (1 << c)) {
const unsigned v = var_from_reg(alloc, inst->dst, c, i);
start[v] = MIN2(start[v], ip);
end[v] = ip;
/* Check for unconditional register writes, these are the
* things that screen off preceding definitions of a
* variable, and thus qualify for being in def[].
*/
if ((!inst->predicate || inst->opcode == BRW_OPCODE_SEL) &&
!BITSET_TEST(bd->use, v))
BITSET_SET(bd->def, v);
}
}
}
}
if (inst->writes_flag(devinfo)) {
for (unsigned c = 0; c < 4; c++) {
if ((inst->dst.writemask & (1 << c)) &&
!BITSET_TEST(bd->flag_use, c)) {
BITSET_SET(bd->flag_def, c);
}
}
}
ip++;
}
}
}
/**
* The algorithm incrementally sets bits in liveout and livein,
* propagating it through control flow. It will eventually terminate
* because it only ever adds bits, and stops when no bits are added in
* a pass.
*/
void
vec4_live_variables::compute_live_variables()
{
bool cont = true;
while (cont) {
cont = false;
foreach_block_reverse (block, cfg) {
struct block_data *bd = &block_data[block->num];
/* Update liveout */
foreach_list_typed(bblock_link, child_link, link, &block->children) {
struct block_data *child_bd = &block_data[child_link->block->num];
for (int i = 0; i < bitset_words; i++) {
BITSET_WORD new_liveout = (child_bd->livein[i] &
~bd->liveout[i]);
if (new_liveout) {
bd->liveout[i] |= new_liveout;
cont = true;
}
}
BITSET_WORD new_liveout = (child_bd->flag_livein[0] &
~bd->flag_liveout[0]);
if (new_liveout) {
bd->flag_liveout[0] |= new_liveout;
cont = true;
}
}
/* Update livein */
for (int i = 0; i < bitset_words; i++) {
BITSET_WORD new_livein = (bd->use[i] |
(bd->liveout[i] &
~bd->def[i]));
if (new_livein & ~bd->livein[i]) {
bd->livein[i] |= new_livein;
cont = true;
}
}
BITSET_WORD new_livein = (bd->flag_use[0] |
(bd->flag_liveout[0] &
~bd->flag_def[0]));
if (new_livein & ~bd->flag_livein[0]) {
bd->flag_livein[0] |= new_livein;
cont = true;
}
}
}
}
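/* Written as dataflow equations, the fixed point computed above is, roughly:
 *
 *    liveout[b] = union of livein[s] over all successors s of b
 *    livein[b]  = use[b] | (liveout[b] & ~def[b])
 *
 * with the same pair of equations applied independently to the four flag
 * channels.
 */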
/**
* Extend the start/end ranges for each variable to account for the
* new information calculated from control flow.
*/
void
vec4_live_variables::compute_start_end()
{
foreach_block (block, cfg) {
const struct block_data &bd = block_data[block->num];
for (int i = 0; i < num_vars; i++) {
if (BITSET_TEST(bd.livein, i)) {
start[i] = MIN2(start[i], block->start_ip);
end[i] = MAX2(end[i], block->start_ip);
}
if (BITSET_TEST(bd.liveout, i)) {
start[i] = MIN2(start[i], block->end_ip);
end[i] = MAX2(end[i], block->end_ip);
}
}
}
}
vec4_live_variables::vec4_live_variables(const backend_shader *s)
: alloc(s->alloc), cfg(s->cfg)
{
mem_ctx = ralloc_context(NULL);
num_vars = alloc.total_size * 8;
start = ralloc_array(mem_ctx, int, num_vars);
end = ralloc_array(mem_ctx, int, num_vars);
for (int i = 0; i < num_vars; i++) {
start[i] = MAX_INSTRUCTION;
end[i] = -1;
}
devinfo = s->compiler->devinfo;
block_data = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);
bitset_words = BITSET_WORDS(num_vars);
for (int i = 0; i < cfg->num_blocks; i++) {
block_data[i].def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
block_data[i].use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
block_data[i].livein = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
block_data[i].liveout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
block_data[i].flag_def[0] = 0;
block_data[i].flag_use[0] = 0;
block_data[i].flag_livein[0] = 0;
block_data[i].flag_liveout[0] = 0;
}
setup_def_use();
compute_live_variables();
compute_start_end();
}
vec4_live_variables::~vec4_live_variables()
{
ralloc_free(mem_ctx);
}
static bool
check_register_live_range(const vec4_live_variables *live, int ip,
unsigned var, unsigned n)
{
for (unsigned j = 0; j < n; j += 4) {
if (var + j >= unsigned(live->num_vars) ||
live->start[var + j] > ip || live->end[var + j] < ip)
return false;
}
return true;
}
bool
vec4_live_variables::validate(const backend_shader *s) const
{
unsigned ip = 0;
foreach_block_and_inst(block, vec4_instruction, inst, s->cfg) {
for (unsigned c = 0; c < 4; c++) {
if (inst->dst.writemask & (1 << c)) {
for (unsigned i = 0; i < 3; i++) {
if (inst->src[i].file == VGRF &&
!check_register_live_range(this, ip,
var_from_reg(alloc, inst->src[i], c),
regs_read(inst, i)))
return false;
}
if (inst->dst.file == VGRF &&
!check_register_live_range(this, ip,
var_from_reg(alloc, inst->dst, c),
regs_written(inst)))
return false;
}
}
ip++;
}
return true;
}
int
vec4_live_variables::var_range_start(unsigned v, unsigned n) const
{
int ip = INT_MAX;
for (unsigned i = 0; i < n; i++)
ip = MIN2(ip, start[v + i]);
return ip;
}
int
vec4_live_variables::var_range_end(unsigned v, unsigned n) const
{
int ip = INT_MIN;
for (unsigned i = 0; i < n; i++)
ip = MAX2(ip, end[v + i]);
return ip;
}
bool
vec4_live_variables::vgrfs_interfere(int a, int b) const
{
return !((var_range_end(8 * alloc.offsets[a], 8 * alloc.sizes[a]) <=
var_range_start(8 * alloc.offsets[b], 8 * alloc.sizes[b])) ||
(var_range_end(8 * alloc.offsets[b], 8 * alloc.sizes[b]) <=
var_range_start(8 * alloc.offsets[a], 8 * alloc.sizes[a])));
}
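/* In other words, two VGRFs interfere unless the entire live range of one
 * (taken over all 8 * size per-channel variables) ends at or before the
 * point where the other's begins.
 */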

View file

@ -1,143 +0,0 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* Authors:
* Eric Anholt <eric@anholt.net>
*
*/
#ifndef BRW_VEC4_LIVE_VARIABLES_H
#define BRW_VEC4_LIVE_VARIABLES_H
#include "brw_ir_vec4.h"
#include "brw_ir_analysis.h"
#include "util/bitset.h"
struct backend_shader;
namespace brw {
class vec4_live_variables {
public:
struct block_data {
/**
* Which variables are defined before being used in the block.
*
* Note that for our purposes, "defined" means unconditionally, completely
* defined.
*/
BITSET_WORD *def;
/**
* Which variables are used before being defined in the block.
*/
BITSET_WORD *use;
/** Which defs reach the entry point of the block. */
BITSET_WORD *livein;
/** Which defs reach the exit point of the block. */
BITSET_WORD *liveout;
BITSET_WORD flag_def[1];
BITSET_WORD flag_use[1];
BITSET_WORD flag_livein[1];
BITSET_WORD flag_liveout[1];
};
vec4_live_variables(const backend_shader *s);
~vec4_live_variables();
bool
validate(const backend_shader *s) const;
analysis_dependency_class
dependency_class() const
{
return (DEPENDENCY_INSTRUCTION_IDENTITY |
DEPENDENCY_INSTRUCTION_DATA_FLOW |
DEPENDENCY_VARIABLES);
}
int num_vars;
int bitset_words;
const struct intel_device_info *devinfo;
/** Per-basic-block information on live variables */
struct block_data *block_data;
/** @{
* Final computed live ranges for each variable.
*/
int *start;
int *end;
/** @} */
int var_range_start(unsigned v, unsigned n) const;
int var_range_end(unsigned v, unsigned n) const;
bool vgrfs_interfere(int a, int b) const;
protected:
void setup_def_use();
void compute_live_variables();
void compute_start_end();
const simple_allocator &alloc;
cfg_t *cfg;
void *mem_ctx;
};
/* Returns the variable index for the k-th dword of the c-th component of
* register reg.
*/
inline unsigned
var_from_reg(const simple_allocator &alloc, const src_reg &reg,
unsigned c = 0, unsigned k = 0)
{
assert(reg.file == VGRF && reg.nr < alloc.count && c < 4);
const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4);
unsigned result =
8 * alloc.offsets[reg.nr] + reg.offset / 4 +
(BRW_GET_SWZ(reg.swizzle, c) + k / csize * 4) * csize + k % csize;
/* Do not exceed the limit for this register */
assert(result < 8 * (alloc.offsets[reg.nr] + alloc.sizes[reg.nr]));
return result;
}
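/* Worked example for the src_reg variant above (float VGRF, so csize == 1):
 * for a register allocated at alloc.offsets[nr] == 3 with reg.offset == 0,
 * reading component c == 0 through a .yyyy swizzle gives
 * BRW_GET_SWZ(...) == 1 and therefore variable index 8 * 3 + 1 == 25.
 */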
inline unsigned
var_from_reg(const simple_allocator &alloc, const dst_reg &reg,
unsigned c = 0, unsigned k = 0)
{
assert(reg.file == VGRF && reg.nr < alloc.count && c < 4);
const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4);
unsigned result =
8 * alloc.offsets[reg.nr] + reg.offset / 4 +
(c + k / csize * 4) * csize + k % csize;
/* Do not exceed the limit for this register */
assert(result < 8 * (alloc.offsets[reg.nr] + alloc.sizes[reg.nr]));
return result;
}
} /* namespace brw */
#endif /* BRW_VEC4_LIVE_VARIABLES_H */

File diff suppressed because it is too large

View file

@ -1,512 +0,0 @@
/*
* Copyright © 2011 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "util/register_allocate.h"
#include "brw_vec4.h"
#include "brw_cfg.h"
using namespace brw;
#define REG_CLASS_COUNT 20
namespace brw {
static void
assign(unsigned int *reg_hw_locations, backend_reg *reg)
{
if (reg->file == VGRF) {
reg->nr = reg_hw_locations[reg->nr] + reg->offset / REG_SIZE;
reg->offset %= REG_SIZE;
}
}
bool
vec4_visitor::reg_allocate_trivial()
{
unsigned int hw_reg_mapping[this->alloc.count];
bool virtual_grf_used[this->alloc.count];
int next;
/* Calculate which virtual GRFs are actually in use after whatever
* optimization passes have occurred.
*/
for (unsigned i = 0; i < this->alloc.count; i++) {
virtual_grf_used[i] = false;
}
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
if (inst->dst.file == VGRF)
virtual_grf_used[inst->dst.nr] = true;
for (unsigned i = 0; i < 3; i++) {
if (inst->src[i].file == VGRF)
virtual_grf_used[inst->src[i].nr] = true;
}
}
hw_reg_mapping[0] = this->first_non_payload_grf;
next = hw_reg_mapping[0] + this->alloc.sizes[0];
for (unsigned i = 1; i < this->alloc.count; i++) {
if (virtual_grf_used[i]) {
hw_reg_mapping[i] = next;
next += this->alloc.sizes[i];
}
}
prog_data->total_grf = next;
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
assign(hw_reg_mapping, &inst->dst);
assign(hw_reg_mapping, &inst->src[0]);
assign(hw_reg_mapping, &inst->src[1]);
assign(hw_reg_mapping, &inst->src[2]);
}
if (prog_data->total_grf > max_grf) {
fail("Ran out of regs on trivial allocator (%d/%d)\n",
prog_data->total_grf, max_grf);
return false;
}
return true;
}
extern "C" void
brw_vec4_alloc_reg_set(struct brw_compiler *compiler)
{
int base_reg_count =
compiler->devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF;
assert(compiler->devinfo->ver < 8);
/* After running split_virtual_grfs(), almost all VGRFs will be of size 1.
* SEND-from-GRF sources cannot be split, so we also need classes for each
* potential message length.
*/
assert(REG_CLASS_COUNT == MAX_VGRF_SIZE(compiler->devinfo));
int class_sizes[REG_CLASS_COUNT];
for (int i = 0; i < REG_CLASS_COUNT; i++)
class_sizes[i] = i + 1;
ralloc_free(compiler->vec4_reg_set.regs);
compiler->vec4_reg_set.regs = ra_alloc_reg_set(compiler, base_reg_count, false);
if (compiler->devinfo->ver >= 6)
ra_set_allocate_round_robin(compiler->vec4_reg_set.regs);
ralloc_free(compiler->vec4_reg_set.classes);
compiler->vec4_reg_set.classes = ralloc_array(compiler, struct ra_class *, REG_CLASS_COUNT);
/* Now, add the registers to their classes, and add the conflicts
* between them and the base GRF registers (and also each other).
*/
for (int i = 0; i < REG_CLASS_COUNT; i++) {
int class_reg_count = base_reg_count - (class_sizes[i] - 1);
compiler->vec4_reg_set.classes[i] =
ra_alloc_contig_reg_class(compiler->vec4_reg_set.regs, class_sizes[i]);
for (int j = 0; j < class_reg_count; j++)
ra_class_add_reg(compiler->vec4_reg_set.classes[i], j);
}
ra_set_finalize(compiler->vec4_reg_set.regs, NULL);
}
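/* As a concrete example of the classes built above: classes[3] covers
 * contiguous blocks of four GRFs, i.e. base registers j in
 * [0, base_reg_count - 4], and reg_allocate() below assigns any VGRF of
 * size 4 to that class.
 */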
void
vec4_visitor::setup_payload_interference(struct ra_graph *g,
int first_payload_node,
int reg_node_count)
{
int payload_node_count = this->first_non_payload_grf;
for (int i = 0; i < payload_node_count; i++) {
/* Mark each payload reg node as being allocated to its physical register.
*
* The alternative would be to have per-physical register classes, which
* would just be silly.
*/
ra_set_node_reg(g, first_payload_node + i, i);
/* For now, just mark each payload node as interfering with every other
* node to be allocated.
*/
for (int j = 0; j < reg_node_count; j++) {
ra_add_node_interference(g, first_payload_node + i, j);
}
}
}
bool
vec4_visitor::reg_allocate()
{
unsigned int hw_reg_mapping[alloc.count];
int payload_reg_count = this->first_non_payload_grf;
/* Using the trivial allocator can be useful in debugging undefined
* register access as a result of broken optimization passes.
*/
if (0)
return reg_allocate_trivial();
assert(devinfo->ver < 8);
const vec4_live_variables &live = live_analysis.require();
int node_count = alloc.count;
int first_payload_node = node_count;
node_count += payload_reg_count;
struct ra_graph *g =
ra_alloc_interference_graph(compiler->vec4_reg_set.regs, node_count);
for (unsigned i = 0; i < alloc.count; i++) {
int size = this->alloc.sizes[i];
assert(size >= 1 && size <= MAX_VGRF_SIZE(devinfo));
ra_set_node_class(g, i, compiler->vec4_reg_set.classes[size - 1]);
for (unsigned j = 0; j < i; j++) {
if (live.vgrfs_interfere(i, j)) {
ra_add_node_interference(g, i, j);
}
}
}
/* Certain instructions can't safely use the same register for their
* sources and destination. Add interference.
*/
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
for (unsigned i = 0; i < 3; i++) {
if (inst->src[i].file == VGRF) {
ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
}
}
}
}
setup_payload_interference(g, first_payload_node, node_count);
if (!ra_allocate(g)) {
/* Failed to allocate registers. Spill a reg, and the caller will
* loop back into here to try again.
*/
int reg = choose_spill_reg(g);
if (this->no_spills) {
fail("Failure to register allocate. Reduce number of live "
"values to avoid this.");
} else if (reg == -1) {
fail("no register to spill\n");
} else {
spill_reg(reg);
}
ralloc_free(g);
return false;
}
/* Get the chosen virtual registers for each node, and map virtual
* regs in the register classes back down to real hardware reg
* numbers.
*/
prog_data->total_grf = payload_reg_count;
for (unsigned i = 0; i < alloc.count; i++) {
hw_reg_mapping[i] = ra_get_node_reg(g, i);
prog_data->total_grf = MAX2(prog_data->total_grf,
hw_reg_mapping[i] + alloc.sizes[i]);
}
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
assign(hw_reg_mapping, &inst->dst);
assign(hw_reg_mapping, &inst->src[0]);
assign(hw_reg_mapping, &inst->src[1]);
assign(hw_reg_mapping, &inst->src[2]);
}
ralloc_free(g);
return true;
}
/**
* When we decide to spill a register, instead of blindly spilling every use,
* save unspills when the spill register is used (read) in consecutive
* instructions. This can potentially save a bunch of unspills that would
* have very little impact in register allocation anyway.
*
* Notice that we need to account for this behavior when spilling a register
* and when evaluating spilling costs. This function is designed so it can
* be called from both places and avoid repeating the logic.
*
* - When we call this function from spill_reg(), we pass in scratch_reg the
* actual unspill/spill register that we want to reuse in the current
* instruction.
*
* - When we call this from evaluate_spill_costs(), we pass the register for
* which we are evaluating spilling costs.
*
* In either case, we check if the previous instructions read scratch_reg until
* we find one that writes to it with a compatible mask or does not read/write
* scratch_reg at all.
*/
static bool
can_use_scratch_for_source(const vec4_instruction *inst, unsigned i,
unsigned scratch_reg)
{
assert(inst->src[i].file == VGRF);
bool prev_inst_read_scratch_reg = false;
/* See if any previous source in the same instruction reads scratch_reg */
for (unsigned n = 0; n < i; n++) {
if (inst->src[n].file == VGRF && inst->src[n].nr == scratch_reg)
prev_inst_read_scratch_reg = true;
}
/* Now check if previous instructions read/write scratch_reg */
for (vec4_instruction *prev_inst = (vec4_instruction *) inst->prev;
!prev_inst->is_head_sentinel();
prev_inst = (vec4_instruction *) prev_inst->prev) {
/* If the previous instruction writes to scratch_reg then we can reuse
* it if the write is not conditional and the channels we write are
* compatible with our read mask
*/
if (prev_inst->dst.file == VGRF && prev_inst->dst.nr == scratch_reg) {
return (!prev_inst->predicate || prev_inst->opcode == BRW_OPCODE_SEL) &&
(brw_mask_for_swizzle(inst->src[i].swizzle) &
~prev_inst->dst.writemask) == 0;
}
/* Skip scratch read/writes so that instructions generated by spilling
* other registers (that won't read/write scratch_reg) do not stop us from
* reusing scratch_reg for this instruction.
*/
if (prev_inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_WRITE ||
prev_inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_READ)
continue;
/* If the previous instruction does not write to scratch_reg, then check
* if it reads it
*/
int n;
for (n = 0; n < 3; n++) {
if (prev_inst->src[n].file == VGRF &&
prev_inst->src[n].nr == scratch_reg) {
prev_inst_read_scratch_reg = true;
break;
}
}
if (n == 3) {
/* The previous instruction does not read scratch_reg. At this point,
* if no previous instruction has read scratch_reg it means that we
* will need to unspill it here and we can't reuse it (so we return
* false). Otherwise, if we found at least one consecutive instruction
* that read scratch_reg, then we know that we got here from
* evaluate_spill_costs (since for the spill_reg path any block of
* consecutive instructions using scratch_reg must start with a write
* to that register, so we would've exited the loop in the check for
* the write that we have at the start of this loop), and in that case
* it means that we found the point at which the scratch_reg would be
* unspilled. Since we always unspill a full vec4, it means that we
* have all the channels available and we can just return true to
* signal that we can reuse the register in the current instruction
* too.
*/
return prev_inst_read_scratch_reg;
}
}
return prev_inst_read_scratch_reg;
}
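/* For example, if the closest preceding non-scratch instruction is an
 * unconditional MOV that writes scratch_reg.xyzw, a later read of
 * scratch_reg.yz can reuse the cached value and the unspill is skipped;
 * a predicated (non-SEL) write forces a fresh unspill instead.
 */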
static inline float
spill_cost_for_type(enum brw_reg_type type)
{
/* Spilling of a 64-bit register involves emitting 2 32-bit scratch
* messages plus the 64b/32b shuffling code.
*/
return type_sz(type) == 8 ? 2.25f : 1.0f;
}
void
vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
{
float loop_scale = 1.0;
unsigned *reg_type_size = (unsigned *)
ralloc_size(NULL, this->alloc.count * sizeof(unsigned));
for (unsigned i = 0; i < this->alloc.count; i++) {
spill_costs[i] = 0.0;
no_spill[i] = alloc.sizes[i] != 1 && alloc.sizes[i] != 2;
reg_type_size[i] = 0;
}
/* Calculate costs for spilling nodes. Call it a cost of 1 per
* spill/unspill we'll have to do, and guess that the insides of
* loops run 10 times.
*/
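/* With that guess, a read of a register inside a doubly nested loop
 * contributes roughly 100x the base cost, since loop_scale is multiplied by
 * 10 at each DO and divided back down at the matching WHILE (see the switch
 * at the end of this loop).
 */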
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
for (unsigned int i = 0; i < 3; i++) {
if (inst->src[i].file == VGRF && !no_spill[inst->src[i].nr]) {
/* We will only unspill src[i] if it wasn't unspilled for the
* previous instruction, in which case we'll just reuse the scratch
* reg for this instruction.
*/
if (!can_use_scratch_for_source(inst, i, inst->src[i].nr)) {
spill_costs[inst->src[i].nr] +=
loop_scale * spill_cost_for_type(inst->src[i].type);
if (inst->src[i].reladdr ||
inst->src[i].offset >= REG_SIZE)
no_spill[inst->src[i].nr] = true;
/* We don't support unspills of partial DF reads.
*
* Our 64-bit unspills are implemented with two 32-bit scratch
* messages, each one reading data for both SIMD4x2 threads, which we
* then need to shuffle into correct 64-bit data. Ensure that we
* are reading data for both threads.
*/
if (type_sz(inst->src[i].type) == 8 && inst->exec_size != 8)
no_spill[inst->src[i].nr] = true;
}
/* We can't spill registers that mix 32-bit and 64-bit access (that
* contain 64-bit data that is operated on via 32-bit instructions)
*/
unsigned type_size = type_sz(inst->src[i].type);
if (reg_type_size[inst->src[i].nr] == 0)
reg_type_size[inst->src[i].nr] = type_size;
else if (reg_type_size[inst->src[i].nr] != type_size)
no_spill[inst->src[i].nr] = true;
}
}
if (inst->dst.file == VGRF && !no_spill[inst->dst.nr]) {
spill_costs[inst->dst.nr] +=
loop_scale * spill_cost_for_type(inst->dst.type);
if (inst->dst.reladdr || inst->dst.offset >= REG_SIZE)
no_spill[inst->dst.nr] = true;
/* We don't support spills of partial DF writes.
*
* Our 64-bit spills are implemented with two 32-bit scratch messages,
* each one writing data for both SIMD4x2 threads. Ensure that we
* are writing data for both threads.
*/
if (type_sz(inst->dst.type) == 8 && inst->exec_size != 8)
no_spill[inst->dst.nr] = true;
/* We can't spill registers that mix 32-bit and 64-bit access (that
* contain 64-bit data that is operated on via 32-bit instructions)
*/
unsigned type_size = type_sz(inst->dst.type);
if (reg_type_size[inst->dst.nr] == 0)
reg_type_size[inst->dst.nr] = type_size;
else if (reg_type_size[inst->dst.nr] != type_size)
no_spill[inst->dst.nr] = true;
}
switch (inst->opcode) {
case BRW_OPCODE_DO:
loop_scale *= 10;
break;
case BRW_OPCODE_WHILE:
loop_scale /= 10;
break;
case SHADER_OPCODE_GFX4_SCRATCH_READ:
case SHADER_OPCODE_GFX4_SCRATCH_WRITE:
case VEC4_OPCODE_MOV_FOR_SCRATCH:
for (int i = 0; i < 3; i++) {
if (inst->src[i].file == VGRF)
no_spill[inst->src[i].nr] = true;
}
if (inst->dst.file == VGRF)
no_spill[inst->dst.nr] = true;
break;
default:
break;
}
}
ralloc_free(reg_type_size);
}
int
vec4_visitor::choose_spill_reg(struct ra_graph *g)
{
float spill_costs[this->alloc.count];
bool no_spill[this->alloc.count];
evaluate_spill_costs(spill_costs, no_spill);
for (unsigned i = 0; i < this->alloc.count; i++) {
if (!no_spill[i])
ra_set_node_spill_cost(g, i, spill_costs[i]);
}
return ra_get_best_spill_node(g);
}
void
vec4_visitor::spill_reg(unsigned spill_reg_nr)
{
assert(alloc.sizes[spill_reg_nr] == 1 || alloc.sizes[spill_reg_nr] == 2);
unsigned spill_offset = last_scratch;
last_scratch += alloc.sizes[spill_reg_nr];
/* Generate spill/unspill instructions for the objects being spilled. */
unsigned scratch_reg = ~0u;
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
for (unsigned i = 0; i < 3; i++) {
if (inst->src[i].file == VGRF && inst->src[i].nr == spill_reg_nr) {
if (scratch_reg == ~0u ||
!can_use_scratch_for_source(inst, i, scratch_reg)) {
/* We need to unspill anyway so make sure we read the full vec4
* in any case. This way, the cached register can be reused
* for consecutive instructions that read different channels of
* the same vec4.
*/
scratch_reg = alloc.allocate(alloc.sizes[spill_reg_nr]);
src_reg temp = inst->src[i];
temp.nr = scratch_reg;
temp.offset = 0;
temp.swizzle = BRW_SWIZZLE_XYZW;
emit_scratch_read(block, inst,
dst_reg(temp), inst->src[i], spill_offset);
temp.offset = inst->src[i].offset;
}
assert(scratch_reg != ~0u);
inst->src[i].nr = scratch_reg;
}
}
if (inst->dst.file == VGRF && inst->dst.nr == spill_reg_nr) {
emit_scratch_write(block, inst, spill_offset);
scratch_reg = inst->dst.nr;
}
}
invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES);
}
} /* namespace brw */

View file

@ -1,213 +0,0 @@
/*
* Copyright © 2013-2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include "brw_vec4_surface_builder.h"
using namespace brw;
namespace {
namespace array_utils {
/**
* Copy one out of every \p src_stride logical components of the argument
* into one out of every \p dst_stride logical components of the result.
*/
static src_reg
emit_stride(const vec4_builder &bld, const src_reg &src, unsigned size,
unsigned dst_stride, unsigned src_stride)
{
if (src_stride == 1 && dst_stride == 1) {
return src;
} else {
const dst_reg dst = bld.vgrf(src.type,
DIV_ROUND_UP(size * dst_stride, 4));
for (unsigned i = 0; i < size; ++i)
bld.MOV(writemask(offset(dst, 8, i * dst_stride / 4),
1 << (i * dst_stride % 4)),
swizzle(offset(src, 8, i * src_stride / 4),
brw_swizzle_for_mask(1 << (i * src_stride % 4))));
return src_reg(dst);
}
}
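/* For example, emit_insert() below calls this with dst_stride == 4 and
 * src_stride == 1 when SIMD4x2 is not available, scattering each logical
 * component of the vec4 into the .x channel of a consecutive register so
 * the payload ends up in SIMD8 layout.
 */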
/**
* Convert a VEC4 into an array of registers with the layout expected by
* the recipient shared unit. If \p has_simd4x2 is true the argument is
* left unmodified in SIMD4x2 form, otherwise it will be rearranged into
* a SIMD8 vector.
*/
static src_reg
emit_insert(const vec4_builder &bld, const src_reg &src,
unsigned n, bool has_simd4x2)
{
if (src.file == BAD_FILE || n == 0) {
return src_reg();
} else {
/* Pad unused components with zeroes. */
const unsigned mask = (1 << n) - 1;
const dst_reg tmp = bld.vgrf(src.type);
bld.MOV(writemask(tmp, mask), src);
if (n < 4)
bld.MOV(writemask(tmp, ~mask), brw_imm_d(0));
return emit_stride(bld, src_reg(tmp), n, has_simd4x2 ? 1 : 4, 1);
}
}
}
}
namespace brw {
namespace surface_access {
namespace {
using namespace array_utils;
/**
* Generate a send opcode for a surface message and return the
* result.
*/
src_reg
emit_send(const vec4_builder &bld, enum opcode op,
const src_reg &header,
const src_reg &addr, unsigned addr_sz,
const src_reg &src, unsigned src_sz,
const src_reg &surface,
unsigned arg, unsigned ret_sz,
brw_predicate pred = BRW_PREDICATE_NONE)
{
/* Calculate the total number of components of the payload. */
const unsigned header_sz = (header.file == BAD_FILE ? 0 : 1);
const unsigned sz = header_sz + addr_sz + src_sz;
/* Construct the payload. */
const dst_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz);
unsigned n = 0;
if (header_sz)
bld.exec_all().MOV(offset(payload, 8, n++),
retype(header, BRW_REGISTER_TYPE_UD));
for (unsigned i = 0; i < addr_sz; i++)
bld.MOV(offset(payload, 8, n++),
offset(retype(addr, BRW_REGISTER_TYPE_UD), 8, i));
for (unsigned i = 0; i < src_sz; i++)
bld.MOV(offset(payload, 8, n++),
offset(retype(src, BRW_REGISTER_TYPE_UD), 8, i));
/* Reduce the dynamically uniform surface index to a single
* scalar.
*/
const src_reg usurface = bld.emit_uniformize(surface);
/* Emit the message send instruction. */
const dst_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, ret_sz);
vec4_instruction *inst =
bld.emit(op, dst, src_reg(payload), usurface, brw_imm_ud(arg));
inst->mlen = sz;
inst->size_written = ret_sz * REG_SIZE;
inst->header_size = header_sz;
inst->predicate = pred;
return src_reg(dst);
}
}
/**
* Emit an untyped surface read opcode. \p dims determines the number
* of components of the address and \p size the number of components of
* the returned value.
*/
src_reg
emit_untyped_read(const vec4_builder &bld,
const src_reg &surface, const src_reg &addr,
unsigned dims, unsigned size,
brw_predicate pred)
{
return emit_send(bld, VEC4_OPCODE_UNTYPED_SURFACE_READ, src_reg(),
emit_insert(bld, addr, dims, true), 1,
src_reg(), 0,
surface, size, 1, pred);
}
/**
* Emit an untyped surface write opcode. \p dims determines the number
* of components of the address and \p size the number of components of
* the argument.
*/
void
emit_untyped_write(const vec4_builder &bld, const src_reg &surface,
const src_reg &addr, const src_reg &src,
unsigned dims, unsigned size,
brw_predicate pred)
{
const bool has_simd4x2 = bld.shader->devinfo->verx10 == 75;
emit_send(bld, VEC4_OPCODE_UNTYPED_SURFACE_WRITE, src_reg(),
emit_insert(bld, addr, dims, has_simd4x2),
has_simd4x2 ? 1 : dims,
emit_insert(bld, src, size, has_simd4x2),
has_simd4x2 ? 1 : size,
surface, size, 0, pred);
}
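/* has_simd4x2 is only true on Haswell (verx10 == 75); elsewhere the address
 * and data payloads are first rearranged into SIMD8 form by emit_insert().
 */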
/**
* Emit an untyped surface atomic opcode. \p dims determines the number
* of components of the address and \p rsize the number of components of
* the returned value (either zero or one).
*/
src_reg
emit_untyped_atomic(const vec4_builder &bld,
const src_reg &surface, const src_reg &addr,
const src_reg &src0, const src_reg &src1,
unsigned dims, unsigned rsize, unsigned op,
brw_predicate pred)
{
const bool has_simd4x2 = bld.shader->devinfo->verx10 == 75;
/* Zip the components of both sources, they are represented as the X
* and Y components of the same vector.
*/
const unsigned size = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
const dst_reg srcs = bld.vgrf(BRW_REGISTER_TYPE_UD);
if (size >= 1) {
bld.MOV(writemask(srcs, WRITEMASK_X),
swizzle(src0, BRW_SWIZZLE_XXXX));
}
if (size >= 2) {
bld.MOV(writemask(srcs, WRITEMASK_Y),
swizzle(src1, BRW_SWIZZLE_XXXX));
}
return emit_send(bld, VEC4_OPCODE_UNTYPED_ATOMIC, src_reg(),
emit_insert(bld, addr, dims, has_simd4x2),
has_simd4x2 ? 1 : dims,
emit_insert(bld, src_reg(srcs), size, has_simd4x2),
has_simd4x2 && size ? 1 : size,
surface, op, rsize, pred);
}
}
}

View file

@ -1,53 +0,0 @@
/* -*- c++ -*- */
/*
* Copyright © 2013-2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_VEC4_SURFACE_BUILDER_H
#define BRW_VEC4_SURFACE_BUILDER_H
#include "brw_vec4_builder.h"
namespace brw {
namespace surface_access {
src_reg
emit_untyped_read(const vec4_builder &bld,
const src_reg &surface, const src_reg &addr,
unsigned dims, unsigned size,
brw_predicate pred = BRW_PREDICATE_NONE);
void
emit_untyped_write(const vec4_builder &bld, const src_reg &surface,
const src_reg &addr, const src_reg &src,
unsigned dims, unsigned size,
brw_predicate pred = BRW_PREDICATE_NONE);
src_reg
emit_untyped_atomic(const vec4_builder &bld,
const src_reg &surface, const src_reg &addr,
const src_reg &src0, const src_reg &src1,
unsigned dims, unsigned rsize, unsigned op,
brw_predicate pred = BRW_PREDICATE_NONE);
}
}
#endif

View file

@ -1,320 +0,0 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* \file brw_vec4_tcs.cpp
*
* Tessellation control shader specific code derived from the vec4_visitor class.
*/
#include "intel_nir.h"
#include "brw_vec4_tcs.h"
namespace brw {
vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const struct brw_tcs_prog_key *key,
struct brw_tcs_prog_data *prog_data,
const nir_shader *nir,
bool debug_enabled)
: vec4_visitor(compiler, params, &key->base.tex, &prog_data->base,
nir, false, debug_enabled),
key(key)
{
}
void
vec4_tcs_visitor::setup_payload()
{
int reg = 0;
/* The payload always contains important data in r0, which contains
* the URB handles that are passed on to the URB write at the end
* of the thread.
*/
reg++;
/* r1.0 - r4.7 may contain the input control point URB handles,
* which we use to pull vertex data.
*/
reg += 4;
/* Push constants may start at r5.0 */
reg = setup_uniforms(reg);
this->first_non_payload_grf = reg;
}
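/* Summarizing the layout set up above: r0 holds the URB handles, r1.0-r4.7
 * may hold the input control point handles, push constants may start at
 * r5.0, and first_non_payload_grf points just past whatever setup_uniforms()
 * consumed.
 */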
void
vec4_tcs_visitor::emit_prolog()
{
invocation_id = src_reg(this, glsl_uint_type());
emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));
/* HS threads are dispatched with the dispatch mask set to 0xFF.
* If there are an odd number of output vertices, then the final
* HS instance dispatched will only have its bottom half doing real
* work, and so we need to disable the upper half:
*/
if (nir->info.tess.tcs_vertices_out % 2) {
emit(CMP(dst_null_d(), invocation_id,
brw_imm_ud(nir->info.tess.tcs_vertices_out),
BRW_CONDITIONAL_L));
/* Matching ENDIF is in emit_thread_end() */
emit(IF(BRW_PREDICATE_NORMAL));
}
}
void
vec4_tcs_visitor::emit_thread_end()
{
vec4_instruction *inst;
current_annotation = "thread end";
if (nir->info.tess.tcs_vertices_out % 2) {
emit(BRW_OPCODE_ENDIF);
}
if (devinfo->ver == 7) {
struct brw_tcs_prog_data *tcs_prog_data =
(struct brw_tcs_prog_data *) prog_data;
current_annotation = "release input vertices";
/* Synchronize all threads, so we know that no one is still
* using the input URB handles.
*/
if (tcs_prog_data->instances > 1) {
dst_reg header = dst_reg(this, glsl_uvec4_type());
emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
}
/* Make thread 0 (invocations <1, 0>) release pairs of ICP handles.
* We want to compare the bottom half of invocation_id with 0, but
* use that truth value for the top half as well. Unfortunately,
* we don't have stride in the vec4 world, nor UV immediates in
* align16, so we need an opcode to get invocation_id<0,4,0>.
*/
set_condmod(BRW_CONDITIONAL_Z,
emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
invocation_id));
emit(IF(BRW_PREDICATE_NORMAL));
for (unsigned i = 0; i < key->input_vertices; i += 2) {
/* If we have an odd number of input vertices, the last will be
* unpaired. We don't want to use an interleaved URB write in
* that case.
*/
const bool is_unpaired = i == key->input_vertices - 1;
dst_reg header(this, glsl_uvec4_type());
emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i),
brw_imm_ud(is_unpaired));
}
emit(BRW_OPCODE_ENDIF);
}
inst = emit(TCS_OPCODE_THREAD_END);
inst->base_mrf = 14;
inst->mlen = 2;
}
void
vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
const src_reg &vertex_index,
unsigned base_offset,
unsigned first_component,
const src_reg &indirect_offset)
{
vec4_instruction *inst;
dst_reg temp(this, glsl_ivec4_type());
temp.type = dst.type;
/* Set up the message header to reference the proper parts of the URB */
dst_reg header = dst_reg(this, glsl_uvec4_type());
inst = emit(VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index,
indirect_offset);
inst->force_writemask_all = true;
/* Read into a temporary, ignoring writemasking. */
inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
inst->offset = base_offset;
inst->mlen = 1;
inst->base_mrf = -1;
/* Copy the temporary to the destination to deal with writemasking.
*
* Also attempt to deal with gl_PointSize being in the .w component.
*/
if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW)));
} else {
src_reg src = src_reg(temp);
src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
emit(MOV(dst, src));
}
}
void
vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
unsigned base_offset,
unsigned first_component,
const src_reg &indirect_offset)
{
vec4_instruction *inst;
/* Set up the message header to reference the proper parts of the URB */
dst_reg header = dst_reg(this, glsl_uvec4_type());
inst = emit(VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header,
brw_imm_ud(dst.writemask << first_component), indirect_offset);
inst->force_writemask_all = true;
vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header));
read->offset = base_offset;
read->mlen = 1;
read->base_mrf = -1;
if (first_component) {
/* Read into a temporary and copy with a swizzle and writemask. */
read->dst = retype(dst_reg(this, glsl_ivec4_type()), dst.type);
emit(MOV(dst, swizzle(src_reg(read->dst),
BRW_SWZ_COMP_INPUT(first_component))));
}
}
void
vec4_tcs_visitor::emit_urb_write(const src_reg &value,
unsigned writemask,
unsigned base_offset,
const src_reg &indirect_offset)
{
if (writemask == 0)
return;
src_reg message(this, glsl_uvec4_type(), 2);
vec4_instruction *inst;
inst = emit(VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message),
brw_imm_ud(writemask), indirect_offset);
inst->force_writemask_all = true;
inst = emit(MOV(byte_offset(dst_reg(retype(message, value.type)), REG_SIZE),
value));
inst->force_writemask_all = true;
inst = emit(VEC4_TCS_OPCODE_URB_WRITE, dst_null_f(), message);
inst->offset = base_offset;
inst->mlen = 2;
inst->base_mrf = -1;
}
void
vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
switch (instr->intrinsic) {
case nir_intrinsic_load_invocation_id:
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_UD),
invocation_id));
break;
case nir_intrinsic_load_primitive_id:
emit(TCS_OPCODE_GET_PRIMITIVE_ID,
get_nir_def(instr->def, BRW_REGISTER_TYPE_UD));
break;
case nir_intrinsic_load_patch_vertices_in:
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_D),
brw_imm_d(key->input_vertices)));
break;
case nir_intrinsic_load_per_vertex_input: {
assert(instr->def.bit_size == 32);
src_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = nir_intrinsic_base(instr);
src_reg vertex_index = retype(get_nir_src_imm(instr->src[0]),
BRW_REGISTER_TYPE_UD);
unsigned first_component = nir_intrinsic_component(instr);
dst_reg dst = get_nir_def(instr->def, BRW_REGISTER_TYPE_D);
dst.writemask = brw_writemask_for_size(instr->num_components);
emit_input_urb_read(dst, vertex_index, imm_offset,
first_component, indirect_offset);
break;
}
case nir_intrinsic_load_input:
unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
break;
case nir_intrinsic_load_output:
case nir_intrinsic_load_per_vertex_output: {
src_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = nir_intrinsic_base(instr);
dst_reg dst = get_nir_def(instr->def, BRW_REGISTER_TYPE_D);
dst.writemask = brw_writemask_for_size(instr->num_components);
emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr),
indirect_offset);
break;
}
case nir_intrinsic_store_output:
case nir_intrinsic_store_per_vertex_output: {
assert(nir_src_bit_size(instr->src[0]) == 32);
src_reg value = get_nir_src(instr->src[0]);
unsigned mask = nir_intrinsic_write_mask(instr);
unsigned swiz = BRW_SWIZZLE_XYZW;
src_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = nir_intrinsic_base(instr);
unsigned first_component = nir_intrinsic_component(instr);
if (first_component) {
assert(swiz == BRW_SWIZZLE_XYZW);
swiz = BRW_SWZ_COMP_OUTPUT(first_component);
mask = mask << first_component;
}
emit_urb_write(swizzle(value, swiz), mask,
imm_offset, indirect_offset);
break;
}
case nir_intrinsic_barrier:
if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE)
vec4_visitor::nir_emit_intrinsic(instr);
if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) {
dst_reg header = dst_reg(this, glsl_uvec4_type());
emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
}
break;
default:
vec4_visitor::nir_emit_intrinsic(instr);
}
}
} /* namespace brw */

View file

@ -1,83 +0,0 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* \file brw_vec4_tcs.h
*
* The vec4-mode tessellation control shader compiler backend.
*/
#ifndef BRW_VEC4_TCS_H
#define BRW_VEC4_TCS_H
#include "brw_compiler.h"
#include "brw_eu.h"
#include "brw_vec4.h"
#ifdef __cplusplus
namespace brw {
class vec4_tcs_visitor : public vec4_visitor
{
public:
vec4_tcs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const struct brw_tcs_prog_key *key,
struct brw_tcs_prog_data *prog_data,
const nir_shader *nir,
bool debug_enabled);
protected:
virtual void setup_payload();
virtual void emit_prolog();
virtual void emit_thread_end();
virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
void emit_input_urb_read(const dst_reg &dst,
const src_reg &vertex_index,
unsigned base_offset,
unsigned first_component,
const src_reg &indirect_offset);
void emit_output_urb_read(const dst_reg &dst,
unsigned base_offset,
unsigned first_component,
const src_reg &indirect_offset);
void emit_urb_write(const src_reg &value, unsigned writemask,
unsigned base_offset, const src_reg &indirect_offset);
/* we do not use the normal end-of-shader URB write mechanism -- but every
* vec4 stage must provide implementations of these:
*/
virtual void emit_urb_write_header(int /* mrf */) {}
virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */) { return NULL; }
const struct brw_tcs_prog_key *key;
src_reg invocation_id;
};
} /* namespace brw */
#endif /* __cplusplus */
#endif /* BRW_VEC4_TCS_H */

View file

@ -1,223 +0,0 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* \file brw_vec4_tes.cpp
*
* Tessellation evaluation shader specific code derived from the vec4_visitor class.
*/
#include "brw_vec4_tes.h"
#include "brw_cfg.h"
#include "dev/intel_debug.h"
namespace brw {
vec4_tes_visitor::vec4_tes_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const struct brw_tes_prog_key *key,
struct brw_tes_prog_data *prog_data,
const nir_shader *shader,
bool debug_enabled)
: vec4_visitor(compiler, params, &key->base.tex, &prog_data->base,
shader, false, debug_enabled)
{
}
void
vec4_tes_visitor::setup_payload()
{
int reg = 0;
/* The payload always contains important data in r0 and r1, which contain
* the URB handles that are passed on to the URB write at the end
* of the thread.
*/
reg += 2;
reg = setup_uniforms(reg);
foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
for (int i = 0; i < 3; i++) {
if (inst->src[i].file != ATTR)
continue;
unsigned slot = inst->src[i].nr + inst->src[i].offset / 16;
struct brw_reg grf = brw_vec4_grf(reg + slot / 2, 4 * (slot % 2));
grf = stride(grf, 0, 4, 1);
grf.swizzle = inst->src[i].swizzle;
grf.type = inst->src[i].type;
grf.abs = inst->src[i].abs;
grf.negate = inst->src[i].negate;
inst->src[i] = grf;
}
}
reg += 8 * prog_data->urb_read_length;
this->first_non_payload_grf = reg;
}
void
vec4_tes_visitor::emit_prolog()
{
input_read_header = src_reg(this, glsl_uvec4_type());
emit(TES_OPCODE_CREATE_INPUT_READ_HEADER, dst_reg(input_read_header));
this->current_annotation = NULL;
}
void
vec4_tes_visitor::emit_urb_write_header(int mrf)
{
/* No need to do anything for DS; an implied write to this MRF will be
* performed by VEC4_VS_OPCODE_URB_WRITE.
*/
(void) mrf;
}
vec4_instruction *
vec4_tes_visitor::emit_urb_write_opcode(bool complete)
{
vec4_instruction *inst = emit(VEC4_VS_OPCODE_URB_WRITE);
inst->urb_write_flags = complete ?
BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS;
return inst;
}
void
vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
{
const struct brw_tes_prog_data *tes_prog_data =
(const struct brw_tes_prog_data *) prog_data;
switch (instr->intrinsic) {
case nir_intrinsic_load_tess_coord:
/* gl_TessCoord is part of the payload in g1 channels 0-2 and 4-6. */
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F),
src_reg(brw_vec8_grf(1, 0))));
break;
case nir_intrinsic_load_tess_level_outer:
if (tes_prog_data->domain == INTEL_TESS_DOMAIN_ISOLINE) {
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F),
swizzle(src_reg(ATTR, 1, glsl_vec4_type()),
BRW_SWIZZLE_ZWZW)));
} else {
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F),
swizzle(src_reg(ATTR, 1, glsl_vec4_type()),
BRW_SWIZZLE_WZYX)));
}
break;
case nir_intrinsic_load_tess_level_inner:
if (tes_prog_data->domain == INTEL_TESS_DOMAIN_QUAD) {
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F),
swizzle(src_reg(ATTR, 0, glsl_vec4_type()),
BRW_SWIZZLE_WZYX)));
} else {
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F),
src_reg(ATTR, 1, glsl_float_type())));
}
break;
case nir_intrinsic_load_primitive_id:
emit(TES_OPCODE_GET_PRIMITIVE_ID,
get_nir_def(instr->def, BRW_REGISTER_TYPE_UD));
break;
case nir_intrinsic_load_input:
case nir_intrinsic_load_per_vertex_input: {
assert(instr->def.bit_size == 32);
src_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = instr->const_index[0];
src_reg header = input_read_header;
unsigned first_component = nir_intrinsic_component(instr);
if (indirect_offset.file != BAD_FILE) {
src_reg clamped_indirect_offset = src_reg(this, glsl_uvec4_type());
/* Page 190 of "Volume 7: 3D Media GPGPU Engine (Haswell)" says the
* valid range of the offset is [0, 0FFFFFFFh].
*/
emit_minmax(BRW_CONDITIONAL_L,
dst_reg(clamped_indirect_offset),
retype(indirect_offset, BRW_REGISTER_TYPE_UD),
brw_imm_ud(0x0fffffffu));
header = src_reg(this, glsl_uvec4_type());
emit(TES_OPCODE_ADD_INDIRECT_URB_OFFSET, dst_reg(header),
input_read_header, clamped_indirect_offset);
} else {
/* Arbitrarily only push up to 24 vec4 slots worth of data,
* which is 12 registers (since each holds 2 vec4 slots).
*/
const unsigned max_push_slots = 24;
if (imm_offset < max_push_slots) {
src_reg src = src_reg(ATTR, imm_offset, glsl_ivec4_type());
src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_D), src));
prog_data->urb_read_length =
MAX2(prog_data->urb_read_length,
DIV_ROUND_UP(imm_offset + 1, 2));
break;
}
}
dst_reg temp(this, glsl_ivec4_type());
vec4_instruction *read =
emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
read->offset = imm_offset;
read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
src_reg src = src_reg(temp);
src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
/* Copy to target. We might end up with some funky writemasks landing
* in here, but we really don't want them in the above pseudo-ops.
*/
dst_reg dst = get_nir_def(instr->def, BRW_REGISTER_TYPE_D);
dst.writemask = brw_writemask_for_size(instr->num_components);
emit(MOV(dst, src));
break;
}
default:
vec4_visitor::nir_emit_intrinsic(instr);
}
}
void
vec4_tes_visitor::emit_thread_end()
{
/* For DS, we always end the thread by emitting a single vertex.
* emit_urb_write_opcode() will take care of setting the eot flag on the
* SEND instruction.
*/
emit_vertex();
}
} /* namespace brw */

View file

@ -1,65 +0,0 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
/**
* \file brw_vec4_tes.h
*
* The vec4 mode tessellation evaluation shader compiler backend.
*/
#ifndef BRW_VEC4_TES_H
#define BRW_VEC4_TES_H
#include "brw_vec4.h"
#ifdef __cplusplus
namespace brw {
class vec4_tes_visitor : public vec4_visitor
{
public:
vec4_tes_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const struct brw_tes_prog_key *key,
struct brw_tes_prog_data *prog_data,
const nir_shader *nir,
bool debug_enabled);
protected:
virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
virtual void setup_payload();
virtual void emit_prolog();
virtual void emit_thread_end();
virtual void emit_urb_write_header(int mrf);
virtual vec4_instruction *emit_urb_write_opcode(bool complete);
private:
src_reg input_read_header;
};
} /* namespace brw */
#endif /* __cplusplus */
#endif /* BRW_VEC4_TES_H */

File diff suppressed because it is too large.

View file

@ -1,58 +0,0 @@
/*
* Copyright © 2006 - 2015 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#ifndef BRW_VEC4_VS_VISITOR_H
#define BRW_VEC4_VS_VISITOR_H
#include "brw_vec4.h"
namespace brw {
class vec4_vs_visitor : public vec4_visitor
{
public:
vec4_vs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const struct brw_vs_prog_key *key,
struct brw_vs_prog_data *vs_prog_data,
const nir_shader *shader,
bool debug_enabled);
protected:
virtual void setup_payload();
virtual void emit_prolog();
virtual void emit_thread_end();
virtual void emit_urb_write_header(int mrf);
virtual void emit_urb_slot(dst_reg reg, int varying);
virtual vec4_instruction *emit_urb_write_opcode(bool complete);
private:
int setup_attributes(int payload_reg);
const struct brw_vs_prog_key *const key;
struct brw_vs_prog_data * const vs_prog_data;
};
} /* namespace brw */
#endif /* BRW_VEC4_VS_VISITOR_H */

View file

@ -1,108 +0,0 @@
/*
* Copyright © 2013 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
* DEALINGS IN THE SOFTWARE.
*/
#include "brw_vec4_vs.h"
#include "dev/intel_debug.h"
namespace brw {
void
vec4_vs_visitor::emit_prolog()
{
}
void
vec4_vs_visitor::emit_urb_write_header(int mrf)
{
/* No need to do anything for VS; an implied write to this MRF will be
* performed by VEC4_VS_OPCODE_URB_WRITE.
*/
(void) mrf;
}
vec4_instruction *
vec4_vs_visitor::emit_urb_write_opcode(bool complete)
{
vec4_instruction *inst = emit(VEC4_VS_OPCODE_URB_WRITE);
inst->urb_write_flags = complete ?
BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS;
return inst;
}
void
vec4_vs_visitor::emit_urb_slot(dst_reg reg, int varying)
{
reg.type = BRW_REGISTER_TYPE_F;
output_reg[varying][0].type = reg.type;
switch (varying) {
case VARYING_SLOT_COL0:
case VARYING_SLOT_COL1:
case VARYING_SLOT_BFC0:
case VARYING_SLOT_BFC1: {
/* These built-in varyings are only supported in compatibility mode,
* and we only support GS in core profile. So, this must be a vertex
* shader.
*/
vec4_instruction *inst = emit_generic_urb_slot(reg, varying, 0);
if (inst && key->clamp_vertex_color)
inst->saturate = true;
break;
}
default:
return vec4_visitor::emit_urb_slot(reg, varying);
}
}
void
vec4_vs_visitor::emit_thread_end()
{
/* For VS, we always end the thread by emitting a single vertex.
* emit_urb_write_opcode() will take care of setting the eot flag on the
* SEND instruction.
*/
emit_vertex();
}
vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler,
const struct brw_compile_params *params,
const struct brw_vs_prog_key *key,
struct brw_vs_prog_data *vs_prog_data,
const nir_shader *shader,
bool debug_enabled)
: vec4_visitor(compiler, params, &key->base.tex, &vs_prog_data->base,
shader, false /* no_spills */, debug_enabled),
key(key),
vs_prog_data(vs_prog_data)
{
}
} /* namespace brw */

View file

@ -1,702 +0,0 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
* This code is based on original work by Ilia Mirkin.
*/
/**
* \file gfx6_gs_visitor.cpp
*
* Gfx6 geometry shader implementation
*/
#include "gfx6_gs_visitor.h"
#include "brw_eu.h"
#include "brw_prim.h"
namespace brw {
void
gfx6_gs_visitor::emit_prolog()
{
vec4_gs_visitor::emit_prolog();
/* Gfx6 geometry shaders require allocating an initial VUE handle via an
* FF_SYNC message. However, the documentation remarks that only one thread
* can write to the URB simultaneously and the FF_SYNC message provides the
* synchronization mechanism for this, so using this message effectively
* stalls the thread until it is its turn to write to the URB. Because of
* this, the best way to implement geometry shader algorithms in gfx6 is to
* execute the algorithm before the FF_SYNC message to maximize parallelism.
*
* To achieve this we buffer the geometry shader outputs for each emitted
* vertex in vertex_output during operation. Then, when we have processed
* the last vertex (that is, at thread end time), we send the FF_SYNC
* message to allocate the initial VUE handle and write all buffered vertex
* data to the URB in one go.
*
* For each emitted vertex, vertex_output will hold vue_map.num_slots
* data items plus one additional item to hold required flags
* (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
* which come right after the data items for that vertex. Vertex data and
* flags for the next vertex come right after the data items and flags for
* the previous vertex.
*/
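/* For example, with vue_map.num_slots == 4 and vertices_out == 3 the buffer
* allocated below holds (4 + 1) * 3 = 15 entries, laid out as
* v0 slot0..slot3, v0 flags, v1 slot0..slot3, v1 flags, v2 slot0..slot3,
* v2 flags.
*/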
this->current_annotation = "gfx6 prolog";
this->vertex_output = src_reg(this,
glsl_uint_type(),
(prog_data->vue_map.num_slots + 1) *
nir->info.gs.vertices_out);
this->vertex_output_offset = src_reg(this, glsl_uint_type());
emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
/* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
* so initialize it once to R0.
*/
vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
retype(brw_vec8_grf(0, 0),
BRW_REGISTER_TYPE_UD)));
inst->force_writemask_all = true;
/* This will be used as a temporary to store writeback data of FF_SYNC
* and URB_WRITE messages.
*/
this->temp = src_reg(this, glsl_uint_type());
/* This will be used to know when we are processing the first vertex of
* a primitive. We will set this to URB_WRITE_PRIM_START only when we know
* that we are processing the first vertex in the primitive and to zero
* otherwise. This way we can use its value directly in the URB write
* headers.
*/
this->first_vertex = src_reg(this, glsl_uint_type());
emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START)));
/* The FF_SYNC message needs to know the number of primitives generated,
* so keep a counter for this.
*/
this->prim_count = src_reg(this, glsl_uint_type());
emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u)));
if (gs_prog_data->num_transform_feedback_bindings) {
/* Create a virtual register to hold destination indices in SOL */
this->destination_indices = src_reg(this, glsl_uvec4_type());
/* Create a virtual register to hold number of written primitives */
this->sol_prim_written = src_reg(this, glsl_uint_type());
/* Create a virtual register to hold Streamed Vertex Buffer Indices */
this->svbi = src_reg(this, glsl_uvec4_type());
/* Create a virtual register to hold max values of SVBI */
this->max_svbi = src_reg(this, glsl_uvec4_type());
emit(MOV(dst_reg(this->max_svbi),
src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
}
/* PrimitiveID is delivered in r0.1 of the thread payload. If the program
* needs it we have to move it to a separate register where we can map
* the attribute.
*
* Notice that we cannot use a virtual register for this, because we need to
* map all input attributes to hardware registers in setup_payload(),
* which happens before virtual registers are mapped to hardware registers.
* We could work around that issue if we were able to compute the first
* non-payload register here and move the PrimitiveID information to that
* register, but we can't because at this point we don't know the final
* number of uniforms that will be included in the payload.
*
* So, what we do is to place PrimitiveID information in r1, which is always
* delivered as part of the payload, but it's only populated with data
* relevant for transform feedback when we set GFX6_GS_SVBI_PAYLOAD_ENABLE
* in the 3DSTATE_GS state packet. That information can be obtained by other
* means though, so we can safely use r1 for this purpose.
*/
if (gs_prog_data->include_primitive_id) {
this->primitive_id =
src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
}
}
void
gfx6_gs_visitor::gs_emit_vertex(int stream_id)
{
this->current_annotation = "gfx6 emit vertex";
/* Buffer all output slots for this vertex in vertex_output */
for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
int varying = prog_data->vue_map.slot_to_varying[slot];
if (varying != VARYING_SLOT_PSIZ) {
dst_reg dst(this->vertex_output);
dst.reladdr = ralloc(mem_ctx, src_reg);
memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
emit_urb_slot(dst, varying);
} else {
/* The PSIZ slot can pack multiple varyings in different channels
* and emit_urb_slot() will produce a MOV instruction for each of
* them. Since we are writing to an array, that will translate to
* possibly multiple MOV instructions with an array destination and
* each will generate a scratch write with the same offset into
* scratch space (thus, each one overwriting the previous). This is
* not what we want. What we will do instead is emit PSIZ to a
* regular temporary register, then move that register into the
* array. This way we only have one instruction with an array
* destination and we only produce a single scratch write.
*/
dst_reg tmp = dst_reg(src_reg(this, glsl_uvec4_type()));
emit_urb_slot(tmp, varying);
dst_reg dst(this->vertex_output);
dst.reladdr = ralloc(mem_ctx, src_reg);
memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
inst->force_writemask_all = true;
}
emit(ADD(dst_reg(this->vertex_output_offset),
this->vertex_output_offset, brw_imm_ud(1u)));
}
/* Now buffer flags for this vertex */
dst_reg dst(this->vertex_output);
dst.reladdr = ralloc(mem_ctx, src_reg);
memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
if (nir->info.gs.output_primitive == MESA_PRIM_POINTS) {
/* If we are outputting points, then every vertex has PrimStart and
* PrimEnd set.
*/
emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
URB_WRITE_PRIM_START | URB_WRITE_PRIM_END)));
emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
} else {
/* Otherwise, we can only set the PrimStart flag, which we have stored
* in the first_vertex register. We will have to wait until we execute
* EndPrimitive() or we end the thread to set the PrimEnd flag on a
* vertex.
*/
emit(OR(dst, this->first_vertex,
brw_imm_ud(gs_prog_data->output_topology <<
URB_WRITE_PRIM_TYPE_SHIFT)));
emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u)));
}
emit(ADD(dst_reg(this->vertex_output_offset),
this->vertex_output_offset, brw_imm_ud(1u)));
}
void
gfx6_gs_visitor::gs_end_primitive()
{
this->current_annotation = "gfx6 end primitive";
/* Calling EndPrimitive() is optional for point output. In this case we set
* the PrimEnd flag when we process EmitVertex().
*/
if (nir->info.gs.output_primitive == MESA_PRIM_POINTS)
return;
/* Otherwise we know that the last vertex we have processed was the last
* vertex in the primitive and we need to set its PrimEnd flag, so do this
* as long as we have actually emitted that vertex (vertex_count != 0).
*
* Notice that we have already incremented vertex_count when we processed
* the last emit_vertex, so we need to take that into account in the
* comparison below (hence the num_output_vertices + 1).
*/
unsigned num_output_vertices = nir->info.gs.vertices_out;
emit(CMP(dst_null_ud(), this->vertex_count,
brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L));
vec4_instruction *inst = emit(CMP(dst_null_ud(),
this->vertex_count, brw_imm_ud(0u),
BRW_CONDITIONAL_NEQ));
inst->predicate = BRW_PREDICATE_NORMAL;
emit(IF(BRW_PREDICATE_NORMAL));
{
/* vertex_output_offset is already pointing at the first entry of the
* next vertex. So subtract 1 to modify the flags for the previous
* vertex.
*/
src_reg offset(this, glsl_uint_type());
emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
src_reg dst(this->vertex_output);
dst.reladdr = ralloc(mem_ctx, src_reg);
memcpy(dst.reladdr, &offset, sizeof(src_reg));
emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END)));
emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
/* Set the first vertex flag to indicate that the next vertex will start
* a primitive.
*/
emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START)));
}
emit(BRW_OPCODE_ENDIF);
}
void
gfx6_gs_visitor::emit_urb_write_header(int mrf)
{
this->current_annotation = "gfx6 urb header";
/* Compute offset of the flags for the current vertex in vertex_output and
* write them in dw2 of the message header.
*
* Notice that by the time that emit_thread_end() calls here
* vertex_output_offset should point to the first data item of the current
* vertex in vertex_output, thus we only need to add the number of output
* slots per vertex to that offset to obtain the flags data offset.
*/
src_reg flags_offset(this, glsl_uint_type());
emit(ADD(dst_reg(flags_offset),
this->vertex_output_offset,
brw_imm_d(prog_data->vue_map.num_slots)));
src_reg flags_data(this->vertex_output);
flags_data.reladdr = ralloc(mem_ctx, src_reg);
memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
}
static unsigned
align_interleaved_urb_mlen(unsigned mlen)
{
/* URB data written (does not include the message header reg) must
* be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
* section 5.4.3.2.2: URB_INTERLEAVED.
*/
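/* For illustration: mlen here counts the header register plus the data, so
* forcing it to be odd keeps the data portion (mlen - 1) a multiple of two
* registers, e.g. mlen 3 stays 3, mlen 4 becomes 5, mlen 6 becomes 7.
*/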
if ((mlen % 2) != 1)
mlen++;
return mlen;
}
void
gfx6_gs_visitor::emit_snb_gs_urb_write_opcode(bool complete, int base_mrf,
int last_mrf, int urb_offset)
{
vec4_instruction *inst = NULL;
if (!complete) {
/* If the vertex is not complete we don't have to do anything special */
inst = emit(VEC4_GS_OPCODE_URB_WRITE);
inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
} else {
/* Otherwise we always request to allocate a new VUE handle. If this is
* the last write before the EOT message and the new handle never gets
* used it will be dereferenced when we send the EOT message. This is
* necessary to avoid different setups for the EOT message (one for the
* case when there is no output and another for the case when there is)
* which would require ending the program with an IF/ELSE/ENDIF block,
* something we do not want.
*/
inst = emit(VEC4_GS_OPCODE_URB_WRITE_ALLOCATE);
inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
inst->dst = dst_reg(MRF, base_mrf);
inst->src[0] = this->temp;
}
inst->base_mrf = base_mrf;
inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf);
inst->offset = urb_offset;
}
void
gfx6_gs_visitor::emit_thread_end()
{
/* Make sure the current primitive is ended: we know it is not ended when
* first_vertex is not zero. This is only relevant for outputs other than
* points because in the point case we set PrimEnd on all vertices.
*/
if (nir->info.gs.output_primitive != MESA_PRIM_POINTS) {
emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z));
emit(IF(BRW_PREDICATE_NORMAL));
gs_end_primitive();
emit(BRW_OPCODE_ENDIF);
}
/* Here we have to:
* 1) Emit an FF_SYNC message to obtain an initial VUE handle.
* 2) Loop over all buffered vertex data and write it to corresponding
* URB entries.
* 3) Allocate new VUE handles for all vertices other than the first.
* 4) Send a final EOT message.
*/
/* MRF 0 is reserved for the debugger, so start with message header
* in MRF 1.
*/
int base_mrf = 1;
/* In the process of generating our URB write message contents, we
* may need to unspill a register or load from an array. Those
* reads would use MRFs 21..23
*/
int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver);
/* Issue the FF_SYNC message and obtain the initial VUE handle. */
this->current_annotation = "gfx6 thread end: ff_sync";
vec4_instruction *inst = NULL;
if (gs_prog_data->num_transform_feedback_bindings) {
src_reg sol_temp(this, glsl_uvec4_type());
emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
dst_reg(this->svbi),
this->vertex_count,
this->prim_count,
sol_temp);
inst = emit(GS_OPCODE_FF_SYNC,
dst_reg(this->temp), this->prim_count, this->svbi);
} else {
inst = emit(GS_OPCODE_FF_SYNC,
dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
}
inst->base_mrf = base_mrf;
emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G));
emit(IF(BRW_PREDICATE_NORMAL));
{
/* Loop over all buffered vertices and emit URB write messages */
this->current_annotation = "gfx6 thread end: urb writes init";
src_reg vertex(this, glsl_uint_type());
emit(MOV(dst_reg(vertex), brw_imm_ud(0u)));
emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
this->current_annotation = "gfx6 thread end: urb writes";
emit(BRW_OPCODE_DO);
{
emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
inst = emit(BRW_OPCODE_BREAK);
inst->predicate = BRW_PREDICATE_NORMAL;
/* First we prepare the message header */
emit_urb_write_header(base_mrf);
/* Then add vertex data to the message in interleaved fashion */
int slot = 0;
bool complete = false;
do {
int mrf = base_mrf + 1;
/* URB offset is in URB row increments, and each of our MRFs is half
* of one of those, since we're doing interleaved writes.
*/
int urb_offset = slot / 2;
for (; slot < prog_data->vue_map.num_slots; ++slot) {
int varying = prog_data->vue_map.slot_to_varying[slot];
current_annotation = output_reg_annotation[varying];
/* Compute offset of this slot for the current vertex
* in vertex_output
*/
src_reg data(this->vertex_output);
data.reladdr = ralloc(mem_ctx, src_reg);
memcpy(data.reladdr, &this->vertex_output_offset,
sizeof(src_reg));
/* Copy this slot to the appropriate message register */
dst_reg reg = dst_reg(MRF, mrf);
reg.type = output_reg[varying][0].type;
data.type = reg.type;
inst = emit(MOV(reg, data));
inst->force_writemask_all = true;
mrf++;
emit(ADD(dst_reg(this->vertex_output_offset),
this->vertex_output_offset, brw_imm_ud(1u)));
/* If this was max_usable_mrf, we can't fit anything more into
* this URB WRITE. Same if we reached the max. message length.
*/
if (mrf > max_usable_mrf ||
align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
slot++;
break;
}
}
complete = slot >= prog_data->vue_map.num_slots;
emit_snb_gs_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
} while (!complete);
/* Skip over the flags data item so that vertex_output_offset points
* to the first data item of the next vertex, so that we can start
* writing the next vertex.
*/
emit(ADD(dst_reg(this->vertex_output_offset),
this->vertex_output_offset, brw_imm_ud(1u)));
emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u)));
}
emit(BRW_OPCODE_WHILE);
if (gs_prog_data->num_transform_feedback_bindings)
xfb_write();
}
emit(BRW_OPCODE_ENDIF);
/* Finally, emit EOT message.
*
* In gfx6 we need to end the thread differently depending on whether we have
* emitted at least one vertex or not. In case we did, the EOT message must
* always include the COMPLETE flag or else the GPU hangs. If we have not
* produced any output we can't use the COMPLETE flag.
*
* However, checking this at run time would lead us to end the program with
* an IF/ELSE/ENDIF block, which we want to avoid. So what we do instead is
* always request a new VUE handle, even if the GS produces no output. With
* this we make sure that, whether we have emitted at least one vertex or
* none at all, we can finish the thread without writing to the URB, which
* works for both cases by setting the COMPLETE and UNUSED flags in the EOT
* message.
*/
this->current_annotation = "gfx6 thread end: EOT";
if (gs_prog_data->num_transform_feedback_bindings) {
/* When emitting EOT, set SONumPrimsWritten Increment Value. */
src_reg data(this, glsl_uint_type());
emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
}
inst = emit(GS_OPCODE_THREAD_END);
inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
inst->base_mrf = base_mrf;
inst->mlen = 1;
}
void
gfx6_gs_visitor::setup_payload()
{
int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
/* Attributes are going to be interleaved, so one register contains two
* attribute slots.
*/
int attributes_per_reg = 2;
/* If a geometry shader tries to read from an input that wasn't written by
* the vertex shader, that produces undefined results, but it shouldn't
* crash anything. So initialize attribute_map to zeros--that ensures that
* these undefined results are read from r0.
*/
memset(attribute_map, 0, sizeof(attribute_map));
int reg = 0;
/* The payload always contains important data in r0. */
reg++;
/* r1 is always part of the payload and it holds information relevant
* for transform feedback when we set the GFX6_GS_SVBI_PAYLOAD_ENABLE bit in
* the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
* information (and move the original value to a virtual register if
* necessary).
*/
if (gs_prog_data->include_primitive_id)
attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
reg++;
reg = setup_uniforms(reg);
reg = setup_varying_inputs(reg, attributes_per_reg);
this->first_non_payload_grf = reg;
}
void
gfx6_gs_visitor::xfb_write()
{
unsigned num_verts;
switch (gs_prog_data->output_topology) {
case _3DPRIM_POINTLIST:
num_verts = 1;
break;
case _3DPRIM_LINELIST:
case _3DPRIM_LINESTRIP:
case _3DPRIM_LINELOOP:
num_verts = 2;
break;
case _3DPRIM_TRILIST:
case _3DPRIM_TRIFAN:
case _3DPRIM_TRISTRIP:
case _3DPRIM_RECTLIST:
num_verts = 3;
break;
case _3DPRIM_QUADLIST:
case _3DPRIM_QUADSTRIP:
case _3DPRIM_POLYGON:
num_verts = 3;
break;
default:
unreachable("Unexpected primitive type in Gfx6 SOL program.");
}
this->current_annotation = "gfx6 thread end: svb writes init";
emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u)));
/* Check that at least one primitive can be written
*
* Note: since we use the binding table to keep track of buffer offsets
* and stride, the GS doesn't need to keep track of a separate pointer
* into each buffer; it uses a single pointer which increments by 1 for
* each vertex. So we use SVBI0 for this pointer, regardless of whether
* transform feedback is in interleaved or separate attribs mode.
*/
src_reg sol_temp(this, glsl_uvec4_type());
emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));
/* Compare the calculated SVBI number with the maximum value, which is
* in R1.4 (previously saved in this->max_svbi) for gfx6.
*/
emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
emit(IF(BRW_PREDICATE_NORMAL));
{
vec4_instruction *inst = emit(MOV(dst_reg(destination_indices),
brw_imm_vf4(brw_float_to_vf(0.0),
brw_float_to_vf(1.0),
brw_float_to_vf(2.0),
brw_float_to_vf(0.0))));
inst->force_writemask_all = true;
emit(ADD(dst_reg(this->destination_indices),
this->destination_indices,
this->svbi));
}
emit(BRW_OPCODE_ENDIF);
/* Write transform feedback data for all processed vertices. */
for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) {
emit(MOV(dst_reg(sol_temp), brw_imm_d(i)));
emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
BRW_CONDITIONAL_L));
emit(IF(BRW_PREDICATE_NORMAL));
{
xfb_program(i, num_verts);
}
emit(BRW_OPCODE_ENDIF);
}
}
void
gfx6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
{
unsigned binding;
unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
src_reg sol_temp(this, glsl_uvec4_type());
/* Check for buffer overflow: we need room to write the complete primitive
* (all vertices). Otherwise, avoid writing any vertices for it
*/
emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u)));
emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
emit(IF(BRW_PREDICATE_NORMAL));
{
/* Avoid overwriting MRF 1 as it is used as URB write message header */
dst_reg mrf_reg(MRF, 2);
this->current_annotation = "gfx6: emit SOL vertex data";
/* For each vertex, generate code to output each varying using the
* appropriate binding table entry.
*/
for (binding = 0; binding < num_bindings; ++binding) {
unsigned char varying =
gs_prog_data->transform_feedback_bindings[binding];
/* Set up the correct destination index for this vertex */
vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
mrf_reg,
this->destination_indices);
inst->sol_vertex = vertex % num_verts;
/* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
*
* "Prior to End of Thread with a URB_WRITE, the kernel must
* ensure that all writes are complete by sending the final
* write as a committed write."
*/
bool final_write = binding == (unsigned) num_bindings - 1 &&
inst->sol_vertex == num_verts - 1;
/* Compute offset of this varying for the current vertex
* in vertex_output
*/
this->current_annotation = output_reg_annotation[varying];
src_reg data(this->vertex_output);
data.reladdr = ralloc(mem_ctx, src_reg);
int offset = get_vertex_output_offset_for_varying(vertex, varying);
emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset)));
memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
data.type = output_reg[varying][0].type;
data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];
/* Write data */
inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
inst->sol_binding = binding;
inst->sol_final_write = final_write;
if (final_write) {
/* This is the last vertex of the primitive, so increment the
* SO primitive counter and the destination indices.
*/
emit(ADD(dst_reg(this->destination_indices),
this->destination_indices,
brw_imm_ud(num_verts)));
emit(ADD(dst_reg(this->sol_prim_written),
this->sol_prim_written, brw_imm_ud(1u)));
}
}
this->current_annotation = NULL;
}
emit(BRW_OPCODE_ENDIF);
}
int
gfx6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
{
/* Find the output slot assigned to this varying.
*
* VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
* as VARYING_SLOT_PSIZ.
*/
if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
varying = VARYING_SLOT_PSIZ;
int slot = prog_data->vue_map.varying_to_slot[varying];
if (slot < 0) {
/* This varying does not exist in the VUE so we are not writing to it
* and its value is undefined. We still want to return a valid offset
* into vertex_output though, to prevent any out-of-bound accesses into
* the vertex_output array. Since the value for this varying is undefined
* we don't really care for the value we assign to it, so any offset
* within the limits of vertex_output will do.
*/
slot = 0;
}
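/* Each vertex occupies (vue_map.num_slots + 1) consecutive entries in
* vertex_output (its output slots followed by one flags entry), matching the
* layout buffered in gs_emit_vertex(), so index the vertex block and then
* the slot within it.
*/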
return vertex * (prog_data->vue_map.num_slots + 1) + slot;
}
} /* namespace brw */

View file

@ -1,84 +0,0 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*
*/
#ifndef GFX6_GS_VISITOR_H
#define GFX6_GS_VISITOR_H
#include "brw_vec4.h"
#include "brw_vec4_gs_visitor.h"
#ifdef __cplusplus
namespace brw {
class gfx6_gs_visitor : public vec4_gs_visitor
{
public:
gfx6_gs_visitor(const struct brw_compiler *comp,
const struct brw_compile_params *params,
struct brw_gs_compile *c,
struct brw_gs_prog_data *prog_data,
const nir_shader *shader,
bool no_spills,
bool debug_enabled) :
vec4_gs_visitor(comp, params, c, prog_data, shader, no_spills, debug_enabled)
{
}
protected:
virtual void emit_prolog();
virtual void emit_thread_end();
virtual void gs_emit_vertex(int stream_id);
virtual void gs_end_primitive();
virtual void emit_urb_write_header(int mrf);
virtual void setup_payload();
private:
void xfb_write();
void xfb_program(unsigned vertex, unsigned num_verts);
int get_vertex_output_offset_for_varying(int vertex, int varying);
void emit_snb_gs_urb_write_opcode(bool complete,
int base_mrf,
int last_mrf,
int urb_offset);
src_reg vertex_output;
src_reg vertex_output_offset;
src_reg temp;
src_reg first_vertex;
src_reg prim_count;
src_reg primitive_id;
/* Transform Feedback members */
src_reg sol_prim_written;
src_reg svbi;
src_reg max_svbi;
src_reg destination_indices;
};
} /* namespace brw */
#endif /* __cplusplus */
#endif /* GFX6_GS_VISITOR_H */

View file

@ -105,7 +105,6 @@ libintel_compiler_brw_files = files(
'brw_ir_fs.h',
'brw_ir_performance.h',
'brw_ir_performance.cpp',
'brw_ir_vec4.h',
'brw_isa_info.h',
'brw_lower_logical_sends.cpp',
'brw_mesh.cpp',
@ -137,33 +136,7 @@ libintel_compiler_brw_files = files(
'brw_shader.cpp',
'brw_shader.h',
'brw_simd_selection.cpp',
'brw_vec4_builder.h',
'brw_vec4_cmod_propagation.cpp',
'brw_vec4_copy_propagation.cpp',
'brw_vec4.cpp',
'brw_vec4_cse.cpp',
'brw_vec4_dead_code_eliminate.cpp',
'brw_vec4_generator.cpp',
'brw_vec4_gs_visitor.cpp',
'brw_vec4_gs_visitor.h',
'brw_vec4.h',
'brw_vec4_live_variables.cpp',
'brw_vec4_live_variables.h',
'brw_vec4_nir.cpp',
'brw_vec4_gs_nir.cpp',
'brw_vec4_reg_allocate.cpp',
'brw_vec4_surface_builder.cpp',
'brw_vec4_surface_builder.h',
'brw_vec4_tcs.cpp',
'brw_vec4_tcs.h',
'brw_vec4_tes.cpp',
'brw_vec4_tes.h',
'brw_vec4_visitor.cpp',
'brw_vec4_vs_visitor.cpp',
'brw_vec4_vs.h',
'brw_vue_map.c',
'gfx6_gs_visitor.cpp',
'gfx6_gs_visitor.h',
)
brw_device_sha1_gen_src = custom_target('brw_device_sha1_gen.c',
@ -236,10 +209,6 @@ if with_tests
'test_fs_saturate_propagation.cpp',
'test_fs_scoreboard.cpp',
'test_simd_selection.cpp',
'test_vec4_cmod_propagation.cpp',
'test_vec4_copy_propagation.cpp',
'test_vec4_dead_code_eliminate.cpp',
'test_vec4_register_coalesce.cpp',
'test_vf_float_conversions.cpp',
),
ir_expression_operation_h,

File diff suppressed because it is too large.

View file

@ -1,195 +0,0 @@
/*
* Copyright © 2014 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <gtest/gtest.h>
#include "brw_vec4.h"
using namespace brw;
class copy_propagation_vec4_test : public ::testing::Test {
virtual void SetUp();
virtual void TearDown();
public:
struct brw_compiler *compiler;
struct brw_compile_params params;
struct intel_device_info *devinfo;
void *ctx;
struct gl_shader_program *shader_prog;
struct brw_vue_prog_data *prog_data;
vec4_visitor *v;
};
class copy_propagation_vec4_visitor : public vec4_visitor
{
public:
copy_propagation_vec4_visitor(struct brw_compiler *compiler,
struct brw_compile_params *params,
nir_shader *shader,
struct brw_vue_prog_data *prog_data)
: vec4_visitor(compiler, params, NULL, prog_data, shader,
false /* no_spills */, false)
{
prog_data->dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
}
protected:
virtual dst_reg *make_reg_for_system_value(int /* location */)
{
unreachable("Not reached");
}
virtual void setup_payload()
{
unreachable("Not reached");
}
virtual void emit_prolog()
{
unreachable("Not reached");
}
virtual void emit_thread_end()
{
unreachable("Not reached");
}
virtual void emit_urb_write_header(int /* mrf */)
{
unreachable("Not reached");
}
virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */)
{
unreachable("Not reached");
}
};
void copy_propagation_vec4_test::SetUp()
{
ctx = ralloc_context(NULL);
compiler = rzalloc(ctx, struct brw_compiler);
devinfo = rzalloc(ctx, struct intel_device_info);
compiler->devinfo = devinfo;
params = {};
params.mem_ctx = ctx;
prog_data = ralloc(ctx, struct brw_vue_prog_data);
nir_shader *shader =
nir_shader_create(ctx, MESA_SHADER_VERTEX, NULL, NULL);
v = new copy_propagation_vec4_visitor(compiler, &params, shader, prog_data);
devinfo->ver = 4;
devinfo->verx10 = devinfo->ver * 10;
}
void copy_propagation_vec4_test::TearDown()
{
delete v;
v = NULL;
ralloc_free(ctx);
ctx = NULL;
}
static void
copy_propagation(vec4_visitor *v)
{
const bool print = getenv("TEST_DEBUG");
if (print) {
fprintf(stderr, "instructions before:\n");
v->dump_instructions();
}
v->calculate_cfg();
v->opt_copy_propagation();
if (print) {
fprintf(stderr, "instructions after:\n");
v->dump_instructions();
}
}
TEST_F(copy_propagation_vec4_test, test_swizzle_swizzle)
{
dst_reg a = dst_reg(v, glsl_vec4_type());
dst_reg b = dst_reg(v, glsl_vec4_type());
dst_reg c = dst_reg(v, glsl_vec4_type());
v->emit(v->ADD(a, src_reg(a), src_reg(a)));
v->emit(v->MOV(b, swizzle(src_reg(a), BRW_SWIZZLE4(BRW_SWIZZLE_Y,
BRW_SWIZZLE_Z,
BRW_SWIZZLE_W,
BRW_SWIZZLE_X))));
vec4_instruction *test_mov =
v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(BRW_SWIZZLE_Y,
BRW_SWIZZLE_Z,
BRW_SWIZZLE_W,
BRW_SWIZZLE_X)));
v->emit(test_mov);
copy_propagation(v);
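/* Composing the two swizzles: b = a.yzwx and c reads b.yzwx, which is a.zwxy,
* so after propagation the MOV should read a directly with that swizzle.
*/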
EXPECT_EQ(test_mov->src[0].nr, a.nr);
EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(BRW_SWIZZLE_Z,
BRW_SWIZZLE_W,
BRW_SWIZZLE_X,
BRW_SWIZZLE_Y));
}
TEST_F(copy_propagation_vec4_test, test_swizzle_writemask)
{
dst_reg a = dst_reg(v, glsl_vec4_type());
dst_reg b = dst_reg(v, glsl_vec4_type());
dst_reg c = dst_reg(v, glsl_vec4_type());
v->emit(v->MOV(b, swizzle(src_reg(a), BRW_SWIZZLE4(BRW_SWIZZLE_X,
BRW_SWIZZLE_Y,
BRW_SWIZZLE_X,
BRW_SWIZZLE_Z))));
v->emit(v->MOV(writemask(a, WRITEMASK_XYZ), brw_imm_f(1.0f)));
vec4_instruction *test_mov =
v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(BRW_SWIZZLE_W,
BRW_SWIZZLE_W,
BRW_SWIZZLE_W,
BRW_SWIZZLE_W)));
v->emit(test_mov);
copy_propagation(v);
/* should not copy propagate */
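/* b.w was sourced from a.z, but a.xyz has since been overwritten, so
* propagating the copy would read the clobbered value.
*/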
EXPECT_EQ(test_mov->src[0].nr, b.nr);
EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(BRW_SWIZZLE_W,
BRW_SWIZZLE_W,
BRW_SWIZZLE_W,
BRW_SWIZZLE_W));
}

View file

@ -1,178 +0,0 @@
/*
* Copyright © 2018 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <gtest/gtest.h>
#include "brw_vec4.h"
using namespace brw;
class dead_code_eliminate_vec4_test : public ::testing::Test {
virtual void SetUp();
virtual void TearDown();
public:
struct brw_compiler *compiler;
struct brw_compile_params params;
struct intel_device_info *devinfo;
void *ctx;
struct gl_shader_program *shader_prog;
struct brw_vue_prog_data *prog_data;
vec4_visitor *v;
};
class dead_code_eliminate_vec4_visitor : public vec4_visitor
{
public:
dead_code_eliminate_vec4_visitor(struct brw_compiler *compiler,
struct brw_compile_params *params,
nir_shader *shader,
struct brw_vue_prog_data *prog_data)
: vec4_visitor(compiler, params, NULL, prog_data, shader,
false /* no_spills */, false)
{
prog_data->dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
}
protected:
virtual dst_reg *make_reg_for_system_value(int /* location */)
{
unreachable("Not reached");
}
virtual void setup_payload()
{
unreachable("Not reached");
}
virtual void emit_prolog()
{
unreachable("Not reached");
}
virtual void emit_thread_end()
{
unreachable("Not reached");
}
virtual void emit_urb_write_header(int /* mrf */)
{
unreachable("Not reached");
}
virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */)
{
unreachable("Not reached");
}
};
void dead_code_eliminate_vec4_test::SetUp()
{
ctx = ralloc_context(NULL);
compiler = rzalloc(ctx, struct brw_compiler);
devinfo = rzalloc(ctx, struct intel_device_info);
compiler->devinfo = devinfo;
params = {};
params.mem_ctx = ctx;
prog_data = ralloc(ctx, struct brw_vue_prog_data);
nir_shader *shader =
nir_shader_create(ctx, MESA_SHADER_VERTEX, NULL, NULL);
v = new dead_code_eliminate_vec4_visitor(compiler, &params, shader, prog_data);
devinfo->ver = 4;
devinfo->verx10 = devinfo->ver * 10;
}
void dead_code_eliminate_vec4_test::TearDown()
{
delete v;
v = NULL;
ralloc_free(ctx);
ctx = NULL;
}
static void
dead_code_eliminate(vec4_visitor *v)
{
const bool print = getenv("TEST_DEBUG");
if (print) {
fprintf(stderr, "instructions before:\n");
v->dump_instructions();
}
v->calculate_cfg();
v->dead_code_eliminate();
if (print) {
fprintf(stderr, "instructions after:\n");
v->dump_instructions();
}
}
TEST_F(dead_code_eliminate_vec4_test, some_dead_channels_all_flags_used)
{
const vec4_builder bld = vec4_builder(v).at_end();
src_reg r1 = src_reg(v, glsl_vec4_type());
src_reg r2 = src_reg(v, glsl_vec4_type());
src_reg r3 = src_reg(v, glsl_vec4_type());
src_reg r4 = src_reg(v, glsl_vec4_type());
src_reg r5 = src_reg(v, glsl_vec4_type());
src_reg r6 = src_reg(v, glsl_vec4_type());
/* Sequence like the following should not be modified by DCE.
*
* cmp.l.f0(8) g4<1>F g2<4,4,1>.wF g1<4,4,1>.xF
* mov(8) g5<1>.xF g4<4,4,1>.xF
* (+f0.x) sel(8) g6<1>UD g3<4>UD g6<4>UD
*/
vec4_instruction *test_cmp =
bld.CMP(dst_reg(r4), r2, r1, BRW_CONDITIONAL_L);
test_cmp->src[0].swizzle = BRW_SWIZZLE_WWWW;
test_cmp->src[1].swizzle = BRW_SWIZZLE_XXXX;
vec4_instruction *test_mov =
bld.MOV(dst_reg(r5), r4);
test_mov->dst.writemask = WRITEMASK_X;
test_mov->src[0].swizzle = BRW_SWIZZLE_XXXX;
vec4_instruction *test_sel =
bld.SEL(dst_reg(r6), r3, r6);
set_predicate(BRW_PREDICATE_NORMAL, test_sel);
/* The scratch write is here just to make r5 and r6 be live so that the
* whole program doesn't get eliminated by DCE.
*/
v->emit(v->SCRATCH_WRITE(dst_reg(r4), r6, r5));
dead_code_eliminate(v);
EXPECT_EQ(test_cmp->dst.writemask, WRITEMASK_XYZW);
}

View file

@ -1,256 +0,0 @@
/*
* Copyright © 2012 Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
* IN THE SOFTWARE.
*/
#include <gtest/gtest.h>
#include "brw_vec4.h"
using namespace brw;
#define register_coalesce(v) _register_coalesce(v, __func__)
class register_coalesce_vec4_test : public ::testing::Test {
virtual void SetUp();
virtual void TearDown();
public:
struct brw_compiler *compiler;
struct brw_compile_params params;
struct intel_device_info *devinfo;
void *ctx;
struct gl_shader_program *shader_prog;
struct brw_vue_prog_data *prog_data;
vec4_visitor *v;
};
class register_coalesce_vec4_visitor : public vec4_visitor
{
public:
register_coalesce_vec4_visitor(struct brw_compiler *compiler,
struct brw_compile_params *params,
nir_shader *shader,
struct brw_vue_prog_data *prog_data)
: vec4_visitor(compiler, params, NULL, prog_data, shader,
false /* no_spills */, false)
{
prog_data->dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT;
}
protected:
virtual dst_reg *make_reg_for_system_value(int /* location */)
{
unreachable("Not reached");
}
virtual void setup_payload()
{
unreachable("Not reached");
}
virtual void emit_prolog()
{
unreachable("Not reached");
}
virtual void emit_thread_end()
{
unreachable("Not reached");
}
virtual void emit_urb_write_header(int /* mrf */)
{
unreachable("Not reached");
}
virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */)
{
unreachable("Not reached");
}
};
void register_coalesce_vec4_test::SetUp()
{
ctx = ralloc_context(NULL);
compiler = rzalloc(ctx, struct brw_compiler);
devinfo = rzalloc(ctx, struct intel_device_info);
compiler->devinfo = devinfo;
prog_data = ralloc(ctx, struct brw_vue_prog_data);
params = {};
params.mem_ctx = ctx;
nir_shader *shader =
nir_shader_create(ctx, MESA_SHADER_VERTEX, NULL, NULL);
v = new register_coalesce_vec4_visitor(compiler, &params, shader, prog_data);
devinfo->ver = 4;
devinfo->verx10 = devinfo->ver * 10;
}
void register_coalesce_vec4_test::TearDown()
{
delete v;
v = NULL;
ralloc_free(ctx);
ctx = NULL;
}
static void
_register_coalesce(vec4_visitor *v, const char *func)
{
const bool print = getenv("TEST_DEBUG");
if (print) {
printf("%s: instructions before:\n", func);
v->dump_instructions();
}
v->calculate_cfg();
v->opt_register_coalesce();
if (print) {
printf("%s: instructions after:\n", func);
v->dump_instructions();
}
}
TEST_F(register_coalesce_vec4_test, test_compute_to_mrf)
{
src_reg something = src_reg(v, glsl_float_type());
dst_reg temp = dst_reg(v, glsl_float_type());
dst_reg init;
dst_reg m0 = dst_reg(MRF, 0);
m0.writemask = WRITEMASK_X;
m0.type = BRW_REGISTER_TYPE_F;
vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f)));
v->emit(v->MOV(m0, src_reg(temp)));
register_coalesce(v);
EXPECT_EQ(mul->dst.file, MRF);
}
TEST_F(register_coalesce_vec4_test, test_multiple_use)
{
src_reg something = src_reg(v, glsl_float_type());
dst_reg temp = dst_reg(v, glsl_vec4_type());
dst_reg init;
dst_reg m0 = dst_reg(MRF, 0);
m0.writemask = WRITEMASK_X;
m0.type = BRW_REGISTER_TYPE_F;
dst_reg m1 = dst_reg(MRF, 1);
m1.writemask = WRITEMASK_XYZW;
m1.type = BRW_REGISTER_TYPE_F;
src_reg src = src_reg(temp);
vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f)));
src.swizzle = BRW_SWIZZLE_XXXX;
v->emit(v->MOV(m0, src));
src.swizzle = BRW_SWIZZLE_XYZW;
v->emit(v->MOV(m1, src));
register_coalesce(v);
EXPECT_NE(mul->dst.file, MRF);
}
TEST_F(register_coalesce_vec4_test, test_dp4_mrf)
{
src_reg some_src_1 = src_reg(v, glsl_vec4_type());
src_reg some_src_2 = src_reg(v, glsl_vec4_type());
dst_reg init;
dst_reg m0 = dst_reg(MRF, 0);
m0.writemask = WRITEMASK_Y;
m0.type = BRW_REGISTER_TYPE_F;
dst_reg temp = dst_reg(v, glsl_float_type());
vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2));
v->emit(v->MOV(m0, src_reg(temp)));
register_coalesce(v);
EXPECT_EQ(dp4->dst.file, MRF);
EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y);
}
TEST_F(register_coalesce_vec4_test, test_dp4_grf)
{
src_reg some_src_1 = src_reg(v, glsl_vec4_type());
src_reg some_src_2 = src_reg(v, glsl_vec4_type());
dst_reg init;
dst_reg to = dst_reg(v, glsl_vec4_type());
dst_reg temp = dst_reg(v, glsl_float_type());
vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2));
to.writemask = WRITEMASK_Y;
v->emit(v->MOV(to, src_reg(temp)));
/* if we don't do something with the result, the automatic dead code
* elimination will remove all our instructions.
*/
src_reg src = src_reg(to);
src.negate = true;
v->emit(v->MOV(dst_reg(MRF, 0), src));
register_coalesce(v);
EXPECT_EQ(dp4->dst.nr, to.nr);
EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y);
}
TEST_F(register_coalesce_vec4_test, test_channel_mul_grf)
{
src_reg some_src_1 = src_reg(v, glsl_vec4_type());
src_reg some_src_2 = src_reg(v, glsl_vec4_type());
dst_reg init;
dst_reg to = dst_reg(v, glsl_vec4_type());
dst_reg temp = dst_reg(v, glsl_float_type());
vec4_instruction *mul = v->emit(v->MUL(temp, some_src_1, some_src_2));
to.writemask = WRITEMASK_Y;
v->emit(v->MOV(to, src_reg(temp)));
/* if we don't do something with the result, the automatic dead code
* elimination will remove all our instructions.
*/
src_reg src = src_reg(to);
src.negate = true;
v->emit(v->MOV(dst_reg(MRF, 0), src));
register_coalesce(v);
EXPECT_EQ(mul->dst.nr, to.nr);
}