From a641aa294ef155ca1133e131a920dcab4cb1c990 Mon Sep 17 00:00:00 2001 From: Caio Oliveira Date: Wed, 14 Feb 2024 22:57:40 -0800 Subject: [PATCH] intel/brw: Remove vec4 backend It still exists as part of ELK for older gfx versions. Reviewed-by: Kenneth Graunke Part-of: --- src/intel/compiler/brw_compiler.c | 2 - src/intel/compiler/brw_compiler.h | 10 - src/intel/compiler/brw_fs.cpp | 1 - src/intel/compiler/brw_ir_performance.cpp | 126 - src/intel/compiler/brw_ir_performance.h | 3 - src/intel/compiler/brw_ir_vec4.h | 475 --- .../compiler/brw_schedule_instructions.cpp | 260 -- src/intel/compiler/brw_shader.h | 3 - src/intel/compiler/brw_vec4.cpp | 2542 ----------------- src/intel/compiler/brw_vec4.h | 350 --- src/intel/compiler/brw_vec4_builder.h | 646 ----- .../compiler/brw_vec4_cmod_propagation.cpp | 365 --- .../compiler/brw_vec4_copy_propagation.cpp | 556 ---- src/intel/compiler/brw_vec4_cse.cpp | 322 --- .../compiler/brw_vec4_dead_code_eliminate.cpp | 188 -- src/intel/compiler/brw_vec4_generator.cpp | 2319 --------------- src/intel/compiler/brw_vec4_gs_nir.cpp | 98 - src/intel/compiler/brw_vec4_gs_visitor.cpp | 560 ---- src/intel/compiler/brw_vec4_gs_visitor.h | 75 - .../compiler/brw_vec4_live_variables.cpp | 331 --- src/intel/compiler/brw_vec4_live_variables.h | 143 - src/intel/compiler/brw_vec4_nir.cpp | 2307 --------------- src/intel/compiler/brw_vec4_reg_allocate.cpp | 512 ---- .../compiler/brw_vec4_surface_builder.cpp | 213 -- src/intel/compiler/brw_vec4_surface_builder.h | 53 - src/intel/compiler/brw_vec4_tcs.cpp | 320 --- src/intel/compiler/brw_vec4_tcs.h | 83 - src/intel/compiler/brw_vec4_tes.cpp | 223 -- src/intel/compiler/brw_vec4_tes.h | 65 - src/intel/compiler/brw_vec4_visitor.cpp | 1319 --------- src/intel/compiler/brw_vec4_vs.h | 58 - src/intel/compiler/brw_vec4_vs_visitor.cpp | 108 - src/intel/compiler/gfx6_gs_visitor.cpp | 702 ----- src/intel/compiler/gfx6_gs_visitor.h | 84 - src/intel/compiler/meson.build | 31 - 
.../compiler/test_vec4_cmod_propagation.cpp | 1056 ------- .../compiler/test_vec4_copy_propagation.cpp | 195 -- .../test_vec4_dead_code_eliminate.cpp | 178 -- .../compiler/test_vec4_register_coalesce.cpp | 256 -- 39 files changed, 17138 deletions(-) delete mode 100644 src/intel/compiler/brw_ir_vec4.h delete mode 100644 src/intel/compiler/brw_vec4.cpp delete mode 100644 src/intel/compiler/brw_vec4.h delete mode 100644 src/intel/compiler/brw_vec4_builder.h delete mode 100644 src/intel/compiler/brw_vec4_cmod_propagation.cpp delete mode 100644 src/intel/compiler/brw_vec4_copy_propagation.cpp delete mode 100644 src/intel/compiler/brw_vec4_cse.cpp delete mode 100644 src/intel/compiler/brw_vec4_dead_code_eliminate.cpp delete mode 100644 src/intel/compiler/brw_vec4_generator.cpp delete mode 100644 src/intel/compiler/brw_vec4_gs_nir.cpp delete mode 100644 src/intel/compiler/brw_vec4_gs_visitor.cpp delete mode 100644 src/intel/compiler/brw_vec4_gs_visitor.h delete mode 100644 src/intel/compiler/brw_vec4_live_variables.cpp delete mode 100644 src/intel/compiler/brw_vec4_live_variables.h delete mode 100644 src/intel/compiler/brw_vec4_nir.cpp delete mode 100644 src/intel/compiler/brw_vec4_reg_allocate.cpp delete mode 100644 src/intel/compiler/brw_vec4_surface_builder.cpp delete mode 100644 src/intel/compiler/brw_vec4_surface_builder.h delete mode 100644 src/intel/compiler/brw_vec4_tcs.cpp delete mode 100644 src/intel/compiler/brw_vec4_tcs.h delete mode 100644 src/intel/compiler/brw_vec4_tes.cpp delete mode 100644 src/intel/compiler/brw_vec4_tes.h delete mode 100644 src/intel/compiler/brw_vec4_visitor.cpp delete mode 100644 src/intel/compiler/brw_vec4_vs.h delete mode 100644 src/intel/compiler/brw_vec4_vs_visitor.cpp delete mode 100644 src/intel/compiler/gfx6_gs_visitor.cpp delete mode 100644 src/intel/compiler/gfx6_gs_visitor.h delete mode 100644 src/intel/compiler/test_vec4_cmod_propagation.cpp delete mode 100644 src/intel/compiler/test_vec4_copy_propagation.cpp delete mode 
100644 src/intel/compiler/test_vec4_dead_code_eliminate.cpp delete mode 100644 src/intel/compiler/test_vec4_register_coalesce.cpp diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c index d7eac3ca69c..337e0177d54 100644 --- a/src/intel/compiler/brw_compiler.c +++ b/src/intel/compiler/brw_compiler.c @@ -87,8 +87,6 @@ brw_compiler_create(void *mem_ctx, const struct intel_device_info *devinfo) brw_init_isa_info(&compiler->isa, devinfo); brw_fs_alloc_reg_sets(compiler); - if (devinfo->ver < 8) - brw_vec4_alloc_reg_set(compiler); compiler->precise_trig = debug_get_bool_option("INTEL_PRECISE_TRIG", false); diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h index 0b2155a5626..3628a2eab75 100644 --- a/src/intel/compiler/brw_compiler.h +++ b/src/intel/compiler/brw_compiler.h @@ -57,16 +57,6 @@ struct brw_compiler { struct brw_isa_info isa; - struct { - struct ra_regs *regs; - - /** - * Array of the ra classes for the unaligned contiguous register - * block sizes used. - */ - struct ra_class **classes; - } vec4_reg_set; - struct { struct ra_regs *regs; diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index 2a000cb74a8..ab1aea820b6 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -33,7 +33,6 @@ #include "brw_fs_builder.h" #include "brw_fs_live_variables.h" #include "brw_nir.h" -#include "brw_vec4_gs_visitor.h" #include "brw_cfg.h" #include "brw_dead_control_flow.h" #include "brw_private.h" diff --git a/src/intel/compiler/brw_ir_performance.cpp b/src/intel/compiler/brw_ir_performance.cpp index e94006cca65..eeb0921e011 100644 --- a/src/intel/compiler/brw_ir_performance.cpp +++ b/src/intel/compiler/brw_ir_performance.cpp @@ -23,7 +23,6 @@ #include "brw_eu.h" #include "brw_fs.h" -#include "brw_vec4.h" #include "brw_cfg.h" using namespace brw; @@ -152,29 +151,6 @@ namespace { rcount = inst->opcode == BRW_OPCODE_DPAS ? 
inst->rcount : 0; } - instruction_info(const struct brw_isa_info *isa, - const vec4_instruction *inst) : - isa(isa), devinfo(isa->devinfo), op(inst->opcode), - td(inst->dst.type), sd(DIV_ROUND_UP(inst->size_written, REG_SIZE)), - tx(get_exec_type(inst)), sx(0), ss(0), sc(0), - desc(inst->desc), sfid(inst->sfid), rcount(0) - { - /* Compute the maximum source size. */ - for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) - ss = MAX2(ss, DIV_ROUND_UP(inst->size_read(i), REG_SIZE)); - - /* Convert the execution size to GRF units. */ - sx = DIV_ROUND_UP(inst->exec_size * type_sz(tx), REG_SIZE); - - /* 32x32 integer multiplication has half the usual ALU throughput. - * Treat it as double-precision. - */ - if ((inst->opcode == BRW_OPCODE_MUL || inst->opcode == BRW_OPCODE_MAD) && - !brw_reg_type_is_floating_point(tx) && type_sz(tx) == 4 && - type_sz(inst->src[0].type) == type_sz(inst->src[1].type)) - tx = brw_int_type(8, tx == BRW_REGISTER_TYPE_D); - } - /** ISA encoding information */ const struct brw_isa_info *isa; /** Device information. */ @@ -1505,102 +1481,6 @@ namespace { } } - /** - * Model the performance behavior of a VEC4 back-end instruction. - */ - void - issue_vec4_instruction(state &st, const struct brw_isa_info *isa, - const backend_instruction *be_inst) - { - const struct intel_device_info *devinfo = isa->devinfo; - const vec4_instruction *inst = - static_cast<const vec4_instruction *>(be_inst); - const instruction_info info(isa, inst); - const perf_desc perf = instruction_desc(info); - - /* Stall on any source dependencies. 
*/ - for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) { - for (unsigned j = 0; j < regs_read(inst, i); j++) - stall_on_dependency( - st, reg_dependency_id(devinfo, inst->src[i], j)); - } - - if (inst->reads_accumulator_implicitly()) { - for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); - j <= accum_reg_of_channel(devinfo, inst, info.tx, - inst->exec_size - 1); j++) - stall_on_dependency( - st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); - } - - if (inst->base_mrf != -1) { - for (unsigned j = 0; j < inst->mlen; j++) - stall_on_dependency( - st, reg_dependency_id( - devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); - } - - if (inst->reads_flag()) - stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0); - - /* Stall on any write dependencies. */ - if (!inst->no_dd_check) { - if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { - for (unsigned j = 0; j < regs_written(inst); j++) - stall_on_dependency( - st, reg_dependency_id(devinfo, inst->dst, j)); - } - - if (inst->writes_accumulator_implicitly(devinfo)) { - for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); - j <= accum_reg_of_channel(devinfo, inst, info.tx, - inst->exec_size - 1); j++) - stall_on_dependency( - st, reg_dependency_id(devinfo, brw_acc_reg(8), j)); - } - - if (inst->writes_flag(devinfo)) - stall_on_dependency(st, EU_DEPENDENCY_ID_FLAG0); - } - - /* Execute the instruction. */ - execute_instruction(st, perf); - - /* Mark any source dependencies. */ - if (inst->is_send_from_grf()) { - for (unsigned i = 0; i < ARRAY_SIZE(inst->src); i++) { - for (unsigned j = 0; j < regs_read(inst, i); j++) - mark_read_dependency( - st, perf, reg_dependency_id(devinfo, inst->src[i], j)); - } - } - - if (inst->base_mrf != -1) { - for (unsigned j = 0; j < inst->mlen; j++) - mark_read_dependency(st, perf, - reg_dependency_id(devinfo, brw_uvec_mrf(8, inst->base_mrf, 0), j)); - } - - /* Mark any destination dependencies. 
*/ - if (inst->dst.file != BAD_FILE && !inst->dst.is_null()) { - for (unsigned j = 0; j < regs_written(inst); j++) { - mark_write_dependency(st, perf, - reg_dependency_id(devinfo, inst->dst, j)); - } - } - - if (inst->writes_accumulator_implicitly(devinfo)) { - for (unsigned j = accum_reg_of_channel(devinfo, inst, info.tx, 0); - j <= accum_reg_of_channel(devinfo, inst, info.tx, - inst->exec_size - 1); j++) - mark_write_dependency(st, perf, - reg_dependency_id(devinfo, brw_acc_reg(8), j)); - } - - if (inst->writes_flag(devinfo)) - mark_write_dependency(st, perf, EU_DEPENDENCY_ID_FLAG0); - } - /** * Calculate the maximum possible throughput of the program compatible with * the cycle-count utilization estimated for each asynchronous unit, in @@ -1692,12 +1572,6 @@ brw::performance::performance(const fs_visitor *v) : calculate_performance(*this, v, issue_fs_inst, v->dispatch_width); } -brw::performance::performance(const vec4_visitor *v) : - block_latency(new unsigned[v->cfg->num_blocks]) -{ - calculate_performance(*this, v, issue_vec4_instruction, 8); -} - brw::performance::~performance() { delete[] block_latency; diff --git a/src/intel/compiler/brw_ir_performance.h b/src/intel/compiler/brw_ir_performance.h index c3cefe838aa..80dc95b0d2e 100644 --- a/src/intel/compiler/brw_ir_performance.h +++ b/src/intel/compiler/brw_ir_performance.h @@ -28,15 +28,12 @@ class fs_visitor; namespace brw { - class vec4_visitor; - /** * Various estimates of the performance of a shader based on static * analysis. 
*/ struct performance { performance(const fs_visitor *v); - performance(const vec4_visitor *v); ~performance(); analysis_dependency_class diff --git a/src/intel/compiler/brw_ir_vec4.h b/src/intel/compiler/brw_ir_vec4.h deleted file mode 100644 index 78d34729c0b..00000000000 --- a/src/intel/compiler/brw_ir_vec4.h +++ /dev/null @@ -1,475 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright © 2011-2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#ifndef BRW_IR_VEC4_H -#define BRW_IR_VEC4_H - -#include "brw_shader.h" - -namespace brw { - -class dst_reg; - -class src_reg : public backend_reg -{ -public: - DECLARE_RALLOC_CXX_OPERATORS(src_reg) - - void init(); - - src_reg(enum brw_reg_file file, int nr, const glsl_type *type); - src_reg(); - src_reg(struct ::brw_reg reg); - - bool equals(const src_reg &r) const; - bool negative_equals(const src_reg &r) const; - - src_reg(class vec4_visitor *v, const struct glsl_type *type); - src_reg(class vec4_visitor *v, const struct glsl_type *type, int size); - - explicit src_reg(const dst_reg &reg); - - src_reg *reladdr; -}; - -static inline src_reg -retype(src_reg reg, enum brw_reg_type type) -{ - reg.type = type; - return reg; -} - -namespace detail { - -static inline void -add_byte_offset(backend_reg *reg, unsigned bytes) -{ - switch (reg->file) { - case BAD_FILE: - break; - case VGRF: - case ATTR: - case UNIFORM: - reg->offset += bytes; - assert(reg->offset % 16 == 0); - break; - case MRF: { - const unsigned suboffset = reg->offset + bytes; - reg->nr += suboffset / REG_SIZE; - reg->offset = suboffset % REG_SIZE; - assert(reg->offset % 16 == 0); - break; - } - case ARF: - case FIXED_GRF: { - const unsigned suboffset = reg->subnr + bytes; - reg->nr += suboffset / REG_SIZE; - reg->subnr = suboffset % REG_SIZE; - assert(reg->subnr % 16 == 0); - break; - } - default: - assert(bytes == 0); - } -} - -} /* namespace detail */ - -static inline src_reg -byte_offset(src_reg reg, unsigned bytes) -{ - detail::add_byte_offset(&reg, bytes); - return reg; -} - -static inline src_reg -offset(src_reg reg, unsigned width, unsigned delta) -{ - const unsigned stride = (reg.file == UNIFORM ? 
0 : 4); - const unsigned num_components = MAX2(width / 4 * stride, 4); - return byte_offset(reg, num_components * type_sz(reg.type) * delta); -} - -static inline src_reg -horiz_offset(src_reg reg, unsigned delta) -{ - return byte_offset(reg, delta * type_sz(reg.type)); -} - -/** - * Reswizzle a given source register. - * \sa brw_swizzle(). - */ -static inline src_reg -swizzle(src_reg reg, unsigned swizzle) -{ - if (reg.file == IMM) - reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swizzle); - else - reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle); - - return reg; -} - -static inline src_reg -negate(src_reg reg) -{ - assert(reg.file != IMM); - reg.negate = !reg.negate; - return reg; -} - -static inline bool -is_uniform(const src_reg &reg) -{ - return (reg.file == IMM || reg.file == UNIFORM || reg.is_null()) && - (!reg.reladdr || is_uniform(*reg.reladdr)); -} - -class dst_reg : public backend_reg -{ -public: - DECLARE_RALLOC_CXX_OPERATORS(dst_reg) - - void init(); - - dst_reg(); - dst_reg(enum brw_reg_file file, int nr); - dst_reg(enum brw_reg_file file, int nr, const glsl_type *type, - unsigned writemask); - dst_reg(enum brw_reg_file file, int nr, brw_reg_type type, - unsigned writemask); - dst_reg(struct ::brw_reg reg); - dst_reg(class vec4_visitor *v, const struct glsl_type *type); - - explicit dst_reg(const src_reg &reg); - - bool equals(const dst_reg &r) const; - - src_reg *reladdr; -}; - -static inline dst_reg -retype(dst_reg reg, enum brw_reg_type type) -{ - reg.type = type; - return reg; -} - -static inline dst_reg -byte_offset(dst_reg reg, unsigned bytes) -{ - detail::add_byte_offset(&reg, bytes); - return reg; -} - -static inline dst_reg -offset(dst_reg reg, unsigned width, unsigned delta) -{ - const unsigned stride = (reg.file == UNIFORM ? 
0 : 4); - const unsigned num_components = MAX2(width / 4 * stride, 4); - return byte_offset(reg, num_components * type_sz(reg.type) * delta); -} - -static inline dst_reg -horiz_offset(const dst_reg &reg, unsigned delta) -{ - if (is_uniform(src_reg(reg))) - return reg; - else - return byte_offset(reg, delta * type_sz(reg.type)); -} - -static inline dst_reg -writemask(dst_reg reg, unsigned mask) -{ - assert(reg.file != IMM); - assert((reg.writemask & mask) != 0); - reg.writemask &= mask; - return reg; -} - -/** - * Return an integer identifying the discrete address space a register is - * contained in. A register is by definition fully contained in the single - * reg_space it belongs to, so two registers with different reg_space ids are - * guaranteed not to overlap. Most register files are a single reg_space of - * its own, only the VGRF file is composed of multiple discrete address - * spaces, one for each VGRF allocation. - */ -static inline uint32_t -reg_space(const backend_reg &r) -{ - return r.file << 16 | (r.file == VGRF ? r.nr : 0); -} - -/** - * Return the base offset in bytes of a register relative to the start of its - * reg_space(). - */ -static inline unsigned -reg_offset(const backend_reg &r) -{ - return (r.file == VGRF || r.file == IMM ? 0 : r.nr) * - (r.file == UNIFORM ? 16 : REG_SIZE) + r.offset + - (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0); -} - -/** - * Return whether the register region starting at \p r and spanning \p dr - * bytes could potentially overlap the register region starting at \p s and - * spanning \p ds bytes. - */ -static inline bool -regions_overlap(const backend_reg &r, unsigned dr, - const backend_reg &s, unsigned ds) -{ - if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) { - /* COMPR4 regions are translated by the hardware during decompression - * into two separate half-regions 4 MRFs apart from each other. 
- */ - backend_reg t0 = r; - t0.nr &= ~BRW_MRF_COMPR4; - backend_reg t1 = t0; - t1.offset += 4 * REG_SIZE; - return regions_overlap(t0, dr / 2, s, ds) || - regions_overlap(t1, dr / 2, s, ds); - - } else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) { - return regions_overlap(s, ds, r, dr); - - } else { - return reg_space(r) == reg_space(s) && - !(reg_offset(r) + dr <= reg_offset(s) || - reg_offset(s) + ds <= reg_offset(r)); - } -} - -class vec4_instruction : public backend_instruction { -public: - DECLARE_RALLOC_CXX_OPERATORS(vec4_instruction) - - vec4_instruction(enum opcode opcode, - const dst_reg &dst = dst_reg(), - const src_reg &src0 = src_reg(), - const src_reg &src1 = src_reg(), - const src_reg &src2 = src_reg()); - - dst_reg dst; - src_reg src[3]; - - enum brw_urb_write_flags urb_write_flags; - - unsigned sol_binding; /**< gfx6: SOL binding table index */ - bool sol_final_write; /**< gfx6: send commit message */ - unsigned sol_vertex; /**< gfx6: used for setting dst index in SVB header */ - - bool is_send_from_grf() const; - unsigned size_read(unsigned arg) const; - bool can_reswizzle(const struct intel_device_info *devinfo, - int dst_writemask, - int swizzle, int swizzle_mask); - void reswizzle(int dst_writemask, int swizzle); - bool can_do_source_mods(const struct intel_device_info *devinfo); - bool can_do_cmod(); - bool can_do_writemask(const struct intel_device_info *devinfo); - bool can_change_types() const; - bool has_source_and_destination_hazard() const; - unsigned implied_mrf_writes() const; - - bool is_align1_partial_write() - { - return opcode == VEC4_OPCODE_SET_LOW_32BIT || - opcode == VEC4_OPCODE_SET_HIGH_32BIT; - } - - bool reads_flag() const - { - return predicate || opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2; - } - - bool reads_flag(unsigned c) - { - if (opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2) - return true; - - switch (predicate) { - case BRW_PREDICATE_NONE: - return false; - case BRW_PREDICATE_ALIGN16_REPLICATE_X: - return c == 0; - case 
BRW_PREDICATE_ALIGN16_REPLICATE_Y: - return c == 1; - case BRW_PREDICATE_ALIGN16_REPLICATE_Z: - return c == 2; - case BRW_PREDICATE_ALIGN16_REPLICATE_W: - return c == 3; - default: - return true; - } - } - - bool writes_flag(const intel_device_info *devinfo) const - { - return (conditional_mod && ((opcode != BRW_OPCODE_SEL || devinfo->ver <= 5) && - opcode != BRW_OPCODE_CSEL && - opcode != BRW_OPCODE_IF && - opcode != BRW_OPCODE_WHILE)); - } - - bool reads_g0_implicitly() const - { - switch (opcode) { - case SHADER_OPCODE_TEX: - case SHADER_OPCODE_TXL: - case SHADER_OPCODE_TXD: - case SHADER_OPCODE_TXF: - case SHADER_OPCODE_TXF_CMS_W: - case SHADER_OPCODE_TXF_CMS: - case SHADER_OPCODE_TXF_MCS: - case SHADER_OPCODE_TXS: - case SHADER_OPCODE_TG4: - case SHADER_OPCODE_TG4_OFFSET: - case SHADER_OPCODE_SAMPLEINFO: - case VS_OPCODE_PULL_CONSTANT_LOAD: - case GS_OPCODE_SET_PRIMITIVE_ID: - case GS_OPCODE_GET_INSTANCE_ID: - case SHADER_OPCODE_GFX4_SCRATCH_READ: - case SHADER_OPCODE_GFX4_SCRATCH_WRITE: - return true; - default: - return false; - } - } -}; - -/** - * Make the execution of \p inst dependent on the evaluation of a possibly - * inverted predicate. - */ -inline vec4_instruction * -set_predicate_inv(enum brw_predicate pred, bool inverse, - vec4_instruction *inst) -{ - inst->predicate = pred; - inst->predicate_inverse = inverse; - return inst; -} - -/** - * Make the execution of \p inst dependent on the evaluation of a predicate. - */ -inline vec4_instruction * -set_predicate(enum brw_predicate pred, vec4_instruction *inst) -{ - return set_predicate_inv(pred, false, inst); -} - -/** - * Write the result of evaluating the condition given by \p mod to a flag - * register. - */ -inline vec4_instruction * -set_condmod(enum brw_conditional_mod mod, vec4_instruction *inst) -{ - inst->conditional_mod = mod; - return inst; -} - -/** - * Clamp the result of \p inst to the saturation range of its destination - * datatype. 
- */ -inline vec4_instruction * -set_saturate(bool saturate, vec4_instruction *inst) -{ - inst->saturate = saturate; - return inst; -} - -/** - * Return the number of dataflow registers written by the instruction (either - * fully or partially) counted from 'floor(reg_offset(inst->dst) / - * register_size)'. The somewhat arbitrary register size unit is 16B for the - * UNIFORM and IMM files and 32B for all other files. - */ -inline unsigned -regs_written(const vec4_instruction *inst) -{ - assert(inst->dst.file != UNIFORM && inst->dst.file != IMM); - return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + inst->size_written, - REG_SIZE); -} - -/** - * Return the number of dataflow registers read by the instruction (either - * fully or partially) counted from 'floor(reg_offset(inst->src[i]) / - * register_size)'. The somewhat arbitrary register size unit is 16B for the - * UNIFORM and IMM files and 32B for all other files. - */ -inline unsigned -regs_read(const vec4_instruction *inst, unsigned i) -{ - const unsigned reg_size = - inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 16 : REG_SIZE; - return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + inst->size_read(i), - reg_size); -} - -static inline enum brw_reg_type -get_exec_type(const vec4_instruction *inst) -{ - enum brw_reg_type exec_type = BRW_REGISTER_TYPE_B; - - for (int i = 0; i < 3; i++) { - if (inst->src[i].file != BAD_FILE) { - const brw_reg_type t = get_exec_type(brw_reg_type(inst->src[i].type)); - if (type_sz(t) > type_sz(exec_type)) - exec_type = t; - else if (type_sz(t) == type_sz(exec_type) && - brw_reg_type_is_floating_point(t)) - exec_type = t; - } - } - - if (exec_type == BRW_REGISTER_TYPE_B) - exec_type = inst->dst.type; - - /* TODO: We need to handle half-float conversions. 
*/ - assert(exec_type != BRW_REGISTER_TYPE_HF || - inst->dst.type == BRW_REGISTER_TYPE_HF); - assert(exec_type != BRW_REGISTER_TYPE_B); - - return exec_type; -} - -static inline unsigned -get_exec_type_size(const vec4_instruction *inst) -{ - return type_sz(get_exec_type(inst)); -} - -} /* namespace brw */ - -#endif diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp index 01d1243bc77..4bb50369ec2 100644 --- a/src/intel/compiler/brw_schedule_instructions.cpp +++ b/src/intel/compiler/brw_schedule_instructions.cpp @@ -28,7 +28,6 @@ #include "brw_eu.h" #include "brw_fs.h" #include "brw_fs_live_variables.h" -#include "brw_vec4.h" #include "brw_cfg.h" #include "brw_shader.h" #include @@ -1027,25 +1026,6 @@ fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be) return benefit; } -class vec4_instruction_scheduler : public instruction_scheduler -{ -public: - vec4_instruction_scheduler(void *mem_ctx, const vec4_visitor *v, int grf_count); - void calculate_deps(); - schedule_node *choose_instruction_to_schedule(); - const vec4_visitor *v; - - void run(); -}; - -vec4_instruction_scheduler::vec4_instruction_scheduler(void *mem_ctx, const vec4_visitor *v, - int grf_count) - : instruction_scheduler(mem_ctx, v, grf_count, /* grf_write_scale */ 1, - /* post_reg_alloc */ true), - v(v) -{ -} - void instruction_scheduler::set_current_block(bblock_t *block) { @@ -1534,179 +1514,6 @@ fs_instruction_scheduler::calculate_deps() clear_last_grf_write(); } -void -vec4_instruction_scheduler::calculate_deps() -{ - schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->ver)]; - schedule_node *last_conditional_mod = NULL; - schedule_node *last_accumulator_write = NULL; - /* Fixed HW registers are assumed to be separate from the virtual - * GRFs, so they can be tracked separately. We don't really write - * to fixed GRFs much, so don't bother tracking them on a more - * granular level. 
- */ - schedule_node *last_fixed_grf_write = NULL; - - memset(last_grf_write, 0, grf_count * sizeof(*last_grf_write)); - memset(last_mrf_write, 0, sizeof(last_mrf_write)); - - /* top-to-bottom dependencies: RAW and WAW. */ - for (schedule_node *n = current.start; n < current.end; n++) { - vec4_instruction *inst = (vec4_instruction *)n->inst; - - if (is_scheduling_barrier(inst)) - add_barrier_deps(n); - - /* read-after-write deps. */ - for (int i = 0; i < 3; i++) { - if (inst->src[i].file == VGRF) { - for (unsigned j = 0; j < regs_read(inst, i); ++j) - add_dep(last_grf_write[inst->src[i].nr + j], n); - } else if (inst->src[i].file == FIXED_GRF) { - add_dep(last_fixed_grf_write, n); - } else if (inst->src[i].is_accumulator()) { - assert(last_accumulator_write); - add_dep(last_accumulator_write, n); - } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) { - add_barrier_deps(n); - } - } - - if (inst->reads_g0_implicitly()) - add_dep(last_fixed_grf_write, n); - - if (!inst->is_send_from_grf()) { - for (int i = 0; i < inst->mlen; i++) { - /* It looks like the MRF regs are released in the send - * instruction once it's sent, not when the result comes - * back. - */ - add_dep(last_mrf_write[inst->base_mrf + i], n); - } - } - - if (inst->reads_flag()) { - assert(last_conditional_mod); - add_dep(last_conditional_mod, n); - } - - if (inst->reads_accumulator_implicitly()) { - assert(last_accumulator_write); - add_dep(last_accumulator_write, n); - } - - /* write-after-write deps. 
*/ - if (inst->dst.file == VGRF) { - for (unsigned j = 0; j < regs_written(inst); ++j) { - add_dep(last_grf_write[inst->dst.nr + j], n); - last_grf_write[inst->dst.nr + j] = n; - } - } else if (inst->dst.file == MRF) { - add_dep(last_mrf_write[inst->dst.nr], n); - last_mrf_write[inst->dst.nr] = n; - } else if (inst->dst.file == FIXED_GRF) { - add_dep(last_fixed_grf_write, n); - last_fixed_grf_write = n; - } else if (inst->dst.is_accumulator()) { - add_dep(last_accumulator_write, n); - last_accumulator_write = n; - } else if (inst->dst.file == ARF && !inst->dst.is_null()) { - add_barrier_deps(n); - } - - if (inst->mlen > 0 && !inst->is_send_from_grf()) { - for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { - add_dep(last_mrf_write[inst->base_mrf + i], n); - last_mrf_write[inst->base_mrf + i] = n; - } - } - - if (inst->writes_flag(v->devinfo)) { - add_dep(last_conditional_mod, n, 0); - last_conditional_mod = n; - } - - if (inst->writes_accumulator_implicitly(v->devinfo) && - !inst->dst.is_accumulator()) { - add_dep(last_accumulator_write, n); - last_accumulator_write = n; - } - } - - /* bottom-to-top dependencies: WAR */ - memset(last_grf_write, 0, grf_count * sizeof(*last_grf_write)); - memset(last_mrf_write, 0, sizeof(last_mrf_write)); - last_conditional_mod = NULL; - last_accumulator_write = NULL; - last_fixed_grf_write = NULL; - - for (schedule_node *n = current.end - 1; n >= current.start; n--) { - vec4_instruction *inst = (vec4_instruction *)n->inst; - - /* write-after-read deps. 
*/ - for (int i = 0; i < 3; i++) { - if (inst->src[i].file == VGRF) { - for (unsigned j = 0; j < regs_read(inst, i); ++j) - add_dep(n, last_grf_write[inst->src[i].nr + j]); - } else if (inst->src[i].file == FIXED_GRF) { - add_dep(n, last_fixed_grf_write); - } else if (inst->src[i].is_accumulator()) { - add_dep(n, last_accumulator_write); - } else if (inst->src[i].file == ARF && !inst->src[i].is_null()) { - add_barrier_deps(n); - } - } - - if (!inst->is_send_from_grf()) { - for (int i = 0; i < inst->mlen; i++) { - /* It looks like the MRF regs are released in the send - * instruction once it's sent, not when the result comes - * back. - */ - add_dep(n, last_mrf_write[inst->base_mrf + i], 2); - } - } - - if (inst->reads_flag()) { - add_dep(n, last_conditional_mod); - } - - if (inst->reads_accumulator_implicitly()) { - add_dep(n, last_accumulator_write); - } - - /* Update the things this instruction wrote, so earlier reads - * can mark this as WAR dependency. - */ - if (inst->dst.file == VGRF) { - for (unsigned j = 0; j < regs_written(inst); ++j) - last_grf_write[inst->dst.nr + j] = n; - } else if (inst->dst.file == MRF) { - last_mrf_write[inst->dst.nr] = n; - } else if (inst->dst.file == FIXED_GRF) { - last_fixed_grf_write = n; - } else if (inst->dst.is_accumulator()) { - last_accumulator_write = n; - } else if (inst->dst.file == ARF && !inst->dst.is_null()) { - add_barrier_deps(n); - } - - if (inst->mlen > 0 && !inst->is_send_from_grf()) { - for (unsigned i = 0; i < inst->implied_mrf_writes(); i++) { - last_mrf_write[inst->base_mrf + i] = n; - } - } - - if (inst->writes_flag(v->devinfo)) { - last_conditional_mod = n; - } - - if (inst->writes_accumulator_implicitly(v->devinfo)) { - last_accumulator_write = n; - } - } -} - schedule_node * fs_instruction_scheduler::choose_instruction_to_schedule() { @@ -1837,25 +1644,6 @@ fs_instruction_scheduler::choose_instruction_to_schedule() return chosen; } -schedule_node * 
-vec4_instruction_scheduler::choose_instruction_to_schedule() -{ - schedule_node *chosen = NULL; - int chosen_time = 0; - - /* Of the instructions ready to execute or the closest to being ready, - * choose the oldest one. - */ - foreach_in_list(schedule_node, n, &current.available) { - if (!chosen || n->tmp.unblocked_time < chosen_time) { - chosen = n; - chosen_time = n->tmp.unblocked_time; - } - } - - return chosen; -} - int fs_instruction_scheduler::calculate_issue_time(backend_instruction *inst0) { @@ -2009,41 +1797,6 @@ fs_instruction_scheduler::run(instruction_scheduler_mode mode) } } -void -vec4_instruction_scheduler::run() -{ - foreach_block(block, v->cfg) { - set_current_block(block); - - for (schedule_node *n = current.start; n < current.end; n++) { - /* We always execute as two vec4s in parallel. */ - n->issue_time = 2; - } - - calculate_deps(); - - compute_delays(); - compute_exits(); - - assert(current.available.is_empty()); - for (schedule_node *n = current.start; n < current.end; n++) { - reset_node_tmp(n); - - /* Add DAG heads to the list of available instructions. 
*/ - if (n->tmp.parent_count == 0) - current.available.push_tail(n); - } - - current.block->instructions.make_empty(); - - while (!current.available.is_empty()) { - schedule_node *chosen = choose_instruction_to_schedule(); - schedule(chosen); - update_children(chosen); - } - } -} - fs_instruction_scheduler * fs_visitor::prepare_scheduler(void *mem_ctx) { @@ -2082,16 +1835,3 @@ fs_visitor::schedule_instructions_post_ra() invalidate_analysis(DEPENDENCY_INSTRUCTIONS); } - -void -vec4_visitor::opt_schedule_instructions() -{ - void *mem_ctx = ralloc_context(NULL); - - vec4_instruction_scheduler sched(mem_ctx, this, prog_data->total_grf); - sched.run(); - - ralloc_free(mem_ctx); - - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); -} diff --git a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h index fbd50c07e7e..18c867841cf 100644 --- a/src/intel/compiler/brw_shader.h +++ b/src/intel/compiler/brw_shader.h @@ -114,9 +114,6 @@ extern "C" { /* brw_fs_reg_allocate.cpp */ void brw_fs_alloc_reg_sets(struct brw_compiler *compiler); -/* brw_vec4_reg_allocate.cpp */ -void brw_vec4_alloc_reg_set(struct brw_compiler *compiler); - /* brw_disasm.c */ extern const char *const conditional_modifier[16]; extern const char *const pred_ctrl_align16[16]; diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp deleted file mode 100644 index 3473ef59bd7..00000000000 --- a/src/intel/compiler/brw_vec4.cpp +++ /dev/null @@ -1,2542 +0,0 @@ -/* - * Copyright © 2011 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above 
copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "brw_vec4.h" -#include "brw_cfg.h" -#include "brw_vec4_builder.h" -#include "brw_vec4_vs.h" -#include "brw_dead_control_flow.h" -#include "dev/intel_debug.h" - -#define MAX_INSTRUCTION (1 << 30) - -using namespace brw; - -namespace brw { - -void -src_reg::init() -{ - memset((void*)this, 0, sizeof(*this)); - this->file = BAD_FILE; - this->type = BRW_REGISTER_TYPE_UD; -} - -src_reg::src_reg(enum brw_reg_file file, int nr, const glsl_type *type) -{ - init(); - - this->file = file; - this->nr = nr; - if (type && (glsl_type_is_scalar(type) || glsl_type_is_vector(type) || glsl_type_is_matrix(type))) - this->swizzle = brw_swizzle_for_size(type->vector_elements); - else - this->swizzle = BRW_SWIZZLE_XYZW; - if (type) - this->type = brw_type_for_base_type(type); -} - -/** Generic unset register constructor. 
*/ -src_reg::src_reg() -{ - init(); -} - -src_reg::src_reg(struct ::brw_reg reg) : - backend_reg(reg) -{ - this->offset = 0; - this->reladdr = NULL; -} - -src_reg::src_reg(const dst_reg &reg) : - backend_reg(reg) -{ - this->reladdr = reg.reladdr; - this->swizzle = brw_swizzle_for_mask(reg.writemask); -} - -void -dst_reg::init() -{ - memset((void*)this, 0, sizeof(*this)); - this->file = BAD_FILE; - this->type = BRW_REGISTER_TYPE_UD; - this->writemask = WRITEMASK_XYZW; -} - -dst_reg::dst_reg() -{ - init(); -} - -dst_reg::dst_reg(enum brw_reg_file file, int nr) -{ - init(); - - this->file = file; - this->nr = nr; -} - -dst_reg::dst_reg(enum brw_reg_file file, int nr, const glsl_type *type, - unsigned writemask) -{ - init(); - - this->file = file; - this->nr = nr; - this->type = brw_type_for_base_type(type); - this->writemask = writemask; -} - -dst_reg::dst_reg(enum brw_reg_file file, int nr, brw_reg_type type, - unsigned writemask) -{ - init(); - - this->file = file; - this->nr = nr; - this->type = type; - this->writemask = writemask; -} - -dst_reg::dst_reg(struct ::brw_reg reg) : - backend_reg(reg) -{ - this->offset = 0; - this->reladdr = NULL; -} - -dst_reg::dst_reg(const src_reg &reg) : - backend_reg(reg) -{ - this->writemask = brw_mask_for_swizzle(reg.swizzle); - this->reladdr = reg.reladdr; -} - -bool -dst_reg::equals(const dst_reg &r) const -{ - return (this->backend_reg::equals(r) && - (reladdr == r.reladdr || - (reladdr && r.reladdr && reladdr->equals(*r.reladdr)))); -} - -bool -vec4_instruction::is_send_from_grf() const -{ - switch (opcode) { - case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7: - case VEC4_OPCODE_UNTYPED_ATOMIC: - case VEC4_OPCODE_UNTYPED_SURFACE_READ: - case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: - case VEC4_OPCODE_URB_READ: - case VEC4_TCS_OPCODE_URB_WRITE: - case TCS_OPCODE_RELEASE_INPUT: - case SHADER_OPCODE_BARRIER: - return true; - default: - return false; - } -} - -/** - * Returns true if this instruction's sources and destinations cannot - * safely be the
same register. - * - * In most cases, a register can be written over safely by the same - * instruction that is its last use. For a single instruction, the - * sources are dereferenced before writing of the destination starts - * (naturally). - * - * However, there are a few cases where this can be problematic: - * - * - Virtual opcodes that translate to multiple instructions in the - * code generator: if src == dst and one instruction writes the - * destination before a later instruction reads the source, then - * src will have been clobbered. - * - * The register allocator uses this information to set up conflicts between - * GRF sources and the destination. - */ -bool -vec4_instruction::has_source_and_destination_hazard() const -{ - switch (opcode) { - case VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS: - case VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: - case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: - return true; - default: - /* 8-wide compressed DF operations are executed as two 4-wide operations, - * so we have a src/dst hazard if the first half of the instruction - * overwrites the source of the second half. Prevent this by marking - * compressed instructions as having src/dst hazards, so the register - * allocator assigns safe register regions for dst and srcs. - */ - return size_written > REG_SIZE; - } -} - -unsigned -vec4_instruction::size_read(unsigned arg) const -{ - switch (opcode) { - case VEC4_OPCODE_UNTYPED_ATOMIC: - case VEC4_OPCODE_UNTYPED_SURFACE_READ: - case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: - case VEC4_TCS_OPCODE_URB_WRITE: - if (arg == 0) - return mlen * REG_SIZE; - break; - case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7: - if (arg == 1) - return mlen * REG_SIZE; - break; - default: - break; - } - - switch (src[arg].file) { - case BAD_FILE: - return 0; - case IMM: - case UNIFORM: - return 4 * type_sz(src[arg].type); - default: - /* XXX - Represent actual vertical stride. 
*/ - return exec_size * type_sz(src[arg].type); - } -} - -bool -vec4_instruction::can_do_source_mods(const struct intel_device_info *devinfo) -{ - if (devinfo->ver == 6 && is_math()) - return false; - - if (is_send_from_grf()) - return false; - - if (!backend_instruction::can_do_source_mods()) - return false; - - return true; -} - -bool -vec4_instruction::can_do_cmod() -{ - if (!backend_instruction::can_do_cmod()) - return false; - - /* The accumulator result appears to get used for the conditional modifier - * generation. When negating a UD value, there is a 33rd bit generated for - * the sign in the accumulator value, so now you can't check, for example, - * equality with a 32-bit value. See piglit fs-op-neg-uvec4. - */ - for (unsigned i = 0; i < 3; i++) { - if (src[i].file != BAD_FILE && - brw_reg_type_is_unsigned_integer(src[i].type) && src[i].negate) - return false; - } - - return true; -} - -bool -vec4_instruction::can_do_writemask(const struct intel_device_info *devinfo) -{ - switch (opcode) { - case SHADER_OPCODE_GFX4_SCRATCH_READ: - case VEC4_OPCODE_DOUBLE_TO_F32: - case VEC4_OPCODE_DOUBLE_TO_D32: - case VEC4_OPCODE_DOUBLE_TO_U32: - case VEC4_OPCODE_TO_DOUBLE: - case VEC4_OPCODE_PICK_LOW_32BIT: - case VEC4_OPCODE_PICK_HIGH_32BIT: - case VEC4_OPCODE_SET_LOW_32BIT: - case VEC4_OPCODE_SET_HIGH_32BIT: - case VS_OPCODE_PULL_CONSTANT_LOAD: - case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7: - case VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS: - case VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: - case TES_OPCODE_CREATE_INPUT_READ_HEADER: - case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: - case VEC4_OPCODE_URB_READ: - case SHADER_OPCODE_MOV_INDIRECT: - case SHADER_OPCODE_TEX: - case FS_OPCODE_TXB: - case SHADER_OPCODE_TXD: - case SHADER_OPCODE_TXF: - case SHADER_OPCODE_TXF_LZ: - case SHADER_OPCODE_TXF_CMS: - case SHADER_OPCODE_TXF_CMS_W: - case SHADER_OPCODE_TXF_UMS: - case SHADER_OPCODE_TXF_MCS: - case SHADER_OPCODE_TXL: - case SHADER_OPCODE_TXL_LZ: - case SHADER_OPCODE_TXS: - case 
SHADER_OPCODE_LOD: - case SHADER_OPCODE_TG4: - case SHADER_OPCODE_TG4_OFFSET: - case SHADER_OPCODE_SAMPLEINFO: - return false; - default: - /* The MATH instruction on Gfx6 only executes in align1 mode, which does - * not support writemasking. - */ - if (devinfo->ver == 6 && is_math()) - return false; - - return true; - } -} - -bool -vec4_instruction::can_change_types() const -{ - return dst.type == src[0].type && - !src[0].abs && !src[0].negate && !saturate && - (opcode == BRW_OPCODE_MOV || - (opcode == BRW_OPCODE_SEL && - dst.type == src[1].type && - predicate != BRW_PREDICATE_NONE && - !src[1].abs && !src[1].negate)); -} - -/** - * Returns how many MRFs an opcode will write over. - * - * Note that this is not the 0 or 1 implied writes in an actual gen - * instruction -- the generate_* functions generate additional MOVs - * for setup. - */ -unsigned -vec4_instruction::implied_mrf_writes() const -{ - if (mlen == 0 || is_send_from_grf()) - return 0; - - switch (opcode) { - case SHADER_OPCODE_RCP: - case SHADER_OPCODE_RSQ: - case SHADER_OPCODE_SQRT: - case SHADER_OPCODE_EXP2: - case SHADER_OPCODE_LOG2: - case SHADER_OPCODE_SIN: - case SHADER_OPCODE_COS: - return 1; - case SHADER_OPCODE_INT_QUOTIENT: - case SHADER_OPCODE_INT_REMAINDER: - case SHADER_OPCODE_POW: - case TCS_OPCODE_THREAD_END: - return 2; - case VEC4_VS_OPCODE_URB_WRITE: - return 1; - case VS_OPCODE_PULL_CONSTANT_LOAD: - return 2; - case SHADER_OPCODE_GFX4_SCRATCH_READ: - return 2; - case SHADER_OPCODE_GFX4_SCRATCH_WRITE: - return 3; - case VEC4_GS_OPCODE_URB_WRITE: - case VEC4_GS_OPCODE_URB_WRITE_ALLOCATE: - case GS_OPCODE_THREAD_END: - return 0; - case GS_OPCODE_FF_SYNC: - return 1; - case VEC4_TCS_OPCODE_URB_WRITE: - return 0; - case SHADER_OPCODE_TEX: - case SHADER_OPCODE_TXL: - case SHADER_OPCODE_TXD: - case SHADER_OPCODE_TXF: - case SHADER_OPCODE_TXF_CMS: - case SHADER_OPCODE_TXF_CMS_W: - case SHADER_OPCODE_TXF_MCS: - case SHADER_OPCODE_TXS: - case SHADER_OPCODE_TG4: - case 
SHADER_OPCODE_TG4_OFFSET: - case SHADER_OPCODE_SAMPLEINFO: - case SHADER_OPCODE_GET_BUFFER_SIZE: - return header_size; - default: - unreachable("not reached"); - } -} - -bool -src_reg::equals(const src_reg &r) const -{ - return (this->backend_reg::equals(r) && - !reladdr && !r.reladdr); -} - -bool -src_reg::negative_equals(const src_reg &r) const -{ - return this->backend_reg::negative_equals(r) && - !reladdr && !r.reladdr; -} - -bool -vec4_visitor::opt_vector_float() -{ - bool progress = false; - - foreach_block(block, cfg) { - unsigned last_reg = ~0u, last_offset = ~0u; - enum brw_reg_file last_reg_file = BAD_FILE; - - uint8_t imm[4] = { 0 }; - int inst_count = 0; - vec4_instruction *imm_inst[4]; - unsigned writemask = 0; - enum brw_reg_type dest_type = BRW_REGISTER_TYPE_F; - - foreach_inst_in_block_safe(vec4_instruction, inst, block) { - int vf = -1; - enum brw_reg_type need_type = BRW_REGISTER_TYPE_LAST; - - /* Look for unconditional MOVs from an immediate with a partial - * writemask. Skip type-conversion MOVs other than integer 0, - * where the type doesn't matter. See if the immediate can be - * represented as a VF. - */ - if (inst->opcode == BRW_OPCODE_MOV && - inst->src[0].file == IMM && - inst->predicate == BRW_PREDICATE_NONE && - inst->dst.writemask != WRITEMASK_XYZW && - type_sz(inst->src[0].type) < 8 && - (inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) { - - vf = brw_float_to_vf(inst->src[0].d); - need_type = BRW_REGISTER_TYPE_D; - - if (vf == -1) { - vf = brw_float_to_vf(inst->src[0].f); - need_type = BRW_REGISTER_TYPE_F; - } - } else { - last_reg = ~0u; - } - - /* If this wasn't a MOV, or the destination register doesn't match, - * or we have to switch destination types, then this breaks our - * sequence. Combine anything we've accumulated so far. 
- */ - if (last_reg != inst->dst.nr || - last_offset != inst->dst.offset || - last_reg_file != inst->dst.file || - (vf > 0 && dest_type != need_type)) { - - if (inst_count > 1) { - unsigned vf; - memcpy(&vf, imm, sizeof(vf)); - vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf)); - mov->dst.type = dest_type; - mov->dst.writemask = writemask; - inst->insert_before(block, mov); - - for (int i = 0; i < inst_count; i++) { - imm_inst[i]->remove(block); - } - - progress = true; - } - - inst_count = 0; - last_reg = ~0u;; - writemask = 0; - dest_type = BRW_REGISTER_TYPE_F; - - for (int i = 0; i < 4; i++) { - imm[i] = 0; - } - } - - /* Record this instruction's value (if it was representable). */ - if (vf != -1) { - if ((inst->dst.writemask & WRITEMASK_X) != 0) - imm[0] = vf; - if ((inst->dst.writemask & WRITEMASK_Y) != 0) - imm[1] = vf; - if ((inst->dst.writemask & WRITEMASK_Z) != 0) - imm[2] = vf; - if ((inst->dst.writemask & WRITEMASK_W) != 0) - imm[3] = vf; - - writemask |= inst->dst.writemask; - imm_inst[inst_count++] = inst; - - last_reg = inst->dst.nr; - last_offset = inst->dst.offset; - last_reg_file = inst->dst.file; - if (vf > 0) - dest_type = need_type; - } - } - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} - -/* Replaces unused channels of a swizzle with channels that are used. - * - * For instance, this pass transforms - * - * mov vgrf4.yz, vgrf5.wxzy - * - * into - * - * mov vgrf4.yz, vgrf5.xxzx - * - * This eliminates false uses of some channels, letting dead code elimination - * remove the instructions that wrote them. - */ -bool -vec4_visitor::opt_reduce_swizzle() -{ - bool progress = false; - - foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { - if (inst->dst.file == BAD_FILE || - inst->dst.file == ARF || - inst->dst.file == FIXED_GRF || - inst->is_send_from_grf()) - continue; - - unsigned swizzle; - - /* Determine which channels of the sources are read. 
*/ - switch (inst->opcode) { - case VEC4_OPCODE_PACK_BYTES: - case BRW_OPCODE_DP4: - case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0, - * but all four of src1. - */ - swizzle = brw_swizzle_for_size(4); - break; - case BRW_OPCODE_DP3: - swizzle = brw_swizzle_for_size(3); - break; - case BRW_OPCODE_DP2: - swizzle = brw_swizzle_for_size(2); - break; - - case VEC4_OPCODE_TO_DOUBLE: - case VEC4_OPCODE_DOUBLE_TO_F32: - case VEC4_OPCODE_DOUBLE_TO_D32: - case VEC4_OPCODE_DOUBLE_TO_U32: - case VEC4_OPCODE_PICK_LOW_32BIT: - case VEC4_OPCODE_PICK_HIGH_32BIT: - case VEC4_OPCODE_SET_LOW_32BIT: - case VEC4_OPCODE_SET_HIGH_32BIT: - swizzle = brw_swizzle_for_size(4); - break; - - default: - swizzle = brw_swizzle_for_mask(inst->dst.writemask); - break; - } - - /* Update sources' swizzles. */ - for (int i = 0; i < 3; i++) { - if (inst->src[i].file != VGRF && - inst->src[i].file != ATTR && - inst->src[i].file != UNIFORM) - continue; - - const unsigned new_swizzle = - brw_compose_swizzle(swizzle, inst->src[i].swizzle); - if (inst->src[i].swizzle != new_swizzle) { - inst->src[i].swizzle = new_swizzle; - progress = true; - } - } - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); - - return progress; -} - -void -vec4_visitor::split_uniform_registers() -{ - /* Prior to this, uniforms have been in an array sized according to - * the number of vector uniforms present, sparsely filled (so an - * aggregate results in reg indices being skipped over). Now we're - * going to cut those aggregates up so each .nr index is one - * vector. The goal is to make elimination of unused uniform - * components easier later. 
- */ - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - for (int i = 0 ; i < 3; i++) { - if (inst->src[i].file != UNIFORM || inst->src[i].nr >= UBO_START) - continue; - - assert(!inst->src[i].reladdr); - - inst->src[i].nr += inst->src[i].offset / 16; - inst->src[i].offset %= 16; - } - } -} - -/** - * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a). - * - * While GLSL IR also performs this optimization, we end up with it in - * our instruction stream for a couple of reasons. One is that we - * sometimes generate silly instructions, for example in array access - * where we'll generate "ADD offset, index, base" even if base is 0. - * The other is that GLSL IR's constant propagation doesn't track the - * components of aggregates, so some VS patterns (initialize matrix to - * 0, accumulate in vertex blending factors) end up breaking down to - * instructions involving 0. - */ -bool -vec4_visitor::opt_algebraic() -{ - bool progress = false; - - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - switch (inst->opcode) { - case BRW_OPCODE_MOV: - if (inst->src[0].file != IMM) - break; - - if (inst->saturate) { - /* Full mixed-type saturates don't happen. However, we can end up - * with things like: - * - * mov.sat(8) g21<1>DF -1F - * - * Other mixed-size-but-same-base-type cases may also be possible. 
- */ - if (inst->dst.type != inst->src[0].type && - inst->dst.type != BRW_REGISTER_TYPE_DF && - inst->src[0].type != BRW_REGISTER_TYPE_F) - assert(!"unimplemented: saturate mixed types"); - - if (brw_saturate_immediate(inst->src[0].type, - &inst->src[0].as_brw_reg())) { - inst->saturate = false; - progress = true; - } - } - break; - - case BRW_OPCODE_OR: - if (inst->src[1].is_zero()) { - inst->opcode = BRW_OPCODE_MOV; - inst->src[1] = src_reg(); - progress = true; - } - break; - - case VEC4_OPCODE_UNPACK_UNIFORM: - if (inst->src[0].file != UNIFORM) { - inst->opcode = BRW_OPCODE_MOV; - progress = true; - } - break; - - case BRW_OPCODE_ADD: - if (inst->src[1].is_zero()) { - inst->opcode = BRW_OPCODE_MOV; - inst->src[1] = src_reg(); - progress = true; - } - break; - - case BRW_OPCODE_MUL: - if (inst->src[1].file != IMM) - continue; - - if (brw_reg_type_is_floating_point(inst->src[1].type)) - break; - - if (inst->src[1].is_zero()) { - inst->opcode = BRW_OPCODE_MOV; - switch (inst->src[0].type) { - case BRW_REGISTER_TYPE_F: - inst->src[0] = brw_imm_f(0.0f); - break; - case BRW_REGISTER_TYPE_D: - inst->src[0] = brw_imm_d(0); - break; - case BRW_REGISTER_TYPE_UD: - inst->src[0] = brw_imm_ud(0u); - break; - default: - unreachable("not reached"); - } - inst->src[1] = src_reg(); - progress = true; - } else if (inst->src[1].is_one()) { - inst->opcode = BRW_OPCODE_MOV; - inst->src[1] = src_reg(); - progress = true; - } else if (inst->src[1].is_negative_one()) { - inst->opcode = BRW_OPCODE_MOV; - inst->src[0].negate = !inst->src[0].negate; - inst->src[1] = src_reg(); - progress = true; - } - break; - case SHADER_OPCODE_BROADCAST: - if (is_uniform(inst->src[0]) || - inst->src[1].is_zero()) { - inst->opcode = BRW_OPCODE_MOV; - inst->src[1] = src_reg(); - inst->force_writemask_all = true; - progress = true; - } - break; - - default: - break; - } - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW | - DEPENDENCY_INSTRUCTION_DETAIL); - - return progress; -} 
- -/* Conditions for which we want to avoid setting the dependency control bits */ -bool -vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst) -{ -#define IS_DWORD(reg) \ - (reg.type == BRW_REGISTER_TYPE_UD || \ - reg.type == BRW_REGISTER_TYPE_D) - -#define IS_64BIT(reg) (reg.file != BAD_FILE && type_sz(reg.type) == 8) - - if (devinfo->ver >= 7) { - if (IS_64BIT(inst->dst) || IS_64BIT(inst->src[0]) || - IS_64BIT(inst->src[1]) || IS_64BIT(inst->src[2])) - return true; - } - -#undef IS_64BIT -#undef IS_DWORD - - /* - * mlen: - * In the presence of send messages, totally interrupt dependency - * control. They're long enough that the chance of dependency - * control around them just doesn't matter. - * - * predicate: - * From the Ivy Bridge PRM, volume 4 part 3.7, page 80: - * When a sequence of NoDDChk and NoDDClr are used, the last instruction that - * completes the scoreboard clear must have a non-zero execution mask. This - * means, if any kind of predication can change the execution mask or channel - * enable of the last instruction, the optimization must be avoided. This is - * to avoid instructions being shot down the pipeline when no writes are - * required. - * - * math: - * Dependency control does not work well over math instructions. - * NB: Discovered empirically - */ - return (inst->mlen || inst->predicate || inst->is_math()); -} - -/** - * Sets the dependency control fields on instructions after register - * allocation and before the generator is run. - * - * When you have a sequence of instructions like: - * - * DP4 temp.x vertex uniform[0] - * DP4 temp.y vertex uniform[0] - * DP4 temp.z vertex uniform[0] - * DP4 temp.w vertex uniform[0] - * - * The hardware doesn't know that it can actually run the later instructions - * while the previous ones are in flight, producing stalls. However, we have - * manual fields we can set in the instructions that let it do so. 
- */ -void -vec4_visitor::opt_set_dependency_control() -{ - vec4_instruction *last_grf_write[BRW_MAX_GRF]; - uint8_t grf_channels_written[BRW_MAX_GRF]; - vec4_instruction *last_mrf_write[BRW_MAX_GRF]; - uint8_t mrf_channels_written[BRW_MAX_GRF]; - - assert(prog_data->total_grf || - !"Must be called after register allocation"); - - foreach_block (block, cfg) { - memset(last_grf_write, 0, sizeof(last_grf_write)); - memset(last_mrf_write, 0, sizeof(last_mrf_write)); - - foreach_inst_in_block (vec4_instruction, inst, block) { - /* If we read from a register that we were doing dependency control - * on, don't do dependency control across the read. - */ - for (int i = 0; i < 3; i++) { - int reg = inst->src[i].nr + inst->src[i].offset / REG_SIZE; - if (inst->src[i].file == VGRF) { - last_grf_write[reg] = NULL; - } else if (inst->src[i].file == FIXED_GRF) { - memset(last_grf_write, 0, sizeof(last_grf_write)); - break; - } - assert(inst->src[i].file != MRF); - } - - if (is_dep_ctrl_unsafe(inst)) { - memset(last_grf_write, 0, sizeof(last_grf_write)); - memset(last_mrf_write, 0, sizeof(last_mrf_write)); - continue; - } - - /* Now, see if we can do dependency control for this instruction - * against a previous one writing to its destination. 
- */ - int reg = inst->dst.nr + inst->dst.offset / REG_SIZE; - if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) { - if (last_grf_write[reg] && - last_grf_write[reg]->dst.offset == inst->dst.offset && - !(inst->dst.writemask & grf_channels_written[reg])) { - last_grf_write[reg]->no_dd_clear = true; - inst->no_dd_check = true; - } else { - grf_channels_written[reg] = 0; - } - - last_grf_write[reg] = inst; - grf_channels_written[reg] |= inst->dst.writemask; - } else if (inst->dst.file == MRF) { - if (last_mrf_write[reg] && - last_mrf_write[reg]->dst.offset == inst->dst.offset && - !(inst->dst.writemask & mrf_channels_written[reg])) { - last_mrf_write[reg]->no_dd_clear = true; - inst->no_dd_check = true; - } else { - mrf_channels_written[reg] = 0; - } - - last_mrf_write[reg] = inst; - mrf_channels_written[reg] |= inst->dst.writemask; - } - } - } -} - -bool -vec4_instruction::can_reswizzle(const struct intel_device_info *devinfo, - int dst_writemask, - int swizzle, - int swizzle_mask) -{ - /* Gfx6 MATH instructions can not execute in align16 mode, so swizzles - * are not allowed. - */ - if (devinfo->ver == 6 && is_math() && swizzle != BRW_SWIZZLE_XYZW) - return false; - - /* If we write to the flag register changing the swizzle would change - * what channels are written to the flag register. - */ - if (writes_flag(devinfo)) - return false; - - /* We can't swizzle implicit accumulator access. We'd have to - * reswizzle the producer of the accumulator value in addition - * to the consumer (i.e. both MUL and MACH). Just skip this. - */ - if (reads_accumulator_implicitly()) - return false; - - if (!can_do_writemask(devinfo) && dst_writemask != WRITEMASK_XYZW) - return false; - - /* If this instruction sets anything not referenced by swizzle, then we'd - * totally break it when we reswizzle. 
- */ - if (dst.writemask & ~swizzle_mask) - return false; - - if (mlen > 0) - return false; - - for (int i = 0; i < 3; i++) { - if (src[i].is_accumulator()) - return false; - } - - return true; -} - -/** - * For any channels in the swizzle's source that were populated by this - * instruction, rewrite the instruction to put the appropriate result directly - * in those channels. - * - * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy z.yy_x - */ -void -vec4_instruction::reswizzle(int dst_writemask, int swizzle) -{ - /* Destination write mask doesn't correspond to source swizzle for the dot - * product and pack_bytes instructions. - */ - if (opcode != BRW_OPCODE_DP4 && opcode != BRW_OPCODE_DPH && - opcode != BRW_OPCODE_DP3 && opcode != BRW_OPCODE_DP2 && - opcode != VEC4_OPCODE_PACK_BYTES) { - for (int i = 0; i < 3; i++) { - if (src[i].file == BAD_FILE) - continue; - - if (src[i].file == IMM) { - assert(src[i].type != BRW_REGISTER_TYPE_V && - src[i].type != BRW_REGISTER_TYPE_UV); - - /* Vector immediate types need to be reswizzled. */ - if (src[i].type == BRW_REGISTER_TYPE_VF) { - const unsigned imm[] = { - (src[i].ud >> 0) & 0x0ff, - (src[i].ud >> 8) & 0x0ff, - (src[i].ud >> 16) & 0x0ff, - (src[i].ud >> 24) & 0x0ff, - }; - - src[i] = brw_imm_vf4(imm[BRW_GET_SWZ(swizzle, 0)], - imm[BRW_GET_SWZ(swizzle, 1)], - imm[BRW_GET_SWZ(swizzle, 2)], - imm[BRW_GET_SWZ(swizzle, 3)]); - } - - continue; - } - - src[i].swizzle = brw_compose_swizzle(swizzle, src[i].swizzle); - } - } - - /* Apply the specified swizzle and writemask to the original mask of - * written components. - */ - dst.writemask = dst_writemask & - brw_apply_swizzle_to_mask(swizzle, dst.writemask); -} - -/* - * Tries to reduce extra MOV instructions by taking temporary GRFs that get - * just written and then MOVed into another reg and making the original write - * of the GRF write directly to the final destination instead. 
- */ -bool -vec4_visitor::opt_register_coalesce() -{ - bool progress = false; - int next_ip = 0; - const vec4_live_variables &live = live_analysis.require(); - - foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) { - int ip = next_ip; - next_ip++; - - if (inst->opcode != BRW_OPCODE_MOV || - (inst->dst.file != VGRF && inst->dst.file != MRF) || - inst->predicate || - inst->src[0].file != VGRF || - inst->dst.type != inst->src[0].type || - inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr) - continue; - - /* Remove no-op MOVs */ - if (inst->dst.file == inst->src[0].file && - inst->dst.nr == inst->src[0].nr && - inst->dst.offset == inst->src[0].offset) { - bool is_nop_mov = true; - - for (unsigned c = 0; c < 4; c++) { - if ((inst->dst.writemask & (1 << c)) == 0) - continue; - - if (BRW_GET_SWZ(inst->src[0].swizzle, c) != c) { - is_nop_mov = false; - break; - } - } - - if (is_nop_mov) { - inst->remove(block); - progress = true; - continue; - } - } - - bool to_mrf = (inst->dst.file == MRF); - - /* Can't coalesce this GRF if someone else was going to - * read it later. - */ - if (live.var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip) - continue; - - /* We need to check interference with the final destination between this - * instruction and the earliest instruction involved in writing the GRF - * we're eliminating. To do that, keep track of which of our source - * channels we've seen initialized. - */ - const unsigned chans_needed = - brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle, - inst->dst.writemask); - unsigned chans_remaining = chans_needed; - - /* Now walk up the instruction stream trying to see if we can rewrite - * everything writing to the temporary to write into the destination - * instead. 
- */ - vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev; - foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, - inst) { - _scan_inst = scan_inst; - - if (regions_overlap(inst->src[0], inst->size_read(0), - scan_inst->dst, scan_inst->size_written)) { - /* Found something writing to the reg we want to coalesce away. */ - if (to_mrf) { - /* SEND instructions can't have MRF as a destination. */ - if (scan_inst->mlen) - break; - - if (devinfo->ver == 6) { - /* gfx6 math instructions must have the destination be - * VGRF, so no compute-to-MRF for them. - */ - if (scan_inst->is_math()) { - break; - } - } - } - - /* VS_OPCODE_UNPACK_FLAGS_SIMD4X2 generates a bunch of mov(1) - * instructions, and this optimization pass is not capable of - * handling that. Bail on these instructions and hope that some - * later optimization pass can do the right thing after they are - * expanded. - */ - if (scan_inst->opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2) - break; - - /* This doesn't handle saturation on the instruction we - * want to coalesce away if the register types do not match. - * But if scan_inst is a non type-converting 'mov', we can fix - * the types later. - */ - if (inst->saturate && - inst->dst.type != scan_inst->dst.type && - !(scan_inst->opcode == BRW_OPCODE_MOV && - scan_inst->dst.type == scan_inst->src[0].type)) - break; - - /* Only allow coalescing between registers of the same type size. - * Otherwise we would need to make the pass aware of the fact that - * channel sizes are different for single and double precision. - */ - if (type_sz(inst->src[0].type) != type_sz(scan_inst->src[0].type)) - break; - - /* Check that scan_inst writes the same amount of data as the - * instruction, otherwise coalescing would lead to writing a - * different (larger or smaller) region of the destination - */ - if (scan_inst->size_written != inst->size_written) - break; - - /* If we can't handle the swizzle, bail. 
*/ - if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask, - inst->src[0].swizzle, - chans_needed)) { - break; - } - - /* This only handles coalescing writes of 8 channels (1 register - * for single-precision and 2 registers for double-precision) - * starting at the source offset of the copy instruction. - */ - if (DIV_ROUND_UP(scan_inst->size_written, - type_sz(scan_inst->dst.type)) > 8 || - scan_inst->dst.offset != inst->src[0].offset) - break; - - /* Mark which channels we found unconditional writes for. */ - if (!scan_inst->predicate) - chans_remaining &= ~scan_inst->dst.writemask; - - if (chans_remaining == 0) - break; - } - - /* You can't read from an MRF, so if someone else reads our MRF's - * source GRF that we wanted to rewrite, that stops us. If it's a - * GRF we're trying to coalesce to, we don't actually handle - * rewriting sources so bail in that case as well. - */ - bool interfered = false; - for (int i = 0; i < 3; i++) { - if (regions_overlap(inst->src[0], inst->size_read(0), - scan_inst->src[i], scan_inst->size_read(i))) - interfered = true; - } - if (interfered) - break; - - /* If somebody else writes the same channels of our destination here, - * we can't coalesce before that. - */ - if (regions_overlap(inst->dst, inst->size_written, - scan_inst->dst, scan_inst->size_written) && - (inst->dst.writemask & scan_inst->dst.writemask) != 0) { - break; - } - - /* Check for reads of the register we're trying to coalesce into. We - * can't go rewriting instructions above that to put some other value - * in the register instead. 
- */ - if (to_mrf && scan_inst->mlen > 0) { - unsigned start = scan_inst->base_mrf; - unsigned end = scan_inst->base_mrf + scan_inst->mlen; - - if (inst->dst.nr >= start && inst->dst.nr < end) { - break; - } - } else { - for (int i = 0; i < 3; i++) { - if (regions_overlap(inst->dst, inst->size_written, - scan_inst->src[i], scan_inst->size_read(i))) - interfered = true; - } - if (interfered) - break; - } - } - - if (chans_remaining == 0) { - /* If we've made it here, we have an MOV we want to coalesce out, and - * a scan_inst pointing to the earliest instruction involved in - * computing the value. Now go rewrite the instruction stream - * between the two. - */ - vec4_instruction *scan_inst = _scan_inst; - while (scan_inst != inst) { - if (scan_inst->dst.file == VGRF && - scan_inst->dst.nr == inst->src[0].nr && - scan_inst->dst.offset == inst->src[0].offset) { - scan_inst->reswizzle(inst->dst.writemask, - inst->src[0].swizzle); - scan_inst->dst.file = inst->dst.file; - scan_inst->dst.nr = inst->dst.nr; - scan_inst->dst.offset = inst->dst.offset; - if (inst->saturate && - inst->dst.type != scan_inst->dst.type) { - /* If we have reached this point, scan_inst is a non - * type-converting 'mov' and we can modify its register types - * to match the ones in inst. Otherwise, we could have an - * incorrect saturation result. - */ - scan_inst->dst.type = inst->dst.type; - scan_inst->src[0].type = inst->src[0].type; - } - scan_inst->saturate |= inst->saturate; - } - scan_inst = (vec4_instruction *)scan_inst->next; - } - inst->remove(block); - progress = true; - } - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} - -/** - * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control - * flow. We could probably do better here with some form of divergence - * analysis. 
- */ -bool -vec4_visitor::eliminate_find_live_channel() -{ - bool progress = false; - unsigned depth = 0; - - if (!brw_stage_has_packed_dispatch(devinfo, stage, 0, stage_prog_data)) { - /* The optimization below assumes that channel zero is live on thread - * dispatch, which may not be the case if the fixed function dispatches - * threads sparsely. - */ - return false; - } - - foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { - switch (inst->opcode) { - case BRW_OPCODE_IF: - case BRW_OPCODE_DO: - depth++; - break; - - case BRW_OPCODE_ENDIF: - case BRW_OPCODE_WHILE: - depth--; - break; - - case SHADER_OPCODE_FIND_LIVE_CHANNEL: - if (depth == 0) { - inst->opcode = BRW_OPCODE_MOV; - inst->src[0] = brw_imm_d(0); - inst->force_writemask_all = true; - progress = true; - } - break; - - default: - break; - } - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL); - - return progress; -} - -/** - * Splits virtual GRFs requesting more than one contiguous physical register. - * - * We initially create large virtual GRFs for temporary structures, arrays, - * and matrices, so that the visitor functions can add offsets to work their - * way down to the actual member being accessed. But when it comes to - * optimization, we'd like to treat each register as individual storage if - * possible. - * - * So far, the only thing that might prevent splitting is a send message from - * a GRF on IVB. - */ -void -vec4_visitor::split_virtual_grfs() -{ - int num_vars = this->alloc.count; - int new_virtual_grf[num_vars]; - bool split_grf[num_vars]; - - memset(new_virtual_grf, 0, sizeof(new_virtual_grf)); - - /* Try to split anything > 0 sized. */ - for (int i = 0; i < num_vars; i++) { - split_grf[i] = this->alloc.sizes[i] != 1; - } - - /* Check that the instructions are compatible with the registers we're trying - * to split. 
- */ - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - if (inst->dst.file == VGRF && regs_written(inst) > 1) - split_grf[inst->dst.nr] = false; - - for (int i = 0; i < 3; i++) { - if (inst->src[i].file == VGRF && regs_read(inst, i) > 1) - split_grf[inst->src[i].nr] = false; - } - } - - /* Allocate new space for split regs. Note that the virtual - * numbers will be contiguous. - */ - for (int i = 0; i < num_vars; i++) { - if (!split_grf[i]) - continue; - - new_virtual_grf[i] = alloc.allocate(1); - for (unsigned j = 2; j < this->alloc.sizes[i]; j++) { - unsigned reg = alloc.allocate(1); - assert(reg == new_virtual_grf[i] + j - 1); - (void) reg; - } - this->alloc.sizes[i] = 1; - } - - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - if (inst->dst.file == VGRF && split_grf[inst->dst.nr] && - inst->dst.offset / REG_SIZE != 0) { - inst->dst.nr = (new_virtual_grf[inst->dst.nr] + - inst->dst.offset / REG_SIZE - 1); - inst->dst.offset %= REG_SIZE; - } - for (int i = 0; i < 3; i++) { - if (inst->src[i].file == VGRF && split_grf[inst->src[i].nr] && - inst->src[i].offset / REG_SIZE != 0) { - inst->src[i].nr = (new_virtual_grf[inst->src[i].nr] + - inst->src[i].offset / REG_SIZE - 1); - inst->src[i].offset %= REG_SIZE; - } - } - } - invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | DEPENDENCY_VARIABLES); -} - -void -vec4_visitor::dump_instruction_to_file(const backend_instruction *be_inst, FILE *file) const -{ - const vec4_instruction *inst = (const vec4_instruction *)be_inst; - - if (inst->predicate) { - fprintf(file, "(%cf%d.%d%s) ", - inst->predicate_inverse ? 
'-' : '+', - inst->flag_subreg / 2, - inst->flag_subreg % 2, - pred_ctrl_align16[inst->predicate]); - } - - fprintf(file, "%s(%d)", brw_instruction_name(&compiler->isa, inst->opcode), - inst->exec_size); - if (inst->saturate) - fprintf(file, ".sat"); - if (inst->conditional_mod) { - fprintf(file, "%s", conditional_modifier[inst->conditional_mod]); - if (!inst->predicate && - (devinfo->ver < 5 || (inst->opcode != BRW_OPCODE_SEL && - inst->opcode != BRW_OPCODE_CSEL && - inst->opcode != BRW_OPCODE_IF && - inst->opcode != BRW_OPCODE_WHILE))) { - fprintf(file, ".f%d.%d", inst->flag_subreg / 2, inst->flag_subreg % 2); - } - } - fprintf(file, " "); - - switch (inst->dst.file) { - case VGRF: - fprintf(file, "vgrf%d", inst->dst.nr); - break; - case FIXED_GRF: - fprintf(file, "g%d", inst->dst.nr); - break; - case MRF: - fprintf(file, "m%d", inst->dst.nr); - break; - case ARF: - switch (inst->dst.nr) { - case BRW_ARF_NULL: - fprintf(file, "null"); - break; - case BRW_ARF_ADDRESS: - fprintf(file, "a0.%d", inst->dst.subnr); - break; - case BRW_ARF_ACCUMULATOR: - fprintf(file, "acc%d", inst->dst.subnr); - break; - case BRW_ARF_FLAG: - fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr); - break; - default: - fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr); - break; - } - break; - case BAD_FILE: - fprintf(file, "(null)"); - break; - case IMM: - case ATTR: - case UNIFORM: - unreachable("not reached"); - } - if (inst->dst.offset || - (inst->dst.file == VGRF && - alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) { - const unsigned reg_size = (inst->dst.file == UNIFORM ? 
16 : REG_SIZE); - fprintf(file, "+%d.%d", inst->dst.offset / reg_size, - inst->dst.offset % reg_size); - } - if (inst->dst.writemask != WRITEMASK_XYZW) { - fprintf(file, "."); - if (inst->dst.writemask & 1) - fprintf(file, "x"); - if (inst->dst.writemask & 2) - fprintf(file, "y"); - if (inst->dst.writemask & 4) - fprintf(file, "z"); - if (inst->dst.writemask & 8) - fprintf(file, "w"); - } - fprintf(file, ":%s", brw_reg_type_to_letters(inst->dst.type)); - - if (inst->src[0].file != BAD_FILE) - fprintf(file, ", "); - - for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) { - if (inst->src[i].negate) - fprintf(file, "-"); - if (inst->src[i].abs) - fprintf(file, "|"); - switch (inst->src[i].file) { - case VGRF: - fprintf(file, "vgrf%d", inst->src[i].nr); - break; - case FIXED_GRF: - fprintf(file, "g%d.%d", inst->src[i].nr, inst->src[i].subnr); - break; - case ATTR: - fprintf(file, "attr%d", inst->src[i].nr); - break; - case UNIFORM: - fprintf(file, "u%d", inst->src[i].nr); - break; - case IMM: - switch (inst->src[i].type) { - case BRW_REGISTER_TYPE_F: - fprintf(file, "%fF", inst->src[i].f); - break; - case BRW_REGISTER_TYPE_DF: - fprintf(file, "%fDF", inst->src[i].df); - break; - case BRW_REGISTER_TYPE_D: - fprintf(file, "%dD", inst->src[i].d); - break; - case BRW_REGISTER_TYPE_UD: - fprintf(file, "%uU", inst->src[i].ud); - break; - case BRW_REGISTER_TYPE_VF: - fprintf(file, "[%-gF, %-gF, %-gF, %-gF]", - brw_vf_to_float((inst->src[i].ud >> 0) & 0xff), - brw_vf_to_float((inst->src[i].ud >> 8) & 0xff), - brw_vf_to_float((inst->src[i].ud >> 16) & 0xff), - brw_vf_to_float((inst->src[i].ud >> 24) & 0xff)); - break; - default: - fprintf(file, "???"); - break; - } - break; - case ARF: - switch (inst->src[i].nr) { - case BRW_ARF_NULL: - fprintf(file, "null"); - break; - case BRW_ARF_ADDRESS: - fprintf(file, "a0.%d", inst->src[i].subnr); - break; - case BRW_ARF_ACCUMULATOR: - fprintf(file, "acc%d", inst->src[i].subnr); - break; - case BRW_ARF_FLAG: - fprintf(file, 
"f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr); - break; - default: - fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr); - break; - } - break; - case BAD_FILE: - fprintf(file, "(null)"); - break; - case MRF: - unreachable("not reached"); - } - - if (inst->src[i].offset || - (inst->src[i].file == VGRF && - alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) { - const unsigned reg_size = (inst->src[i].file == UNIFORM ? 16 : REG_SIZE); - fprintf(file, "+%d.%d", inst->src[i].offset / reg_size, - inst->src[i].offset % reg_size); - } - - if (inst->src[i].file != IMM) { - static const char *chans[4] = {"x", "y", "z", "w"}; - fprintf(file, "."); - for (int c = 0; c < 4; c++) { - fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]); - } - } - - if (inst->src[i].abs) - fprintf(file, "|"); - - if (inst->src[i].file != IMM) { - fprintf(file, ":%s", brw_reg_type_to_letters(inst->src[i].type)); - } - - if (i < 2 && inst->src[i + 1].file != BAD_FILE) - fprintf(file, ", "); - } - - if (inst->force_writemask_all) - fprintf(file, " NoMask"); - - if (inst->exec_size != 8) - fprintf(file, " group%d", inst->group); - - fprintf(file, "\n"); -} - - -int -vec4_vs_visitor::setup_attributes(int payload_reg) -{ - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - for (int i = 0; i < 3; i++) { - if (inst->src[i].file == ATTR) { - assert(inst->src[i].offset % REG_SIZE == 0); - int grf = payload_reg + inst->src[i].nr + - inst->src[i].offset / REG_SIZE; - - struct brw_reg reg = brw_vec8_grf(grf, 0); - reg.swizzle = inst->src[i].swizzle; - reg.type = inst->src[i].type; - reg.abs = inst->src[i].abs; - reg.negate = inst->src[i].negate; - inst->src[i] = reg; - } - } - } - - return payload_reg + vs_prog_data->nr_attribute_slots; -} - -void -vec4_visitor::setup_push_ranges() -{ - /* Only allow 32 registers (256 uniform components) as push constants, - * which is the limit on gfx6. 
- * - * If changing this value, note the limitation about total_regs in - * brw_curbe.c. - */ - const unsigned max_push_length = 32; - - push_length = DIV_ROUND_UP(prog_data->base.nr_params, 8); - push_length = MIN2(push_length, max_push_length); - - /* Shrink UBO push ranges so it all fits in max_push_length */ - for (unsigned i = 0; i < 4; i++) { - struct brw_ubo_range *range = &prog_data->base.ubo_ranges[i]; - - if (push_length + range->length > max_push_length) - range->length = max_push_length - push_length; - - push_length += range->length; - } - assert(push_length <= max_push_length); -} - -int -vec4_visitor::setup_uniforms(int reg) -{ - /* It's possible that uniform compaction will shrink further than expected - * so we re-compute the layout and set up our UBO push starts. - */ - ASSERTED const unsigned old_push_length = push_length; - push_length = DIV_ROUND_UP(prog_data->base.nr_params, 8); - for (unsigned i = 0; i < 4; i++) { - ubo_push_start[i] = push_length; - push_length += stage_prog_data->ubo_ranges[i].length; - } - assert(push_length == old_push_length); - - /* The pre-gfx6 VS requires that some push constants get loaded no - * matter what, or the GPU would hang. - */ - if (devinfo->ver < 6 && push_length == 0) { - brw_stage_prog_data_add_params(stage_prog_data, 4); - for (unsigned int i = 0; i < 4; i++) { - unsigned int slot = this->uniforms * 4 + i; - stage_prog_data->param[slot] = BRW_PARAM_BUILTIN_ZERO; - } - push_length = 1; - } - - prog_data->base.dispatch_grf_start_reg = reg; - prog_data->base.curb_read_length = push_length; - - return reg + push_length; -} - -void -vec4_vs_visitor::setup_payload(void) -{ - int reg = 0; - - /* The payload always contains important data in g0, which contains - * the URB handles that are passed on to the URB write at the end - * of the thread. So, we always start push constants at g1. 
- */ - reg++; - - reg = setup_uniforms(reg); - - reg = setup_attributes(reg); - - this->first_non_payload_grf = reg; -} - -bool -vec4_visitor::lower_minmax() -{ - assert(devinfo->ver < 6); - - bool progress = false; - - foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { - const vec4_builder ibld(this, block, inst); - - if (inst->opcode == BRW_OPCODE_SEL && - inst->predicate == BRW_PREDICATE_NONE) { - /* If src1 is an immediate value that is not NaN, then it can't be - * NaN. In that case, emit CMP because it is much better for cmod - * propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't - * support HF or DF, so it is not necessary to check for those. - */ - if (inst->src[1].type != BRW_REGISTER_TYPE_F || - (inst->src[1].file == IMM && !isnan(inst->src[1].f))) { - ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], - inst->conditional_mod); - } else { - ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1], - inst->conditional_mod); - } - inst->predicate = BRW_PREDICATE_NORMAL; - inst->conditional_mod = BRW_CONDITIONAL_NONE; - - progress = true; - } - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} - -src_reg -vec4_visitor::get_timestamp() -{ - assert(devinfo->ver == 7); - - src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, - BRW_ARF_TIMESTAMP, - 0, - 0, - 0, - BRW_REGISTER_TYPE_UD, - BRW_VERTICAL_STRIDE_0, - BRW_WIDTH_4, - BRW_HORIZONTAL_STRIDE_4, - BRW_SWIZZLE_XYZW, - WRITEMASK_XYZW)); - - dst_reg dst = dst_reg(this, glsl_uvec4_type()); - - vec4_instruction *mov = emit(MOV(dst, ts)); - /* We want to read the 3 fields we care about (mostly field 0, but also 2) - * even if it's not enabled in the dispatch. 
- */ - mov->force_writemask_all = true; - - return src_reg(dst); -} - -static bool -is_align1_df(vec4_instruction *inst) -{ - switch (inst->opcode) { - case VEC4_OPCODE_DOUBLE_TO_F32: - case VEC4_OPCODE_DOUBLE_TO_D32: - case VEC4_OPCODE_DOUBLE_TO_U32: - case VEC4_OPCODE_TO_DOUBLE: - case VEC4_OPCODE_PICK_LOW_32BIT: - case VEC4_OPCODE_PICK_HIGH_32BIT: - case VEC4_OPCODE_SET_LOW_32BIT: - case VEC4_OPCODE_SET_HIGH_32BIT: - return true; - default: - return false; - } -} - -/** - * Three source instruction must have a GRF/MRF destination register. - * ARF NULL is not allowed. Fix that up by allocating a temporary GRF. - */ -void -vec4_visitor::fixup_3src_null_dest() -{ - bool progress = false; - - foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) { - if (inst->is_3src(compiler) && inst->dst.is_null()) { - const unsigned size_written = type_sz(inst->dst.type); - const unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE); - - inst->dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)), - inst->dst.type); - progress = true; - } - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | - DEPENDENCY_VARIABLES); -} - -void -vec4_visitor::convert_to_hw_regs() -{ - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - for (int i = 0; i < 3; i++) { - class src_reg &src = inst->src[i]; - struct brw_reg reg; - switch (src.file) { - case VGRF: { - reg = byte_offset(brw_vecn_grf(4, src.nr, 0), src.offset); - reg.type = src.type; - reg.abs = src.abs; - reg.negate = src.negate; - break; - } - - case UNIFORM: { - if (src.nr >= UBO_START) { - reg = byte_offset(brw_vec4_grf( - prog_data->base.dispatch_grf_start_reg + - ubo_push_start[src.nr - UBO_START] + - src.offset / 32, 0), - src.offset % 32); - } else { - reg = byte_offset(brw_vec4_grf( - prog_data->base.dispatch_grf_start_reg + - src.nr / 2, src.nr % 2 * 4), - src.offset); - } - reg = stride(reg, 0, 4, 1); - reg.type = src.type; - reg.abs = src.abs; - reg.negate = src.negate; - - /* This 
should have been moved to pull constants. */ - assert(!src.reladdr); - break; - } - - case FIXED_GRF: - if (type_sz(src.type) == 8) { - reg = src.as_brw_reg(); - break; - } - FALLTHROUGH; - case ARF: - case IMM: - continue; - - case BAD_FILE: - /* Probably unused. */ - reg = brw_null_reg(); - reg = retype(reg, src.type); - break; - - case MRF: - case ATTR: - unreachable("not reached"); - } - - apply_logical_swizzle(®, inst, i); - src = reg; - - /* From IVB PRM, vol4, part3, "General Restrictions on Regioning - * Parameters": - * - * "If ExecSize = Width and HorzStride ≠ 0, VertStride must be set - * to Width * HorzStride." - * - * We can break this rule with DF sources on DF align1 - * instructions, because the exec_size would be 4 and width is 4. - * As we know we are not accessing to next GRF, it is safe to - * set vstride to the formula given by the rule itself. - */ - if (is_align1_df(inst) && (cvt(inst->exec_size) - 1) == src.width) - src.vstride = src.width + src.hstride; - } - - if (inst->is_3src(compiler)) { - /* 3-src instructions with scalar sources support arbitrary subnr, - * but don't actually use swizzles. Convert swizzle into subnr. - * Skip this for double-precision instructions: RepCtrl=1 is not - * allowed for them and needs special handling. 
- */ - for (int i = 0; i < 3; i++) { - if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0 && - type_sz(inst->src[i].type) < 8) { - assert(brw_is_single_value_swizzle(inst->src[i].swizzle)); - inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0); - } - } - } - - dst_reg &dst = inst->dst; - struct brw_reg reg; - - switch (inst->dst.file) { - case VGRF: - reg = byte_offset(brw_vec8_grf(dst.nr, 0), dst.offset); - reg.type = dst.type; - reg.writemask = dst.writemask; - break; - - case MRF: - reg = byte_offset(brw_message_reg(dst.nr), dst.offset); - assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->ver)); - reg.type = dst.type; - reg.writemask = dst.writemask; - break; - - case ARF: - case FIXED_GRF: - reg = dst.as_brw_reg(); - break; - - case BAD_FILE: - reg = brw_null_reg(); - reg = retype(reg, dst.type); - break; - - case IMM: - case ATTR: - case UNIFORM: - unreachable("not reached"); - } - - dst = reg; - } -} - -static bool -stage_uses_interleaved_attributes(unsigned stage, - enum intel_shader_dispatch_mode dispatch_mode) -{ - switch (stage) { - case MESA_SHADER_TESS_EVAL: - return true; - case MESA_SHADER_GEOMETRY: - return dispatch_mode != INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT; - default: - return false; - } -} - -/** - * Get the closest native SIMD width supported by the hardware for instruction - * \p inst. The instruction will be left untouched by - * vec4_visitor::lower_simd_width() if the returned value matches the - * instruction's original execution size. 
- */ -static unsigned -get_lowered_simd_width(const struct intel_device_info *devinfo, - enum intel_shader_dispatch_mode dispatch_mode, - unsigned stage, const vec4_instruction *inst) -{ - /* Do not split some instructions that require special handling */ - switch (inst->opcode) { - case SHADER_OPCODE_GFX4_SCRATCH_READ: - case SHADER_OPCODE_GFX4_SCRATCH_WRITE: - return inst->exec_size; - default: - break; - } - - unsigned lowered_width = MIN2(16, inst->exec_size); - - /* We need to split some cases of double-precision instructions that write - * 2 registers. We only need to care about this in gfx7 because that is the - * only hardware that implements fp64 in Align16. - */ - if (devinfo->ver == 7 && inst->size_written > REG_SIZE) { - /* Align16 8-wide double-precision SEL does not work well. Verified - * empirically. - */ - if (inst->opcode == BRW_OPCODE_SEL && type_sz(inst->dst.type) == 8) - lowered_width = MIN2(lowered_width, 4); - - /* HSW PRM, 3D Media GPGPU Engine, Region Alignment Rules for Direct - * Register Addressing: - * - * "When destination spans two registers, the source MUST span two - * registers." - */ - for (unsigned i = 0; i < 3; i++) { - if (inst->src[i].file == BAD_FILE) - continue; - if (inst->size_read(i) <= REG_SIZE) - lowered_width = MIN2(lowered_width, 4); - - /* Interleaved attribute setups use a vertical stride of 0, which - * makes them hit the associated instruction decompression bug in gfx7. - * Split them to prevent this. - */ - if (inst->src[i].file == ATTR && - stage_uses_interleaved_attributes(stage, dispatch_mode)) - lowered_width = MIN2(lowered_width, 4); - } - } - - /* IvyBridge can manage a maximum of 4 DFs per SIMD4x2 instruction, since - * it doesn't support compression in Align16 mode, no matter if it has - * force_writemask_all enabled or disabled (the latter is affected by the - * compressed instruction bug in gfx7, which is another reason to enforce - * this limit). 
- */ - if (devinfo->verx10 == 70 && - (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) - lowered_width = MIN2(lowered_width, 4); - - return lowered_width; -} - -static bool -dst_src_regions_overlap(vec4_instruction *inst) -{ - if (inst->size_written == 0) - return false; - - unsigned dst_start = inst->dst.offset; - unsigned dst_end = dst_start + inst->size_written - 1; - for (int i = 0; i < 3; i++) { - if (inst->src[i].file == BAD_FILE) - continue; - - if (inst->dst.file != inst->src[i].file || - inst->dst.nr != inst->src[i].nr) - continue; - - unsigned src_start = inst->src[i].offset; - unsigned src_end = src_start + inst->size_read(i) - 1; - - if ((dst_start >= src_start && dst_start <= src_end) || - (dst_end >= src_start && dst_end <= src_end) || - (dst_start <= src_start && dst_end >= src_end)) { - return true; - } - } - - return false; -} - -bool -vec4_visitor::lower_simd_width() -{ - bool progress = false; - - foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { - const unsigned lowered_width = - get_lowered_simd_width(devinfo, prog_data->dispatch_mode, stage, inst); - assert(lowered_width <= inst->exec_size); - if (lowered_width == inst->exec_size) - continue; - - /* We need to deal with source / destination overlaps when splitting. - * The hardware supports reading from and writing to the same register - * in the same instruction, but we need to be careful that each split - * instruction we produce does not corrupt the source of the next. - * - * The easiest way to handle this is to make the split instructions write - * to temporaries if there is an src/dst overlap and then move from the - * temporaries to the original destination. We also need to consider - * instructions that do partial writes via align1 opcodes, in which case - * we need to make sure that the we initialize the temporary with the - * value of the instruction's dst. 
- */ - bool needs_temp = dst_src_regions_overlap(inst); - for (unsigned n = 0; n < inst->exec_size / lowered_width; n++) { - unsigned channel_offset = lowered_width * n; - - unsigned size_written = lowered_width * type_sz(inst->dst.type); - - /* Create the split instruction from the original so that we copy all - * relevant instruction fields, then set the width and calculate the - * new dst/src regions. - */ - vec4_instruction *linst = new(mem_ctx) vec4_instruction(*inst); - linst->exec_size = lowered_width; - linst->group = channel_offset; - linst->size_written = size_written; - - /* Compute split dst region */ - dst_reg dst; - if (needs_temp) { - unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE); - dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)), - inst->dst.type); - if (inst->is_align1_partial_write()) { - vec4_instruction *copy = MOV(dst, src_reg(inst->dst)); - copy->exec_size = lowered_width; - copy->group = channel_offset; - copy->size_written = size_written; - inst->insert_before(block, copy); - } - } else { - dst = horiz_offset(inst->dst, channel_offset); - } - linst->dst = dst; - - /* Compute split source regions */ - for (int i = 0; i < 3; i++) { - if (linst->src[i].file == BAD_FILE) - continue; - - bool is_interleaved_attr = - linst->src[i].file == ATTR && - stage_uses_interleaved_attributes(stage, - prog_data->dispatch_mode); - - if (!is_uniform(linst->src[i]) && !is_interleaved_attr) - linst->src[i] = horiz_offset(linst->src[i], channel_offset); - } - - inst->insert_before(block, linst); - - /* If we used a temporary to store the result of the split - * instruction, copy the result to the original destination - */ - if (needs_temp) { - vec4_instruction *mov = - MOV(offset(inst->dst, lowered_width, n), src_reg(dst)); - mov->exec_size = lowered_width; - mov->group = channel_offset; - mov->size_written = size_written; - mov->predicate = inst->predicate; - inst->insert_before(block, mov); - } - } - - inst->remove(block); - progress = true; - } 
- - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - - return progress; -} - -static brw_predicate -scalarize_predicate(brw_predicate predicate, unsigned writemask) -{ - if (predicate != BRW_PREDICATE_NORMAL) - return predicate; - - switch (writemask) { - case WRITEMASK_X: - return BRW_PREDICATE_ALIGN16_REPLICATE_X; - case WRITEMASK_Y: - return BRW_PREDICATE_ALIGN16_REPLICATE_Y; - case WRITEMASK_Z: - return BRW_PREDICATE_ALIGN16_REPLICATE_Z; - case WRITEMASK_W: - return BRW_PREDICATE_ALIGN16_REPLICATE_W; - default: - unreachable("invalid writemask"); - } -} - -/* Gfx7 has a hardware decompression bug that we can exploit to represent - * handful of additional swizzles natively. - */ -static bool -is_gfx7_supported_64bit_swizzle(vec4_instruction *inst, unsigned arg) -{ - switch (inst->src[arg].swizzle) { - case BRW_SWIZZLE_XXXX: - case BRW_SWIZZLE_YYYY: - case BRW_SWIZZLE_ZZZZ: - case BRW_SWIZZLE_WWWW: - case BRW_SWIZZLE_XYXY: - case BRW_SWIZZLE_YXYX: - case BRW_SWIZZLE_ZWZW: - case BRW_SWIZZLE_WZWZ: - return true; - default: - return false; - } -} - -/* 64-bit sources use regions with a width of 2. These 2 elements in each row - * can be addressed using 32-bit swizzles (which is what the hardware supports) - * but it also means that the swizzle we apply on the first two components of a - * dvec4 is coupled with the swizzle we use for the last 2. In other words, - * only some specific swizzle combinations can be natively supported. - * - * FIXME: we can go an step further and implement even more swizzle - * variations using only partial scalarization. - * - * For more details see: - * https://bugs.freedesktop.org/show_bug.cgi?id=92760#c82 - */ -bool -vec4_visitor::is_supported_64bit_region(vec4_instruction *inst, unsigned arg) -{ - const src_reg &src = inst->src[arg]; - assert(type_sz(src.type) == 8); - - /* Uniform regions have a vstride=0. 
Because we use 2-wide rows with - * 64-bit regions it means that we cannot access components Z/W, so - * return false for any such case. Interleaved attributes will also be - * mapped to GRF registers with a vstride of 0, so apply the same - * treatment. - */ - if ((is_uniform(src) || - (stage_uses_interleaved_attributes(stage, prog_data->dispatch_mode) && - src.file == ATTR)) && - (brw_mask_for_swizzle(src.swizzle) & 12)) - return false; - - switch (src.swizzle) { - case BRW_SWIZZLE_XYZW: - case BRW_SWIZZLE_XXZZ: - case BRW_SWIZZLE_YYWW: - case BRW_SWIZZLE_YXWZ: - return true; - default: - return devinfo->ver == 7 && is_gfx7_supported_64bit_swizzle(inst, arg); - } -} - -bool -vec4_visitor::scalarize_df() -{ - bool progress = false; - - foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { - /* Skip DF instructions that operate in Align1 mode */ - if (is_align1_df(inst)) - continue; - - /* Check if this is a double-precision instruction */ - bool is_double = type_sz(inst->dst.type) == 8; - for (int arg = 0; !is_double && arg < 3; arg++) { - is_double = inst->src[arg].file != BAD_FILE && - type_sz(inst->src[arg].type) == 8; - } - - if (!is_double) - continue; - - /* Skip the lowering for specific regioning scenarios that we can - * support natively. - */ - bool skip_lowering = true; - - /* XY and ZW writemasks operate in 32-bit, which means that they don't - * have a native 64-bit representation and they should always be split. 
- */ - if (inst->dst.writemask == WRITEMASK_XY || - inst->dst.writemask == WRITEMASK_ZW) { - skip_lowering = false; - } else { - for (unsigned i = 0; i < 3; i++) { - if (inst->src[i].file == BAD_FILE || type_sz(inst->src[i].type) < 8) - continue; - skip_lowering = skip_lowering && is_supported_64bit_region(inst, i); - } - } - - if (skip_lowering) - continue; - - /* Generate scalar instructions for each enabled channel */ - for (unsigned chan = 0; chan < 4; chan++) { - unsigned chan_mask = 1 << chan; - if (!(inst->dst.writemask & chan_mask)) - continue; - - vec4_instruction *scalar_inst = new(mem_ctx) vec4_instruction(*inst); - - for (unsigned i = 0; i < 3; i++) { - unsigned swz = BRW_GET_SWZ(inst->src[i].swizzle, chan); - scalar_inst->src[i].swizzle = BRW_SWIZZLE4(swz, swz, swz, swz); - } - - scalar_inst->dst.writemask = chan_mask; - - if (inst->predicate != BRW_PREDICATE_NONE) { - scalar_inst->predicate = - scalarize_predicate(inst->predicate, chan_mask); - } - - inst->insert_before(block, scalar_inst); - } - - inst->remove(block); - progress = true; - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} - -bool -vec4_visitor::lower_64bit_mad_to_mul_add() -{ - bool progress = false; - - foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { - if (inst->opcode != BRW_OPCODE_MAD) - continue; - - if (type_sz(inst->dst.type) != 8) - continue; - - dst_reg mul_dst = dst_reg(this, glsl_dvec4_type()); - - /* Use the copy constructor so we copy all relevant instruction fields - * from the original mad into the add and mul instructions - */ - vec4_instruction *mul = new(mem_ctx) vec4_instruction(*inst); - mul->opcode = BRW_OPCODE_MUL; - mul->dst = mul_dst; - mul->src[0] = inst->src[1]; - mul->src[1] = inst->src[2]; - mul->src[2].file = BAD_FILE; - - vec4_instruction *add = new(mem_ctx) vec4_instruction(*inst); - add->opcode = BRW_OPCODE_ADD; - add->src[0] = src_reg(mul_dst); - add->src[1] = inst->src[0]; - add->src[2].file = 
BAD_FILE; - - inst->insert_before(block, mul); - inst->insert_before(block, add); - inst->remove(block); - - progress = true; - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - - return progress; -} - -/* The align16 hardware can only do 32-bit swizzle channels, so we need to - * translate the logical 64-bit swizzle channels that we use in the Vec4 IR - * to 32-bit swizzle channels in hardware registers. - * - * @inst and @arg identify the original vec4 IR source operand we need to - * translate the swizzle for and @hw_reg is the hardware register where we - * will write the hardware swizzle to use. - * - * This pass assumes that Align16/DF instructions have been fully scalarized - * previously so there is just one 64-bit swizzle channel to deal with for any - * given Vec4 IR source. - */ -void -vec4_visitor::apply_logical_swizzle(struct brw_reg *hw_reg, - vec4_instruction *inst, int arg) -{ - src_reg reg = inst->src[arg]; - - if (reg.file == BAD_FILE || reg.file == BRW_IMMEDIATE_VALUE) - return; - - /* If this is not a 64-bit operand or this is a scalar instruction we don't - * need to do anything about the swizzles. - */ - if(type_sz(reg.type) < 8 || is_align1_df(inst)) { - hw_reg->swizzle = reg.swizzle; - return; - } - - /* Take the 64-bit logical swizzle channel and translate it to 32-bit */ - assert(brw_is_single_value_swizzle(reg.swizzle) || - is_supported_64bit_region(inst, arg)); - - /* Apply the region <2, 2, 1> for GRF or <0, 2, 1> for uniforms, as align16 - * HW can only do 32-bit swizzle channels. - */ - hw_reg->width = BRW_WIDTH_2; - - if (is_supported_64bit_region(inst, arg) && - !is_gfx7_supported_64bit_swizzle(inst, arg)) { - /* Supported 64-bit swizzles are those such that their first two - * components, when expanded to 32-bit swizzles, match the semantics - * of the original 64-bit swizzle with 2-wide row regioning. 
- */ - unsigned swizzle0 = BRW_GET_SWZ(reg.swizzle, 0); - unsigned swizzle1 = BRW_GET_SWZ(reg.swizzle, 1); - hw_reg->swizzle = BRW_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1, - swizzle1 * 2, swizzle1 * 2 + 1); - } else { - /* If we got here then we have one of the following: - * - * 1. An unsupported swizzle, which should be single-value thanks to the - * scalarization pass. - * - * 2. A gfx7 supported swizzle. These can be single-value or double-value - * swizzles. If the latter, they are never cross-dvec2 channels. For - * these we always need to activate the gfx7 vstride=0 exploit. - */ - unsigned swizzle0 = BRW_GET_SWZ(reg.swizzle, 0); - unsigned swizzle1 = BRW_GET_SWZ(reg.swizzle, 1); - assert((swizzle0 < 2) == (swizzle1 < 2)); - - /* To gain access to Z/W components we need to select the second half - * of the register and then use a X/Y swizzle to select Z/W respectively. - */ - if (swizzle0 >= 2) { - *hw_reg = suboffset(*hw_reg, 2); - swizzle0 -= 2; - swizzle1 -= 2; - } - - /* All gfx7-specific supported swizzles require the vstride=0 exploit */ - if (devinfo->ver == 7 && is_gfx7_supported_64bit_swizzle(inst, arg)) - hw_reg->vstride = BRW_VERTICAL_STRIDE_0; - - /* Any 64-bit source with an offset at 16B is intended to address the - * second half of a register and needs a vertical stride of 0 so we: - * - * 1. Don't violate register region restrictions. - * 2. 
Activate the gfx7 instruction decompression bug exploit when - * execsize > 4 - */ - if (hw_reg->subnr % REG_SIZE == 16) { - assert(devinfo->ver == 7); - hw_reg->vstride = BRW_VERTICAL_STRIDE_0; - } - - hw_reg->swizzle = BRW_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1, - swizzle1 * 2, swizzle1 * 2 + 1); - } -} - -void -vec4_visitor::invalidate_analysis(brw::analysis_dependency_class c) -{ - backend_shader::invalidate_analysis(c); - live_analysis.invalidate(c); -} - -bool -vec4_visitor::run() -{ - setup_push_ranges(); - - if (prog_data->base.zero_push_reg) { - /* push_reg_mask_param is in uint32 params and UNIFORM is in vec4s */ - const unsigned mask_param = stage_prog_data->push_reg_mask_param; - src_reg mask = src_reg(dst_reg(UNIFORM, mask_param / 4)); - assert(mask_param % 2 == 0); /* Should be 64-bit-aligned */ - mask.swizzle = BRW_SWIZZLE4((mask_param + 0) % 4, - (mask_param + 1) % 4, - (mask_param + 0) % 4, - (mask_param + 1) % 4); - - emit(VEC4_OPCODE_ZERO_OOB_PUSH_REGS, - dst_reg(VGRF, alloc.allocate(3)), mask); - } - - emit_prolog(); - - emit_nir_code(); - if (failed) - return false; - base_ir = NULL; - - emit_thread_end(); - - calculate_cfg(); - cfg->validate(_mesa_shader_stage_to_abbrev(stage)); - - /* Before any optimization, push array accesses out to scratch - * space where we need them to be. This pass may allocate new - * virtual GRFs, so we want to do it early. It also makes sure - * that we have reladdr computations available for CSE, since we'll - * often do repeated subexpressions for those. - */ - move_grf_array_access_to_scratch(); - split_uniform_registers(); - - split_virtual_grfs(); - -#define OPT(pass, args...) 
({ \ - pass_num++; \ - bool this_progress = pass(args); \ - \ - if (INTEL_DEBUG(DEBUG_OPTIMIZER) && this_progress) { \ - char filename[64]; \ - snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass, \ - _mesa_shader_stage_to_abbrev(stage), \ - nir->info.name, iteration, pass_num); \ - \ - backend_shader::dump_instructions(filename); \ - } \ - \ - cfg->validate(_mesa_shader_stage_to_abbrev(stage)); \ - progress = progress || this_progress; \ - this_progress; \ - }) - - - if (INTEL_DEBUG(DEBUG_OPTIMIZER)) { - char filename[64]; - snprintf(filename, 64, "%s-%s-00-00-start", - _mesa_shader_stage_to_abbrev(stage), nir->info.name); - - backend_shader::dump_instructions(filename); - } - - bool progress; - int iteration = 0; - int pass_num = 0; - do { - progress = false; - pass_num = 0; - iteration++; - - OPT(opt_predicated_break, *this); - OPT(opt_reduce_swizzle); - OPT(dead_code_eliminate); - OPT(dead_control_flow_eliminate, *this); - OPT(opt_copy_propagation); - OPT(opt_cmod_propagation); - OPT(opt_cse); - OPT(opt_algebraic); - OPT(opt_register_coalesce); - OPT(eliminate_find_live_channel); - } while (progress); - - pass_num = 0; - - if (OPT(opt_vector_float)) { - OPT(opt_cse); - OPT(opt_copy_propagation, false); - OPT(opt_copy_propagation, true); - OPT(dead_code_eliminate); - } - - if (devinfo->ver <= 5 && OPT(lower_minmax)) { - OPT(opt_cmod_propagation); - OPT(opt_cse); - OPT(opt_copy_propagation); - OPT(dead_code_eliminate); - } - - if (OPT(lower_simd_width)) { - OPT(opt_copy_propagation); - OPT(dead_code_eliminate); - } - - if (failed) - return false; - - OPT(lower_64bit_mad_to_mul_add); - - /* Run this before payload setup because tessellation shaders - * rely on it to prevent cross dvec2 regioning on DF attributes - * that are setup so that XY are on the second half of register and - * ZW are in the first half of the next. - */ - OPT(scalarize_df); - - setup_payload(); - - if (INTEL_DEBUG(DEBUG_SPILL_VEC4)) { - /* Debug of register spilling: Go spill everything. 
*/ - const int grf_count = alloc.count; - float spill_costs[alloc.count]; - bool no_spill[alloc.count]; - evaluate_spill_costs(spill_costs, no_spill); - for (int i = 0; i < grf_count; i++) { - if (no_spill[i]) - continue; - spill_reg(i); - } - - /* We want to run this after spilling because 64-bit (un)spills need to - * emit code to shuffle 64-bit data for the 32-bit scratch read/write - * messages that can produce unsupported 64-bit swizzle regions. - */ - OPT(scalarize_df); - } - - fixup_3src_null_dest(); - - bool allocated_without_spills = reg_allocate(); - - if (!allocated_without_spills) { - brw_shader_perf_log(compiler, log_data, - "%s shader triggered register spilling. " - "Try reducing the number of live vec4 values " - "to improve performance.\n", - _mesa_shader_stage_to_string(stage)); - - while (!reg_allocate()) { - if (failed) - return false; - } - - /* We want to run this after spilling because 64-bit (un)spills need to - * emit code to shuffle 64-bit data for the 32-bit scratch read/write - * messages that can produce unsupported 64-bit swizzle regions. 
- */ - OPT(scalarize_df); - } - - opt_schedule_instructions(); - - opt_set_dependency_control(); - - convert_to_hw_regs(); - - if (last_scratch > 0) { - prog_data->base.total_scratch = - brw_get_scratch_size(last_scratch * REG_SIZE); - } - - return !failed; -} - -} /* namespace brw */ - diff --git a/src/intel/compiler/brw_vec4.h b/src/intel/compiler/brw_vec4.h deleted file mode 100644 index ca803386309..00000000000 --- a/src/intel/compiler/brw_vec4.h +++ /dev/null @@ -1,350 +0,0 @@ -/* - * Copyright © 2011 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#ifndef BRW_VEC4_H -#define BRW_VEC4_H - -#include "brw_shader.h" - -#ifdef __cplusplus -#include "brw_ir_vec4.h" -#include "brw_ir_performance.h" -#include "brw_vec4_builder.h" -#include "brw_vec4_live_variables.h" -#endif - -#include "compiler/glsl/ir.h" -#include "compiler/nir/nir.h" - - -#ifdef __cplusplus -extern "C" { -#endif - -const unsigned * -brw_vec4_generate_assembly(const struct brw_compiler *compiler, - const struct brw_compile_params *params, - const nir_shader *nir, - struct brw_vue_prog_data *prog_data, - const struct cfg_t *cfg, - const brw::performance &perf, - bool debug_enabled); - -#ifdef __cplusplus -} /* extern "C" */ - -namespace brw { -/** - * The vertex shader front-end. - * - * Translates either GLSL IR or Mesa IR (for ARB_vertex_program and - * fixed-function) into VS IR. - */ -class vec4_visitor : public backend_shader -{ -public: - vec4_visitor(const struct brw_compiler *compiler, - const struct brw_compile_params *params, - const struct brw_sampler_prog_key_data *key, - struct brw_vue_prog_data *prog_data, - const nir_shader *shader, - bool no_spills, - bool debug_enabled); - - dst_reg dst_null_f() - { - return dst_reg(brw_null_reg()); - } - - dst_reg dst_null_df() - { - return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF)); - } - - dst_reg dst_null_d() - { - return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)); - } - - dst_reg dst_null_ud() - { - return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD)); - } - - const struct brw_sampler_prog_key_data * const key_tex; - struct brw_vue_prog_data * const prog_data; - char *fail_msg; - bool failed; - - /** - * GLSL IR currently being processed, which is associated with our - * driver IR instructions for debugging purposes. 
- */ - const void *base_ir; - const char *current_annotation; - - int first_non_payload_grf; - unsigned ubo_push_start[4]; - unsigned push_length; - unsigned int max_grf; - brw_analysis live_analysis; - brw_analysis performance_analysis; - - /* Regs for vertex results. Generated at ir_variable visiting time - * for the ir->location's used. - */ - dst_reg output_reg[VARYING_SLOT_TESS_MAX][4]; - unsigned output_num_components[VARYING_SLOT_TESS_MAX][4]; - const char *output_reg_annotation[VARYING_SLOT_TESS_MAX]; - int uniforms; - - bool run(); - void fail(const char *msg, ...); - - int setup_uniforms(int payload_reg); - - bool reg_allocate_trivial(); - bool reg_allocate(); - void evaluate_spill_costs(float *spill_costs, bool *no_spill); - int choose_spill_reg(struct ra_graph *g); - void spill_reg(unsigned spill_reg); - void move_grf_array_access_to_scratch(); - void split_uniform_registers(); - void setup_push_ranges(); - virtual void invalidate_analysis(brw::analysis_dependency_class c); - void split_virtual_grfs(); - bool opt_vector_float(); - bool opt_reduce_swizzle(); - bool dead_code_eliminate(); - bool opt_cmod_propagation(); - bool opt_copy_propagation(bool do_constant_prop = true); - bool opt_cse_local(bblock_t *block, const vec4_live_variables &live); - bool opt_cse(); - bool opt_algebraic(); - bool opt_register_coalesce(); - bool eliminate_find_live_channel(); - bool is_dep_ctrl_unsafe(const vec4_instruction *inst); - void opt_set_dependency_control(); - void opt_schedule_instructions(); - void convert_to_hw_regs(); - void fixup_3src_null_dest(); - - bool is_supported_64bit_region(vec4_instruction *inst, unsigned arg); - bool lower_simd_width(); - bool scalarize_df(); - bool lower_64bit_mad_to_mul_add(); - void apply_logical_swizzle(struct brw_reg *hw_reg, - vec4_instruction *inst, int arg); - - vec4_instruction *emit(vec4_instruction *inst); - - vec4_instruction *emit(enum opcode opcode); - vec4_instruction *emit(enum opcode opcode, const dst_reg &dst); - 
vec4_instruction *emit(enum opcode opcode, const dst_reg &dst, - const src_reg &src0); - vec4_instruction *emit(enum opcode opcode, const dst_reg &dst, - const src_reg &src0, const src_reg &src1); - vec4_instruction *emit(enum opcode opcode, const dst_reg &dst, - const src_reg &src0, const src_reg &src1, - const src_reg &src2); - - vec4_instruction *emit_before(bblock_t *block, - vec4_instruction *inst, - vec4_instruction *new_inst); - -#define EMIT1(op) vec4_instruction *op(const dst_reg &, const src_reg &); -#define EMIT2(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &); -#define EMIT3(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &, const src_reg &); - EMIT1(MOV) - EMIT1(NOT) - EMIT1(RNDD) - EMIT1(RNDE) - EMIT1(RNDZ) - EMIT1(FRC) - EMIT1(F32TO16) - EMIT1(F16TO32) - EMIT2(ADD) - EMIT2(MUL) - EMIT2(MACH) - EMIT2(MAC) - EMIT2(AND) - EMIT2(OR) - EMIT2(XOR) - EMIT2(DP3) - EMIT2(DP4) - EMIT2(DPH) - EMIT2(SHL) - EMIT2(SHR) - EMIT2(ASR) - vec4_instruction *CMP(dst_reg dst, src_reg src0, src_reg src1, - enum brw_conditional_mod condition); - vec4_instruction *IF(src_reg src0, src_reg src1, - enum brw_conditional_mod condition); - vec4_instruction *IF(enum brw_predicate predicate); - EMIT1(SCRATCH_READ) - EMIT2(SCRATCH_WRITE) - EMIT3(LRP) - EMIT1(BFREV) - EMIT3(BFE) - EMIT2(BFI1) - EMIT3(BFI2) - EMIT1(FBH) - EMIT1(FBL) - EMIT1(CBIT) - EMIT1(LZD) - EMIT3(MAD) - EMIT2(ADDC) - EMIT2(SUBB) - EMIT1(DIM) - -#undef EMIT1 -#undef EMIT2 -#undef EMIT3 - - vec4_instruction *emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst, - src_reg src0, src_reg src1); - - /** - * Copy any live channel from \p src to the first channel of the - * result. - */ - src_reg emit_uniformize(const src_reg &src); - - /** Fix all float operands of a 3-source instruction. 
*/ - void fix_float_operands(src_reg op[3], nir_alu_instr *instr); - - src_reg fix_3src_operand(const src_reg &src); - - vec4_instruction *emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0, - const src_reg &src1 = src_reg()); - - src_reg fix_math_operand(const src_reg &src); - - void emit_pack_half_2x16(dst_reg dst, src_reg src0); - void emit_unpack_half_2x16(dst_reg dst, src_reg src0); - void emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0); - void emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0); - void emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0); - void emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0); - - src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate, - src_reg surface); - - void emit_ndc_computation(); - void emit_psiz_and_flags(dst_reg reg); - vec4_instruction *emit_generic_urb_slot(dst_reg reg, int varying, int comp); - virtual void emit_urb_slot(dst_reg reg, int varying); - - src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst, - src_reg *reladdr, int reg_offset); - void emit_scratch_read(bblock_t *block, vec4_instruction *inst, - dst_reg dst, - src_reg orig_src, - int base_offset); - void emit_scratch_write(bblock_t *block, vec4_instruction *inst, - int base_offset); - void emit_pull_constant_load_reg(dst_reg dst, - src_reg surf_index, - src_reg offset, - bblock_t *before_block, - vec4_instruction *before_inst); - src_reg emit_resolve_reladdr(int scratch_loc[], bblock_t *block, - vec4_instruction *inst, src_reg src); - - void resolve_ud_negate(src_reg *reg); - - void emit_shader_float_controls_execution_mode(); - - bool lower_minmax(); - - src_reg get_timestamp(); - - virtual void dump_instruction_to_file(const backend_instruction *inst, FILE *file) const; - - bool optimize_predicate(nir_alu_instr *instr, enum brw_predicate *predicate); - - void emit_conversion_from_double(dst_reg dst, src_reg src); - void emit_conversion_to_double(dst_reg dst, 
src_reg src); - - vec4_instruction *shuffle_64bit_data(dst_reg dst, src_reg src, - bool for_write, - bool for_scratch = false, - bblock_t *block = NULL, - vec4_instruction *ref = NULL); - - virtual void emit_nir_code(); - virtual void nir_setup_uniforms(); - virtual void nir_emit_impl(nir_function_impl *impl); - virtual void nir_emit_cf_list(exec_list *list); - virtual void nir_emit_if(nir_if *if_stmt); - virtual void nir_emit_loop(nir_loop *loop); - virtual void nir_emit_block(nir_block *block); - virtual void nir_emit_instr(nir_instr *instr); - virtual void nir_emit_load_const(nir_load_const_instr *instr); - src_reg get_nir_ssbo_intrinsic_index(nir_intrinsic_instr *instr); - virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr); - virtual void nir_emit_alu(nir_alu_instr *instr); - virtual void nir_emit_jump(nir_jump_instr *instr); - virtual void nir_emit_texture(nir_tex_instr *instr); - virtual void nir_emit_undef(nir_undef_instr *instr); - virtual void nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr); - - dst_reg get_nir_def(const nir_def &def, enum brw_reg_type type); - dst_reg get_nir_def(const nir_def &def, nir_alu_type type); - dst_reg get_nir_def(const nir_def &def); - src_reg get_nir_src(const nir_src &src, enum brw_reg_type type, - unsigned num_components = 4); - src_reg get_nir_src(const nir_src &src, nir_alu_type type, - unsigned num_components = 4); - src_reg get_nir_src(const nir_src &src, - unsigned num_components = 4); - src_reg get_nir_src_imm(const nir_src &src); - src_reg get_indirect_offset(nir_intrinsic_instr *instr); - - dst_reg *nir_ssa_values; - -protected: - void emit_vertex(); - void setup_payload_interference(struct ra_graph *g, int first_payload_node, - int reg_node_count); - virtual void setup_payload() = 0; - virtual void emit_prolog() = 0; - virtual void emit_thread_end() = 0; - virtual void emit_urb_write_header(int mrf) = 0; - virtual vec4_instruction *emit_urb_write_opcode(bool complete) = 0; - virtual void 
gs_emit_vertex(int stream_id); - virtual void gs_end_primitive(); - -private: - /** - * If true, then register allocation should fail instead of spilling. - */ - const bool no_spills; - - unsigned last_scratch; /**< measured in 32-byte (register size) units */ -}; - -} /* namespace brw */ -#endif /* __cplusplus */ - -#endif /* BRW_VEC4_H */ diff --git a/src/intel/compiler/brw_vec4_builder.h b/src/intel/compiler/brw_vec4_builder.h deleted file mode 100644 index 322a6aae20b..00000000000 --- a/src/intel/compiler/brw_vec4_builder.h +++ /dev/null @@ -1,646 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright © 2010-2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef BRW_VEC4_BUILDER_H -#define BRW_VEC4_BUILDER_H - -#include "brw_ir_vec4.h" -#include "brw_ir_allocator.h" - -namespace brw { - /** - * Toolbox to assemble a VEC4 IR program out of individual instructions. 
- * - * This object is meant to have an interface consistent with - * brw::fs_builder. They cannot be fully interchangeable because - * brw::fs_builder generates scalar code while brw::vec4_builder generates - * vector code. - */ - class vec4_builder { - public: - /** Type used in this IR to represent a source of an instruction. */ - typedef brw::src_reg src_reg; - - /** Type used in this IR to represent the destination of an instruction. */ - typedef brw::dst_reg dst_reg; - - /** Type used in this IR to represent an instruction. */ - typedef vec4_instruction instruction; - - /** - * Construct a vec4_builder that inserts instructions into \p shader. - */ - vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) : - shader(shader), block(NULL), cursor(NULL), - _dispatch_width(dispatch_width), _group(0), - force_writemask_all(false), - annotation() - { - } - - /** - * Construct a vec4_builder that inserts instructions into \p shader - * before instruction \p inst in basic block \p block. The default - * execution controls and debug annotation are initialized from the - * instruction passed as argument. - */ - vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) : - shader(shader), block(block), cursor(inst), - _dispatch_width(inst->exec_size), _group(inst->group), - force_writemask_all(inst->force_writemask_all) - { - annotation.str = inst->annotation; - annotation.ir = inst->ir; - } - - /** - * Construct a vec4_builder that inserts instructions before \p cursor - * in basic block \p block, inheriting other code generation parameters - * from this. - */ - vec4_builder - at(bblock_t *block, exec_node *cursor) const - { - vec4_builder bld = *this; - bld.block = block; - bld.cursor = cursor; - return bld; - } - - /** - * Construct a vec4_builder appending instructions at the end of the - * instruction list of the shader, inheriting other code generation - * parameters from this. 
- */ - vec4_builder - at_end() const - { - return at(NULL, (exec_node *)&shader->instructions.tail_sentinel); - } - - /** - * Construct a builder specifying the default SIMD width and group of - * channel enable signals, inheriting other code generation parameters - * from this. - * - * \p n gives the default SIMD width, \p i gives the slot group used for - * predication and control flow masking in multiples of \p n channels. - */ - vec4_builder - group(unsigned n, unsigned i) const - { - assert(force_writemask_all || - (n <= dispatch_width() && i < dispatch_width() / n)); - vec4_builder bld = *this; - bld._dispatch_width = n; - bld._group += i * n; - return bld; - } - - /** - * Construct a builder with per-channel control flow execution masking - * disabled if \p b is true. If control flow execution masking is - * already disabled this has no effect. - */ - vec4_builder - exec_all(bool b = true) const - { - vec4_builder bld = *this; - if (b) - bld.force_writemask_all = true; - return bld; - } - - /** - * Construct a builder with the given debug annotation info. - */ - vec4_builder - annotate(const char *str, const void *ir = NULL) const - { - vec4_builder bld = *this; - bld.annotation.str = str; - bld.annotation.ir = ir; - return bld; - } - - /** - * Get the SIMD width in use. - */ - unsigned - dispatch_width() const - { - return _dispatch_width; - } - - /** - * Get the channel group in use. - */ - unsigned - group() const - { - return _group; - } - - /** - * Allocate a virtual register of natural vector size (four for this IR) - * and SIMD width. \p n gives the amount of space to allocate in - * dispatch_width units (which is just enough space for four logical - * components in this IR). 
- */ - dst_reg - vgrf(enum brw_reg_type type, unsigned n = 1) const - { - assert(dispatch_width() <= 32); - - if (n > 0) - return retype(dst_reg(VGRF, shader->alloc.allocate( - n * DIV_ROUND_UP(type_sz(type), 4))), - type); - else - return retype(null_reg_ud(), type); - } - - /** - * Create a null register of floating type. - */ - dst_reg - null_reg_f() const - { - return dst_reg(retype(brw_null_vec(dispatch_width()), - BRW_REGISTER_TYPE_F)); - } - - /** - * Create a null register of signed integer type. - */ - dst_reg - null_reg_d() const - { - return dst_reg(retype(brw_null_vec(dispatch_width()), - BRW_REGISTER_TYPE_D)); - } - - /** - * Create a null register of unsigned integer type. - */ - dst_reg - null_reg_ud() const - { - return dst_reg(retype(brw_null_vec(dispatch_width()), - BRW_REGISTER_TYPE_UD)); - } - - /** - * Insert an instruction into the program. - */ - instruction * - emit(const instruction &inst) const - { - return emit(new(shader->mem_ctx) instruction(inst)); - } - - /** - * Create and insert a nullary control instruction into the program. - */ - instruction * - emit(enum opcode opcode) const - { - return emit(instruction(opcode)); - } - - /** - * Create and insert a nullary instruction into the program. - */ - instruction * - emit(enum opcode opcode, const dst_reg &dst) const - { - return emit(instruction(opcode, dst)); - } - - /** - * Create and insert a unary instruction into the program. - */ - instruction * - emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const - { - switch (opcode) { - case SHADER_OPCODE_RCP: - case SHADER_OPCODE_RSQ: - case SHADER_OPCODE_SQRT: - case SHADER_OPCODE_EXP2: - case SHADER_OPCODE_LOG2: - case SHADER_OPCODE_SIN: - case SHADER_OPCODE_COS: - return fix_math_instruction( - emit(instruction(opcode, dst, - fix_math_operand(src0)))); - - default: - return emit(instruction(opcode, dst, src0)); - } - } - - /** - * Create and insert a binary instruction into the program. 
- */ - instruction * - emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, - const src_reg &src1) const - { - switch (opcode) { - case SHADER_OPCODE_POW: - case SHADER_OPCODE_INT_QUOTIENT: - case SHADER_OPCODE_INT_REMAINDER: - return fix_math_instruction( - emit(instruction(opcode, dst, - fix_math_operand(src0), - fix_math_operand(src1)))); - - default: - return emit(instruction(opcode, dst, src0, src1)); - } - } - - /** - * Create and insert a ternary instruction into the program. - */ - instruction * - emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, - const src_reg &src1, const src_reg &src2) const - { - switch (opcode) { - case BRW_OPCODE_BFE: - case BRW_OPCODE_BFI2: - case BRW_OPCODE_MAD: - case BRW_OPCODE_LRP: - return emit(instruction(opcode, dst, - fix_3src_operand(src0), - fix_3src_operand(src1), - fix_3src_operand(src2))); - - default: - return emit(instruction(opcode, dst, src0, src1, src2)); - } - } - - /** - * Insert a preallocated instruction into the program. - */ - instruction * - emit(instruction *inst) const - { - inst->exec_size = dispatch_width(); - inst->group = group(); - inst->force_writemask_all = force_writemask_all; - inst->size_written = inst->exec_size * type_sz(inst->dst.type); - inst->annotation = annotation.str; - inst->ir = annotation.ir; - - if (block) - static_cast(cursor)->insert_before(block, inst); - else - cursor->insert_before(inst); - - return inst; - } - - /** - * Select \p src0 if the comparison of both sources with the given - * conditional mod evaluates to true, otherwise select \p src1. - * - * Generally useful to get the minimum or maximum of two values. 
- */ - instruction * - emit_minmax(const dst_reg &dst, const src_reg &src0, - const src_reg &src1, brw_conditional_mod mod) const - { - assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L); - - return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0), - fix_unsigned_negate(src1))); - } - - /** - * Copy any live channel from \p src to the first channel of the result. - */ - src_reg - emit_uniformize(const src_reg &src) const - { - const vec4_builder ubld = exec_all(); - const dst_reg chan_index = - writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X); - const dst_reg dst = vgrf(src.type); - - ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index); - ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index)); - - return src_reg(dst); - } - - /** - * Assorted arithmetic ops. - * @{ - */ -#define ALU1(op) \ - instruction * \ - op(const dst_reg &dst, const src_reg &src0) const \ - { \ - return emit(BRW_OPCODE_##op, dst, src0); \ - } - -#define ALU2(op) \ - instruction * \ - op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ - { \ - return emit(BRW_OPCODE_##op, dst, src0, src1); \ - } - -#define ALU2_ACC(op) \ - instruction * \ - op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \ - { \ - instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1); \ - inst->writes_accumulator = true; \ - return inst; \ - } - -#define ALU3(op) \ - instruction * \ - op(const dst_reg &dst, const src_reg &src0, const src_reg &src1, \ - const src_reg &src2) const \ - { \ - return emit(BRW_OPCODE_##op, dst, src0, src1, src2); \ - } - - ALU2(ADD) - ALU2_ACC(ADDC) - ALU2(AND) - ALU2(ASR) - ALU2(AVG) - ALU3(BFE) - ALU2(BFI1) - ALU3(BFI2) - ALU1(BFREV) - ALU1(CBIT) - ALU3(CSEL) - ALU1(DIM) - ALU2(DP2) - ALU2(DP3) - ALU2(DP4) - ALU2(DPH) - ALU1(F16TO32) - ALU1(F32TO16) - ALU1(FBH) - ALU1(FBL) - ALU1(FRC) - ALU2(LINE) - ALU1(LZD) - ALU2(MAC) - ALU2_ACC(MACH) - ALU3(MAD) - ALU1(MOV) - ALU2(MUL) - ALU1(NOT) - ALU2(OR) - ALU2(PLN) - ALU1(RNDD) 
- ALU1(RNDE) - ALU1(RNDU) - ALU1(RNDZ) - ALU2(SAD2) - ALU2_ACC(SADA2) - ALU2(SEL) - ALU2(SHL) - ALU2(SHR) - ALU2_ACC(SUBB) - ALU2(XOR) - -#undef ALU3 -#undef ALU2_ACC -#undef ALU2 -#undef ALU1 - /** @} */ - - /** - * CMP: Sets the low bit of the destination channels with the result - * of the comparison, while the upper bits are undefined, and updates - * the flag register with the packed 16 bits of the result. - */ - instruction * - CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1, - brw_conditional_mod condition) const - { - /* Take the instruction: - * - * CMP null src0 src1 - * - * Original gfx4 does type conversion to the destination type - * before comparison, producing garbage results for floating - * point comparisons. - * - * The destination type doesn't matter on newer generations, - * so we set the type to match src0 so we can compact the - * instruction. - */ - return set_condmod(condition, - emit(BRW_OPCODE_CMP, retype(dst, src0.type), - fix_unsigned_negate(src0), - fix_unsigned_negate(src1))); - } - - /** - * CMPN: Behaves like CMP, but produces true if src1 is NaN. - */ - instruction * - CMPN(const dst_reg &dst, const src_reg &src0, const src_reg &src1, - brw_conditional_mod condition) const - { - /* Take the instruction: - * - * CMPN null src0 src1 - * - * Original gfx4 does type conversion to the destination type - * before comparison, producing garbage results for floating - * point comparisons. - * - * The destination type doesn't matter on newer generations, - * so we set the type to match src0 so we can compact the - * instruction. - */ - return set_condmod(condition, - emit(BRW_OPCODE_CMPN, retype(dst, src0.type), - fix_unsigned_negate(src0), - fix_unsigned_negate(src1))); - } - - /** - * Gfx4 predicated IF. - */ - instruction * - IF(brw_predicate predicate) const - { - return set_predicate(predicate, emit(BRW_OPCODE_IF)); - } - - /** - * Gfx6 IF with embedded comparison. 
- */ - instruction * - IF(const src_reg &src0, const src_reg &src1, - brw_conditional_mod condition) const - { - assert(shader->devinfo->ver == 6); - return set_condmod(condition, - emit(BRW_OPCODE_IF, - null_reg_d(), - fix_unsigned_negate(src0), - fix_unsigned_negate(src1))); - } - - /** - * Emit a linear interpolation instruction. - */ - instruction * - LRP(const dst_reg &dst, const src_reg &x, const src_reg &y, - const src_reg &a) const - { - /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so - * we need to reorder the operands. - */ - assert(shader->devinfo->ver >= 6 && shader->devinfo->ver <= 9); - return emit(BRW_OPCODE_LRP, dst, a, y, x); - } - - backend_shader *shader; - - protected: - /** - * Workaround for negation of UD registers. See comment in - * fs_generator::generate_code() for the details. - */ - src_reg - fix_unsigned_negate(const src_reg &src) const - { - if (src.type == BRW_REGISTER_TYPE_UD && src.negate) { - dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD); - MOV(temp, src); - return src_reg(temp); - } else { - return src; - } - } - - /** - * Workaround for register access modes not supported by the ternary - * instruction encoding. - */ - src_reg - fix_3src_operand(const src_reg &src) const - { - /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be - * able to use vertical stride of zero to replicate the vec4 uniform, like - * - * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7] - * - * But you can't, since vertical stride is always four in three-source - * instructions. Instead, insert a MOV instruction to do the replication so - * that the three-source instruction can consume it. - */ - - /* The MOV is only needed if the source is a uniform or immediate. 
*/ - if (src.file != UNIFORM && src.file != IMM) - return src; - - if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle)) - return src; - - const dst_reg expanded = vgrf(src.type); - emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src); - return src_reg(expanded); - } - - /** - * Workaround for register access modes not supported by the math - * instruction. - */ - src_reg - fix_math_operand(const src_reg &src) const - { - /* The gfx6 math instruction ignores the source modifiers -- - * swizzle, abs, negate, and at least some parts of the register - * region description. - * - * Rather than trying to enumerate all these cases, *always* expand the - * operand to a temp GRF for gfx6. - * - * For gfx7, keep the operand as-is, except if immediate, which gfx7 still - * can't use. - */ - if (shader->devinfo->ver == 6 || - (shader->devinfo->ver == 7 && src.file == IMM)) { - const dst_reg tmp = vgrf(src.type); - MOV(tmp, src); - return src_reg(tmp); - } else { - return src; - } - } - - /** - * Workaround other weirdness of the math instruction. - */ - instruction * - fix_math_instruction(instruction *inst) const - { - if (shader->devinfo->ver == 6 && - inst->dst.writemask != WRITEMASK_XYZW) { - const dst_reg tmp = vgrf(inst->dst.type); - MOV(inst->dst, src_reg(tmp)); - inst->dst = tmp; - - } else if (shader->devinfo->ver < 6) { - const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2); - inst->base_mrf = 1; - inst->mlen = sources; - } - - return inst; - } - - bblock_t *block; - exec_node *cursor; - - unsigned _dispatch_width; - unsigned _group; - bool force_writemask_all; - - /** Debug annotation info. 
*/ - struct { - const char *str; - const void *ir; - } annotation; - }; -} - -#endif diff --git a/src/intel/compiler/brw_vec4_cmod_propagation.cpp b/src/intel/compiler/brw_vec4_cmod_propagation.cpp deleted file mode 100644 index a3d7f7e8558..00000000000 --- a/src/intel/compiler/brw_vec4_cmod_propagation.cpp +++ /dev/null @@ -1,365 +0,0 @@ -/* - * Copyright © 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - */ - -/** @file brw_vec4_cmod_propagation.cpp - * - * Really similar to brw_fs_cmod_propagation but adapted to vec4 needs. Check - * brw_fs_cmod_propagation for further details on the rationale behind this - * optimization. 
- */ - -#include "brw_vec4.h" -#include "brw_cfg.h" -#include "brw_eu.h" - -namespace brw { - -static bool -writemasks_incompatible(const vec4_instruction *earlier, - const vec4_instruction *later) -{ - return (earlier->dst.writemask != WRITEMASK_X && - earlier->dst.writemask != WRITEMASK_XYZW) || - (earlier->dst.writemask == WRITEMASK_XYZW && - later->src[0].swizzle != BRW_SWIZZLE_XYZW) || - (later->dst.writemask & ~earlier->dst.writemask) != 0; -} - -static bool -opt_cmod_propagation_local(bblock_t *block, vec4_visitor *v) -{ - bool progress = false; - UNUSED int ip = block->end_ip + 1; - - foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) { - ip--; - - if ((inst->opcode != BRW_OPCODE_AND && - inst->opcode != BRW_OPCODE_CMP && - inst->opcode != BRW_OPCODE_MOV) || - inst->predicate != BRW_PREDICATE_NONE || - !inst->dst.is_null() || - (inst->src[0].file != VGRF && inst->src[0].file != ATTR && - inst->src[0].file != UNIFORM)) - continue; - - /* An ABS source modifier can only be handled when processing a compare - * with a value other than zero. - */ - if (inst->src[0].abs && - (inst->opcode != BRW_OPCODE_CMP || inst->src[1].is_zero())) - continue; - - if (inst->opcode == BRW_OPCODE_AND && - !(inst->src[1].is_one() && - inst->conditional_mod == BRW_CONDITIONAL_NZ && - !inst->src[0].negate)) - continue; - - if (inst->opcode == BRW_OPCODE_MOV && - inst->conditional_mod != BRW_CONDITIONAL_NZ) - continue; - - bool read_flag = false; - foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) { - /* A CMP with a second source of zero can match with anything. A CMP - * with a second source that is not zero can only match with an ADD - * instruction. - */ - if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero()) { - bool negate; - - if (scan_inst->opcode != BRW_OPCODE_ADD) - goto not_match; - - if (writemasks_incompatible(scan_inst, inst)) - goto not_match; - - /* A CMP is basically a subtraction. 
The result of the - * subtraction must be the same as the result of the addition. - * This means that one of the operands must be negated. So (a + - * b) vs (a == -b) or (a + -b) vs (a == b). - */ - if ((inst->src[0].equals(scan_inst->src[0]) && - inst->src[1].negative_equals(scan_inst->src[1])) || - (inst->src[0].equals(scan_inst->src[1]) && - inst->src[1].negative_equals(scan_inst->src[0]))) { - negate = false; - } else if ((inst->src[0].negative_equals(scan_inst->src[0]) && - inst->src[1].equals(scan_inst->src[1])) || - (inst->src[0].negative_equals(scan_inst->src[1]) && - inst->src[1].equals(scan_inst->src[0]))) { - negate = true; - } else { - goto not_match; - } - - if (scan_inst->exec_size != inst->exec_size || - scan_inst->group != inst->group) - goto not_match; - - /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods": - * - * * Note that the [post condition signal] bits generated at - * the output of a compute are before the .sat. - * - * So we don't have to bail if scan_inst has saturate. - */ - - /* Otherwise, try propagating the conditional. */ - const enum brw_conditional_mod cond = - negate ? brw_swap_cmod(inst->conditional_mod) - : inst->conditional_mod; - - if (scan_inst->can_do_cmod() && - ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) || - scan_inst->conditional_mod == cond)) { - scan_inst->conditional_mod = cond; - inst->remove(block); - progress = true; - } - break; - } - - if (regions_overlap(inst->src[0], inst->size_read(0), - scan_inst->dst, scan_inst->size_written)) { - if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) || - scan_inst->dst.offset != inst->src[0].offset || - scan_inst->exec_size != inst->exec_size || - scan_inst->group != inst->group) { - break; - } - - /* If scan_inst is a CMP that produces a single value and inst is - * a CMP.NZ that consumes only that value, remove inst. 
- */ - if (inst->conditional_mod == BRW_CONDITIONAL_NZ && - (inst->src[0].type == BRW_REGISTER_TYPE_D || - inst->src[0].type == BRW_REGISTER_TYPE_UD) && - (inst->opcode == BRW_OPCODE_CMP || - inst->opcode == BRW_OPCODE_MOV) && - scan_inst->opcode == BRW_OPCODE_CMP && - ((inst->src[0].swizzle == BRW_SWIZZLE_XXXX && - scan_inst->dst.writemask == WRITEMASK_X) || - (inst->src[0].swizzle == BRW_SWIZZLE_YYYY && - scan_inst->dst.writemask == WRITEMASK_Y) || - (inst->src[0].swizzle == BRW_SWIZZLE_ZZZZ && - scan_inst->dst.writemask == WRITEMASK_Z) || - (inst->src[0].swizzle == BRW_SWIZZLE_WWWW && - scan_inst->dst.writemask == WRITEMASK_W))) { - if (inst->dst.writemask != scan_inst->dst.writemask) { - src_reg temp(v, glsl_vec4_type(), 1); - - /* Given a sequence like: - * - * cmp.ge.f0(8) g21<1>.zF g20<4>.xF g18<4>.xF - * ... - * cmp.nz.f0(8) null<1>D g21<4>.zD 0D - * - * Replace it with something like: - * - * cmp.ge.f0(8) g22<1>.zF g20<4>.xF g18<4>.xF - * mov(8) g21<1>.xF g22<1>.zzzzF - * - * The added MOV will most likely be removed later. In the - * worst case, it should be cheaper to schedule. - */ - temp.swizzle = brw_swizzle_for_mask(inst->dst.writemask); - temp.type = scan_inst->src[0].type; - - vec4_instruction *mov = v->MOV(scan_inst->dst, temp); - - /* Modify the source swizzles on scan_inst. 
If scan_inst - * was - * - * cmp.ge.f0(8) g21<1>.zF g20<4>.wzyxF g18<4>.yxwzF - * - * replace it with - * - * cmp.ge.f0(8) g21<1>.zF g20<4>.yyyyF g18<4>.wwwwF - */ - unsigned src0_chan; - unsigned src1_chan; - switch (scan_inst->dst.writemask) { - case WRITEMASK_X: - src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 0); - src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 0); - break; - case WRITEMASK_Y: - src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 1); - src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 1); - break; - case WRITEMASK_Z: - src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 2); - src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 2); - break; - case WRITEMASK_W: - src0_chan = BRW_GET_SWZ(scan_inst->src[0].swizzle, 3); - src1_chan = BRW_GET_SWZ(scan_inst->src[1].swizzle, 3); - break; - default: - unreachable("Impossible writemask"); - } - - scan_inst->src[0].swizzle = BRW_SWIZZLE4(src0_chan, - src0_chan, - src0_chan, - src0_chan); - - /* There's no swizzle on immediate value sources. */ - if (scan_inst->src[1].file != IMM) { - scan_inst->src[1].swizzle = BRW_SWIZZLE4(src1_chan, - src1_chan, - src1_chan, - src1_chan); - } - - scan_inst->dst = dst_reg(temp); - scan_inst->dst.writemask = inst->dst.writemask; - - scan_inst->insert_after(block, mov); - } - - inst->remove(block); - progress = true; - break; - } - - if (writemasks_incompatible(scan_inst, inst)) - break; - - /* CMP's result is the same regardless of dest type. */ - if (inst->conditional_mod == BRW_CONDITIONAL_NZ && - scan_inst->opcode == BRW_OPCODE_CMP && - (inst->dst.type == BRW_REGISTER_TYPE_D || - inst->dst.type == BRW_REGISTER_TYPE_UD)) { - inst->remove(block); - progress = true; - break; - } - - /* If the AND wasn't handled by the previous case, it isn't safe - * to remove it. 
- */ - if (inst->opcode == BRW_OPCODE_AND) - break; - - /* Comparisons operate differently for ints and floats */ - if (scan_inst->dst.type != inst->dst.type && - (scan_inst->dst.type == BRW_REGISTER_TYPE_F || - inst->dst.type == BRW_REGISTER_TYPE_F)) - break; - - /* If the instruction generating inst's source also wrote the - * flag, and inst is doing a simple .nz comparison, then inst - * is redundant - the appropriate value is already in the flag - * register. Delete inst. - */ - if (inst->conditional_mod == BRW_CONDITIONAL_NZ && - !inst->src[0].negate && - scan_inst->writes_flag(v->devinfo)) { - inst->remove(block); - progress = true; - break; - } - - /* The conditional mod of the CMP/CMPN instructions behaves - * specially because the flag output is not calculated from the - * result of the instruction, but the other way around, which - * means that even if the condmod to propagate and the condmod - * from the CMP instruction are the same they will in general give - * different results because they are evaluated based on different - * inputs. - */ - if (scan_inst->opcode == BRW_OPCODE_CMP || - scan_inst->opcode == BRW_OPCODE_CMPN) - break; - - /* From the Sky Lake PRM Vol. 7 "Assigning Conditional Mods": - * - * * Note that the [post condition signal] bits generated at - * the output of a compute are before the .sat. - */ - if (scan_inst->saturate) - break; - - /* From the Sky Lake PRM, Vol 2a, "Multiply": - * - * "When multiplying integer data types, if one of the sources - * is a DW, the resulting full precision data is stored in - * the accumulator. However, if the destination data type is - * either W or DW, the low bits of the result are written to - * the destination register and the remaining high bits are - * discarded. This results in undefined Overflow and Sign - * flags. Therefore, conditional modifiers and saturation - * (.sat) cannot be used in this case. - * - * We just disallow cmod propagation on all integer multiplies. 
- */ - if (!brw_reg_type_is_floating_point(scan_inst->dst.type) && - scan_inst->opcode == BRW_OPCODE_MUL) - break; - - /* Otherwise, try propagating the conditional. */ - enum brw_conditional_mod cond = - inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod) - : inst->conditional_mod; - - if (scan_inst->can_do_cmod() && - ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) || - scan_inst->conditional_mod == cond)) { - scan_inst->conditional_mod = cond; - inst->remove(block); - progress = true; - } - break; - } - - not_match: - if (scan_inst->writes_flag(v->devinfo)) - break; - - read_flag = read_flag || scan_inst->reads_flag(); - } - } - - return progress; -} - -bool -vec4_visitor::opt_cmod_propagation() -{ - bool progress = false; - - foreach_block_reverse(block, cfg) { - progress = opt_cmod_propagation_local(block, this) || progress; - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} - -} /* namespace brw */ diff --git a/src/intel/compiler/brw_vec4_copy_propagation.cpp b/src/intel/compiler/brw_vec4_copy_propagation.cpp deleted file mode 100644 index fd535fd88af..00000000000 --- a/src/intel/compiler/brw_vec4_copy_propagation.cpp +++ /dev/null @@ -1,556 +0,0 @@ -/* - * Copyright © 2011 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -/** - * @file brw_vec4_copy_propagation.cpp - * - * Implements tracking of values copied between registers, and - * optimizations based on that: copy propagation and constant - * propagation. - */ - -#include "brw_vec4.h" -#include "brw_cfg.h" -#include "brw_eu.h" - -namespace brw { - -struct copy_entry { - src_reg *value[4]; - int saturatemask; -}; - -static bool -is_direct_copy(vec4_instruction *inst) -{ - return (inst->opcode == BRW_OPCODE_MOV && - !inst->predicate && - inst->dst.file == VGRF && - inst->dst.offset % REG_SIZE == 0 && - !inst->dst.reladdr && - !inst->src[0].reladdr && - (inst->dst.type == inst->src[0].type || - (inst->dst.type == BRW_REGISTER_TYPE_F && - inst->src[0].type == BRW_REGISTER_TYPE_VF))); -} - -static bool -is_dominated_by_previous_instruction(vec4_instruction *inst) -{ - return (inst->opcode != BRW_OPCODE_DO && - inst->opcode != BRW_OPCODE_WHILE && - inst->opcode != BRW_OPCODE_ELSE && - inst->opcode != BRW_OPCODE_ENDIF); -} - -static bool -is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch) -{ - const src_reg *src = values[ch]; - - /* consider GRF only */ - assert(inst->dst.file == VGRF); - if (!src || src->file != VGRF) - return false; - - return regions_overlap(*src, REG_SIZE, inst->dst, inst->size_written) && - (inst->dst.offset != src->offset || - inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch))); -} - -/** - * Get the origin of a copy as a single register if all components present in - * the given readmask 
originate from the same register and have compatible - * regions, otherwise return a BAD_FILE register. - */ -static src_reg -get_copy_value(const copy_entry &entry, unsigned readmask) -{ - unsigned swz[4] = {}; - src_reg value; - - for (unsigned i = 0; i < 4; i++) { - if (readmask & (1 << i)) { - if (entry.value[i]) { - src_reg src = *entry.value[i]; - - if (src.file == IMM) { - swz[i] = i; - } else { - swz[i] = BRW_GET_SWZ(src.swizzle, i); - /* Overwrite the original swizzle so the src_reg::equals call - * below doesn't care about it, the correct swizzle will be - * calculated once the swizzles of all components are known. - */ - src.swizzle = BRW_SWIZZLE_XYZW; - } - - if (value.file == BAD_FILE) { - value = src; - } else if (!value.equals(src)) { - return src_reg(); - } - } else { - return src_reg(); - } - } - } - - return swizzle(value, - brw_compose_swizzle(brw_swizzle_for_mask(readmask), - BRW_SWIZZLE4(swz[0], swz[1], - swz[2], swz[3]))); -} - -static bool -try_constant_propagate(vec4_instruction *inst, - int arg, const copy_entry *entry) -{ - /* For constant propagation, we only handle the same constant - * across all 4 channels. Some day, we should handle the 8-bit - * float vector format, which would let us constant propagate - * vectors better. - * We could be more aggressive here -- some channels might not get used - * based on the destination writemask. - */ - src_reg value = - get_copy_value(*entry, - brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle, - WRITEMASK_XYZW)); - - if (value.file != IMM) - return false; - - /* 64-bit types can't be used except for one-source instructions, which - * higher levels should have constant folded away, so there's no point in - * propagating immediates here. - */ - if (type_sz(value.type) == 8 || type_sz(inst->src[arg].type) == 8) - return false; - - if (value.type == BRW_REGISTER_TYPE_VF) { - /* The result of bit-casting the component values of a vector float - * cannot in general be represented as an immediate. 
- */ - if (inst->src[arg].type != BRW_REGISTER_TYPE_F) - return false; - } else { - value.type = inst->src[arg].type; - } - - if (inst->src[arg].abs) { - if (!brw_abs_immediate(value.type, &value.as_brw_reg())) - return false; - } - - if (inst->src[arg].negate) { - if (!brw_negate_immediate(value.type, &value.as_brw_reg())) - return false; - } - - value = swizzle(value, inst->src[arg].swizzle); - - switch (inst->opcode) { - case BRW_OPCODE_MOV: - case SHADER_OPCODE_BROADCAST: - inst->src[arg] = value; - return true; - - case VEC4_OPCODE_UNTYPED_ATOMIC: - if (arg == 1) { - inst->src[arg] = value; - return true; - } - break; - - case SHADER_OPCODE_POW: - case SHADER_OPCODE_INT_QUOTIENT: - case SHADER_OPCODE_INT_REMAINDER: - break; - case BRW_OPCODE_DP2: - case BRW_OPCODE_DP3: - case BRW_OPCODE_DP4: - case BRW_OPCODE_DPH: - case BRW_OPCODE_BFI1: - case BRW_OPCODE_ASR: - case BRW_OPCODE_SHL: - case BRW_OPCODE_SHR: - case BRW_OPCODE_SUBB: - if (arg == 1) { - inst->src[arg] = value; - return true; - } - break; - - case BRW_OPCODE_MACH: - case BRW_OPCODE_MUL: - case SHADER_OPCODE_MULH: - case BRW_OPCODE_ADD: - case BRW_OPCODE_OR: - case BRW_OPCODE_AND: - case BRW_OPCODE_XOR: - case BRW_OPCODE_ADDC: - if (arg == 1) { - inst->src[arg] = value; - return true; - } else if (arg == 0 && inst->src[1].file != IMM) { - /* Fit this constant in by commuting the operands. Exception: we - * can't do this for 32-bit integer MUL/MACH because it's asymmetric. - */ - if ((inst->opcode == BRW_OPCODE_MUL || - inst->opcode == BRW_OPCODE_MACH) && - (inst->src[1].type == BRW_REGISTER_TYPE_D || - inst->src[1].type == BRW_REGISTER_TYPE_UD)) - break; - inst->src[0] = inst->src[1]; - inst->src[1] = value; - return true; - } - break; - case GS_OPCODE_SET_WRITE_OFFSET: - /* This is just a multiply by a constant with special strides. - * The generator will handle immediates in both arguments (generating - * a single MOV of the product). So feel free to propagate in src0. 
- */ - inst->src[arg] = value; - return true; - - case BRW_OPCODE_CMP: - if (arg == 1) { - inst->src[arg] = value; - return true; - } else if (arg == 0 && inst->src[1].file != IMM) { - enum brw_conditional_mod new_cmod; - - new_cmod = brw_swap_cmod(inst->conditional_mod); - if (new_cmod != BRW_CONDITIONAL_NONE) { - /* Fit this constant in by swapping the operands and - * flipping the test. - */ - inst->src[0] = inst->src[1]; - inst->src[1] = value; - inst->conditional_mod = new_cmod; - return true; - } - } - break; - - case BRW_OPCODE_SEL: - if (arg == 1) { - inst->src[arg] = value; - return true; - } else if (arg == 0 && inst->src[1].file != IMM) { - inst->src[0] = inst->src[1]; - inst->src[1] = value; - - /* If this was predicated, flipping operands means - * we also need to flip the predicate. - */ - if (inst->conditional_mod == BRW_CONDITIONAL_NONE) { - inst->predicate_inverse = !inst->predicate_inverse; - } - return true; - } - break; - - default: - break; - } - - return false; -} - -static bool -is_align1_opcode(unsigned opcode) -{ - switch (opcode) { - case VEC4_OPCODE_DOUBLE_TO_F32: - case VEC4_OPCODE_DOUBLE_TO_D32: - case VEC4_OPCODE_DOUBLE_TO_U32: - case VEC4_OPCODE_TO_DOUBLE: - case VEC4_OPCODE_PICK_LOW_32BIT: - case VEC4_OPCODE_PICK_HIGH_32BIT: - case VEC4_OPCODE_SET_LOW_32BIT: - case VEC4_OPCODE_SET_HIGH_32BIT: - return true; - default: - return false; - } -} - -static bool -try_copy_propagate(const struct brw_compiler *compiler, - vec4_instruction *inst, int arg, - const copy_entry *entry, int attributes_per_reg) -{ - const struct intel_device_info *devinfo = compiler->devinfo; - - /* Build up the value we are propagating as if it were the source of a - * single MOV - */ - src_reg value = - get_copy_value(*entry, - brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle, - WRITEMASK_XYZW)); - - /* Check that we can propagate that value */ - if (value.file != UNIFORM && - value.file != VGRF && - value.file != ATTR) - return false; - - /* Instructions that 
write 2 registers also need to read 2 registers. Make - * sure we don't break that restriction by copy propagating from a uniform. - */ - if (inst->size_written > REG_SIZE && is_uniform(value)) - return false; - - /* There is a regioning restriction such that if execsize == width - * and hstride != 0 then the vstride can't be 0. When we split instrutions - * that take a single-precision source (like F->DF conversions) we end up - * with a 4-wide source on an instruction with an execution size of 4. - * If we then copy-propagate the source from a uniform we also end up with a - * vstride of 0 and we violate the restriction. - */ - if (inst->exec_size == 4 && value.file == UNIFORM && - type_sz(value.type) == 4) - return false; - - /* If the type of the copy value is different from the type of the - * instruction then the swizzles and writemasks involved don't have the same - * meaning and simply replacing the source would produce different semantics. - */ - if (type_sz(value.type) != type_sz(inst->src[arg].type)) - return false; - - if (inst->src[arg].offset % REG_SIZE || value.offset % REG_SIZE) - return false; - - bool has_source_modifiers = value.negate || value.abs; - - /* gfx6 math and gfx7+ SENDs from GRFs ignore source modifiers on - * instructions. - */ - if (has_source_modifiers && !inst->can_do_source_mods(devinfo)) - return false; - - /* Reject cases that would violate register regioning restrictions. 
*/ - if ((value.file == UNIFORM || value.swizzle != BRW_SWIZZLE_XYZW) && - ((devinfo->ver == 6 && inst->is_math()) || - inst->is_send_from_grf() || - inst->uses_indirect_addressing())) { - return false; - } - - if (has_source_modifiers && - value.type != inst->src[arg].type && - !inst->can_change_types()) - return false; - - if (has_source_modifiers && - (inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_WRITE || - inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)) - return false; - - unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle, - value.swizzle); - - /* Instructions that operate on vectors in ALIGN1 mode will ignore swizzles - * so copy-propagation won't be safe if the composed swizzle is anything - * other than the identity. - */ - if (is_align1_opcode(inst->opcode) && composed_swizzle != BRW_SWIZZLE_XYZW) - return false; - - if (inst->is_3src(compiler) && - (value.file == UNIFORM || - (value.file == ATTR && attributes_per_reg != 1)) && - !brw_is_single_value_swizzle(composed_swizzle)) - return false; - - if (inst->is_send_from_grf()) - return false; - - /* we can't generally copy-propagate UD negations because we - * end up accessing the resulting values as signed integers - * instead. See also resolve_ud_negate(). - */ - if (value.negate && - value.type == BRW_REGISTER_TYPE_UD) - return false; - - /* Don't report progress if this is a noop. */ - if (value.equals(inst->src[arg])) - return false; - - const unsigned dst_saturate_mask = inst->dst.writemask & - brw_apply_swizzle_to_mask(inst->src[arg].swizzle, entry->saturatemask); - - if (dst_saturate_mask) { - /* We either saturate all or nothing. */ - if (dst_saturate_mask != inst->dst.writemask) - return false; - - /* Limit saturate propagation only to SEL with src1 bounded within 0.0 - * and 1.0, otherwise skip copy propagate altogether. 
- */ - switch(inst->opcode) { - case BRW_OPCODE_SEL: - if (arg != 0 || - inst->src[0].type != BRW_REGISTER_TYPE_F || - inst->src[1].file != IMM || - inst->src[1].type != BRW_REGISTER_TYPE_F || - inst->src[1].f < 0.0 || - inst->src[1].f > 1.0) { - return false; - } - if (!inst->saturate) - inst->saturate = true; - break; - default: - return false; - } - } - - /* Build the final value */ - if (inst->src[arg].abs) { - value.negate = false; - value.abs = true; - } - if (inst->src[arg].negate) - value.negate = !value.negate; - - value.swizzle = composed_swizzle; - if (has_source_modifiers && - value.type != inst->src[arg].type) { - assert(inst->can_change_types()); - for (int i = 0; i < 3; i++) { - inst->src[i].type = value.type; - } - inst->dst.type = value.type; - } else { - value.type = inst->src[arg].type; - } - - inst->src[arg] = value; - return true; -} - -bool -vec4_visitor::opt_copy_propagation(bool do_constant_prop) -{ - /* If we are in dual instanced or single mode, then attributes are going - * to be interleaved, so one register contains two attribute slots. - */ - const int attributes_per_reg = - prog_data->dispatch_mode == INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2; - bool progress = false; - struct copy_entry entries[alloc.total_size]; - - memset(&entries, 0, sizeof(entries)); - - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - /* This pass only works on basic blocks. If there's flow - * control, throw out all our information and start from - * scratch. - * - * This should really be fixed by using a structure like in - * src/glsl/opt_copy_propagation.cpp to track available copies. 
- */ - if (!is_dominated_by_previous_instruction(inst)) { - memset(&entries, 0, sizeof(entries)); - continue; - } - - /* For each source arg, see if each component comes from a copy - * from the same type file (IMM, VGRF, UNIFORM), and try - * optimizing out access to the copy result - */ - for (int i = 2; i >= 0; i--) { - /* Copied values end up in GRFs, and we don't track reladdr - * accesses. - */ - if (inst->src[i].file != VGRF || - inst->src[i].reladdr) - continue; - - /* We only handle register-aligned single GRF copies. */ - if (inst->size_read(i) != REG_SIZE || - inst->src[i].offset % REG_SIZE) - continue; - - const unsigned reg = (alloc.offsets[inst->src[i].nr] + - inst->src[i].offset / REG_SIZE); - const copy_entry &entry = entries[reg]; - - if (do_constant_prop && try_constant_propagate(inst, i, &entry)) - progress = true; - else if (try_copy_propagate(compiler, inst, i, &entry, attributes_per_reg)) - progress = true; - } - - /* Track available source registers. */ - if (inst->dst.file == VGRF) { - const int reg = - alloc.offsets[inst->dst.nr] + inst->dst.offset / REG_SIZE; - - /* Update our destination's current channel values. For a direct copy, - * the value is the newly propagated source. Otherwise, we don't know - * the new value, so clear it. - */ - bool direct_copy = is_direct_copy(inst); - entries[reg].saturatemask &= ~inst->dst.writemask; - for (int i = 0; i < 4; i++) { - if (inst->dst.writemask & (1 << i)) { - entries[reg].value[i] = direct_copy ? &inst->src[0] : NULL; - entries[reg].saturatemask |= - inst->saturate && direct_copy ? 1 << i : 0; - } - } - - /* Clear the records for any registers whose current value came from - * our destination's updated channels, as the two are no longer equal. 
- */ - if (inst->dst.reladdr) - memset(&entries, 0, sizeof(entries)); - else { - for (unsigned i = 0; i < alloc.total_size; i++) { - for (int j = 0; j < 4; j++) { - if (is_channel_updated(inst, entries[i].value, j)) { - entries[i].value[j] = NULL; - entries[i].saturatemask &= ~(1 << j); - } - } - } - } - } - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW | - DEPENDENCY_INSTRUCTION_DETAIL); - - return progress; -} - -} /* namespace brw */ diff --git a/src/intel/compiler/brw_vec4_cse.cpp b/src/intel/compiler/brw_vec4_cse.cpp deleted file mode 100644 index c4c9ea68e15..00000000000 --- a/src/intel/compiler/brw_vec4_cse.cpp +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Copyright © 2012, 2013, 2014 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "brw_vec4.h" -#include "brw_vec4_live_variables.h" -#include "brw_cfg.h" - -using namespace brw; - -/** @file brw_vec4_cse.cpp - * - * Support for local common subexpression elimination. - * - * See Muchnick's Advanced Compiler Design and Implementation, section - * 13.1 (p378). - */ - -namespace { -struct aeb_entry : public exec_node { - /** The instruction that generates the expression value. */ - vec4_instruction *generator; - - /** The temporary where the value is stored. */ - src_reg tmp; -}; -} - -static bool -is_expression(const vec4_instruction *const inst) -{ - switch (inst->opcode) { - case BRW_OPCODE_MOV: - case BRW_OPCODE_SEL: - case BRW_OPCODE_NOT: - case BRW_OPCODE_AND: - case BRW_OPCODE_OR: - case BRW_OPCODE_XOR: - case BRW_OPCODE_SHR: - case BRW_OPCODE_SHL: - case BRW_OPCODE_ASR: - case BRW_OPCODE_CMP: - case BRW_OPCODE_CMPN: - case BRW_OPCODE_ADD: - case BRW_OPCODE_MUL: - case SHADER_OPCODE_MULH: - case BRW_OPCODE_FRC: - case BRW_OPCODE_RNDU: - case BRW_OPCODE_RNDD: - case BRW_OPCODE_RNDE: - case BRW_OPCODE_RNDZ: - case BRW_OPCODE_LINE: - case BRW_OPCODE_PLN: - case BRW_OPCODE_MAD: - case BRW_OPCODE_LRP: - case VEC4_OPCODE_UNPACK_UNIFORM: - case SHADER_OPCODE_FIND_LIVE_CHANNEL: - case SHADER_OPCODE_BROADCAST: - case VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS: - case VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: - return true; - case SHADER_OPCODE_RCP: - case SHADER_OPCODE_RSQ: - case SHADER_OPCODE_SQRT: - case SHADER_OPCODE_EXP2: - case SHADER_OPCODE_LOG2: - case SHADER_OPCODE_POW: - case SHADER_OPCODE_INT_QUOTIENT: - case SHADER_OPCODE_INT_REMAINDER: - case SHADER_OPCODE_SIN: - case SHADER_OPCODE_COS: - return inst->mlen == 0; - default: - return false; - } -} - -static bool -operands_match(const vec4_instruction *a, const vec4_instruction *b) -{ - const src_reg *xs = a->src; - const src_reg *ys = b->src; - - if (a->opcode == BRW_OPCODE_MAD) { - return xs[0].equals(ys[0]) && - ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) || - 
(xs[2].equals(ys[1]) && xs[1].equals(ys[2]))); - } else if (a->opcode == BRW_OPCODE_MOV && - xs[0].file == IMM && - xs[0].type == BRW_REGISTER_TYPE_VF) { - src_reg tmp_x = xs[0]; - src_reg tmp_y = ys[0]; - - /* Smash out the values that are not part of the writemask. Otherwise - * the equals operator will fail due to mismatches in unused components. - */ - const unsigned ab_writemask = a->dst.writemask & b->dst.writemask; - const uint32_t mask = ((ab_writemask & WRITEMASK_X) ? 0x000000ff : 0) | - ((ab_writemask & WRITEMASK_Y) ? 0x0000ff00 : 0) | - ((ab_writemask & WRITEMASK_Z) ? 0x00ff0000 : 0) | - ((ab_writemask & WRITEMASK_W) ? 0xff000000 : 0); - - tmp_x.ud &= mask; - tmp_y.ud &= mask; - - return tmp_x.equals(tmp_y); - } else if (!a->is_commutative()) { - return xs[0].equals(ys[0]) && xs[1].equals(ys[1]) && xs[2].equals(ys[2]); - } else { - return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) || - (xs[1].equals(ys[0]) && xs[0].equals(ys[1])); - } -} - -/** - * Checks if instructions match, exactly for sources, but loosely for - * destination writemasks. - * - * \param 'a' is the generating expression from the AEB entry. - * \param 'b' is the second occurrence of the expression that we're - * considering eliminating. 
- */ -static bool -instructions_match(vec4_instruction *a, vec4_instruction *b) -{ - return a->opcode == b->opcode && - a->saturate == b->saturate && - a->predicate == b->predicate && - a->predicate_inverse == b->predicate_inverse && - a->conditional_mod == b->conditional_mod && - a->flag_subreg == b->flag_subreg && - a->dst.type == b->dst.type && - a->offset == b->offset && - a->mlen == b->mlen && - a->base_mrf == b->base_mrf && - a->header_size == b->header_size && - a->shadow_compare == b->shadow_compare && - ((a->dst.writemask & b->dst.writemask) == a->dst.writemask) && - a->force_writemask_all == b->force_writemask_all && - a->size_written == b->size_written && - a->exec_size == b->exec_size && - a->group == b->group && - operands_match(a, b); -} - -bool -vec4_visitor::opt_cse_local(bblock_t *block, const vec4_live_variables &live) -{ - bool progress = false; - exec_list aeb; - - void *cse_ctx = ralloc_context(NULL); - - int ip = block->start_ip; - foreach_inst_in_block (vec4_instruction, inst, block) { - /* Skip some cases. */ - if (is_expression(inst) && !inst->predicate && inst->mlen == 0 && - ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) || - inst->dst.is_null())) - { - bool found = false; - - foreach_in_list_use_after(aeb_entry, entry, &aeb) { - /* Match current instruction's expression against those in AEB. */ - if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) && - instructions_match(inst, entry->generator)) { - found = true; - progress = true; - break; - } - } - - if (!found) { - if (inst->opcode != BRW_OPCODE_MOV || - (inst->opcode == BRW_OPCODE_MOV && - inst->src[0].file == IMM && - inst->src[0].type == BRW_REGISTER_TYPE_VF)) { - /* Our first sighting of this expression. Create an entry. */ - aeb_entry *entry = ralloc(cse_ctx, aeb_entry); - entry->tmp = src_reg(); /* file will be BAD_FILE */ - entry->generator = inst; - aeb.push_tail(entry); - } - } else { - /* This is at least our second sighting of this expression. 
- * If we don't have a temporary already, make one. - */ - bool no_existing_temp = entry->tmp.file == BAD_FILE; - if (no_existing_temp && !entry->generator->dst.is_null()) { - entry->tmp = retype(src_reg(VGRF, alloc.allocate( - regs_written(entry->generator)), - NULL), inst->dst.type); - - const unsigned width = entry->generator->exec_size; - unsigned component_size = width * type_sz(entry->tmp.type); - unsigned num_copy_movs = - DIV_ROUND_UP(entry->generator->size_written, component_size); - for (unsigned i = 0; i < num_copy_movs; ++i) { - vec4_instruction *copy = - MOV(offset(entry->generator->dst, width, i), - offset(entry->tmp, width, i)); - copy->exec_size = width; - copy->group = entry->generator->group; - copy->force_writemask_all = - entry->generator->force_writemask_all; - entry->generator->insert_after(block, copy); - } - - entry->generator->dst = dst_reg(entry->tmp); - } - - /* dest <- temp */ - if (!inst->dst.is_null()) { - assert(inst->dst.type == entry->tmp.type); - const unsigned width = inst->exec_size; - unsigned component_size = width * type_sz(inst->dst.type); - unsigned num_copy_movs = - DIV_ROUND_UP(inst->size_written, component_size); - for (unsigned i = 0; i < num_copy_movs; ++i) { - vec4_instruction *copy = - MOV(offset(inst->dst, width, i), - offset(entry->tmp, width, i)); - copy->exec_size = inst->exec_size; - copy->group = inst->group; - copy->force_writemask_all = inst->force_writemask_all; - inst->insert_before(block, copy); - } - } - - /* Set our iterator so that next time through the loop inst->next - * will get the instruction in the basic block after the one we've - * removed. - */ - vec4_instruction *prev = (vec4_instruction *)inst->prev; - - inst->remove(block); - inst = prev; - } - } - - foreach_in_list_safe(aeb_entry, entry, &aeb) { - /* Kill all AEB entries that write a different value to or read from - * the flag register if we just wrote it. 
- */ - if (inst->writes_flag(devinfo)) { - if (entry->generator->reads_flag() || - (entry->generator->writes_flag(devinfo) && - !instructions_match(inst, entry->generator))) { - entry->remove(); - ralloc_free(entry); - continue; - } - } - - for (int i = 0; i < 3; i++) { - src_reg *src = &entry->generator->src[i]; - - /* Kill all AEB entries that use the destination we just - * overwrote. - */ - if (inst->dst.file == entry->generator->src[i].file && - inst->dst.nr == entry->generator->src[i].nr) { - entry->remove(); - ralloc_free(entry); - break; - } - - /* Kill any AEB entries using registers that don't get reused any - * more -- a sure sign they'll fail operands_match(). - */ - if (src->file == VGRF) { - if (live.var_range_end(var_from_reg(alloc, dst_reg(*src)), 8) < ip) { - entry->remove(); - ralloc_free(entry); - break; - } - } - } - } - - ip++; - } - - ralloc_free(cse_ctx); - - return progress; -} - -bool -vec4_visitor::opt_cse() -{ - bool progress = false; - const vec4_live_variables &live = live_analysis.require(); - - foreach_block (block, cfg) { - progress = opt_cse_local(block, live) || progress; - } - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - - return progress; -} diff --git a/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp b/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp deleted file mode 100644 index 10a64a56143..00000000000 --- a/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp +++ /dev/null @@ -1,188 +0,0 @@ -/* - * Copyright © 2014 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following 
conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "brw_vec4.h" -#include "brw_vec4_live_variables.h" -#include "brw_cfg.h" - -/** @file brw_vec4_dead_code_eliminate.cpp - * - * Dataflow-aware dead code elimination. - * - * Walks the instruction list from the bottom, removing instructions that - * have results that both aren't used in later blocks and haven't been read - * yet in the tail end of this block. 
- */ - -using namespace brw; - -bool -vec4_visitor::dead_code_eliminate() -{ - bool progress = false; - - const vec4_live_variables &live_vars = live_analysis.require(); - int num_vars = live_vars.num_vars; - BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars)); - BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1); - - foreach_block_reverse_safe(block, cfg) { - memcpy(live, live_vars.block_data[block->num].liveout, - sizeof(BITSET_WORD) * BITSET_WORDS(num_vars)); - memcpy(flag_live, live_vars.block_data[block->num].flag_liveout, - sizeof(BITSET_WORD)); - - foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) { - if ((inst->dst.file == VGRF && !inst->has_side_effects()) || - (inst->dst.is_null() && inst->writes_flag(devinfo))){ - bool result_live[4] = { false }; - if (inst->dst.file == VGRF) { - for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) { - for (int c = 0; c < 4; c++) { - const unsigned v = var_from_reg(alloc, inst->dst, c, i); - result_live[c] |= BITSET_TEST(live, v); - } - } - } else { - for (unsigned c = 0; c < 4; c++) - result_live[c] = BITSET_TEST(flag_live, c); - } - - /* If the instruction can't do writemasking, then it's all or - * nothing. - */ - if (!inst->can_do_writemask(devinfo)) { - bool result = result_live[0] | result_live[1] | - result_live[2] | result_live[3]; - result_live[0] = result; - result_live[1] = result; - result_live[2] = result; - result_live[3] = result; - } - - if (inst->writes_flag(devinfo)) { - /* Independently calculate the usage of the flag components and - * the destination value components. 
- */ - uint8_t flag_mask = inst->dst.writemask; - uint8_t dest_mask = inst->dst.writemask; - - for (int c = 0; c < 4; c++) { - if (!result_live[c] && dest_mask & (1 << c)) - dest_mask &= ~(1 << c); - - if (!BITSET_TEST(flag_live, c)) - flag_mask &= ~(1 << c); - } - - if (inst->dst.writemask != (flag_mask | dest_mask)) { - progress = true; - inst->dst.writemask = flag_mask | dest_mask; - } - - /* If none of the destination components are read, replace the - * destination register with the NULL register. - */ - if (dest_mask == 0) { - progress = true; - inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type)); - } - } else { - for (int c = 0; c < 4; c++) { - if (!result_live[c] && inst->dst.writemask & (1 << c)) { - inst->dst.writemask &= ~(1 << c); - progress = true; - - if (inst->dst.writemask == 0) { - if (inst->writes_accumulator) { - inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type)); - } else { - inst->opcode = BRW_OPCODE_NOP; - break; - } - } - } - } - } - } - - if (inst->dst.is_null() && inst->writes_flag(devinfo)) { - bool combined_live = false; - for (unsigned c = 0; c < 4; c++) - combined_live |= BITSET_TEST(flag_live, c); - - if (!combined_live) { - inst->opcode = BRW_OPCODE_NOP; - progress = true; - } - } - - if (inst->dst.file == VGRF && !inst->predicate && - !inst->is_align1_partial_write()) { - for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) { - for (int c = 0; c < 4; c++) { - if (inst->dst.writemask & (1 << c)) { - const unsigned v = var_from_reg(alloc, inst->dst, c, i); - BITSET_CLEAR(live, v); - } - } - } - } - - if (inst->writes_flag(devinfo) && !inst->predicate && inst->exec_size == 8) { - for (unsigned c = 0; c < 4; c++) - BITSET_CLEAR(flag_live, c); - } - - if (inst->opcode == BRW_OPCODE_NOP) { - inst->remove(block); - continue; - } - - for (int i = 0; i < 3; i++) { - if (inst->src[i].file == VGRF) { - for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) { - for (int c = 0; c < 4; c++) { - const 
unsigned v = var_from_reg(alloc, inst->src[i], c, j); - BITSET_SET(live, v); - } - } - } - } - - for (unsigned c = 0; c < 4; c++) { - if (inst->reads_flag(c)) { - BITSET_SET(flag_live, c); - } - } - } - } - - ralloc_free(live); - ralloc_free(flag_live); - - if (progress) - invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp deleted file mode 100644 index df414189f4b..00000000000 --- a/src/intel/compiler/brw_vec4_generator.cpp +++ /dev/null @@ -1,2319 +0,0 @@ -/* Copyright © 2011 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "brw_vec4.h" -#include "brw_cfg.h" -#include "brw_eu.h" -#include "brw_disasm_info.h" -#include "dev/intel_debug.h" -#include "util/mesa-sha1.h" - -using namespace brw; - -static void -generate_math1_gfx4(struct brw_codegen *p, - vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src) -{ - gfx4_math(p, - dst, - brw_math_function(inst->opcode), - inst->base_mrf, - src, - BRW_MATH_PRECISION_FULL); -} - -static void -check_gfx6_math_src_arg(struct brw_reg src) -{ - /* Source swizzles are ignored. */ - assert(!src.abs); - assert(!src.negate); - assert(src.swizzle == BRW_SWIZZLE_XYZW); -} - -static void -generate_math_gfx6(struct brw_codegen *p, - vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1) -{ - /* Can't do writemask because math can't be align16. */ - assert(dst.writemask == WRITEMASK_XYZW); - /* Source swizzles are ignored. */ - check_gfx6_math_src_arg(src0); - if (src1.file == BRW_GENERAL_REGISTER_FILE) - check_gfx6_math_src_arg(src1); - - brw_set_default_access_mode(p, BRW_ALIGN_1); - gfx6_math(p, dst, brw_math_function(inst->opcode), src0, src1); - brw_set_default_access_mode(p, BRW_ALIGN_16); -} - -static void -generate_math2_gfx4(struct brw_codegen *p, - vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1) -{ - /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13 - * "Message Payload": - * - * "Operand0[7]. For the INT DIV functions, this operand is the - * denominator." - * ... - * "Operand1[7]. For the INT DIV functions, this operand is the - * numerator." - */ - bool is_int_div = inst->opcode != SHADER_OPCODE_POW; - struct brw_reg &op0 = is_int_div ? src1 : src0; - struct brw_reg &op1 = is_int_div ? 
src0 : src1; - - brw_push_insn_state(p); - brw_set_default_saturate(p, false); - brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); - brw_set_default_flag_reg(p, 0, 0); - brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), op1.type), op1); - brw_pop_insn_state(p); - - gfx4_math(p, - dst, - brw_math_function(inst->opcode), - inst->base_mrf, - op0, - BRW_MATH_PRECISION_FULL); -} - -static void -generate_tex(struct brw_codegen *p, - struct brw_vue_prog_data *prog_data, - gl_shader_stage stage, - vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src, - struct brw_reg surface_index, - struct brw_reg sampler_index) -{ - const struct intel_device_info *devinfo = p->devinfo; - int msg_type = -1; - - if (devinfo->ver >= 5) { - switch (inst->opcode) { - case SHADER_OPCODE_TEX: - case SHADER_OPCODE_TXL: - if (inst->shadow_compare) { - msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE; - } else { - msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LOD; - } - break; - case SHADER_OPCODE_TXD: - if (inst->shadow_compare) { - /* Gfx7.5+. Otherwise, lowered by brw_lower_texture_gradients(). 
*/ - assert(devinfo->verx10 == 75); - msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE; - } else { - msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_DERIVS; - } - break; - case SHADER_OPCODE_TXF: - msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD; - break; - case SHADER_OPCODE_TXF_CMS: - if (devinfo->ver >= 7) - msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_LD2DMS; - else - msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_LD; - break; - case SHADER_OPCODE_TXF_MCS: - assert(devinfo->ver >= 7); - msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_LD_MCS; - break; - case SHADER_OPCODE_TXS: - msg_type = GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO; - break; - case SHADER_OPCODE_TG4: - if (inst->shadow_compare) { - msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C; - } else { - msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4; - } - break; - case SHADER_OPCODE_TG4_OFFSET: - if (inst->shadow_compare) { - msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C; - } else { - msg_type = GFX7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO; - } - break; - case SHADER_OPCODE_SAMPLEINFO: - msg_type = GFX6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO; - break; - default: - unreachable("should not get here: invalid vec4 texture opcode"); - } - } else { - switch (inst->opcode) { - case SHADER_OPCODE_TEX: - case SHADER_OPCODE_TXL: - if (inst->shadow_compare) { - msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE; - assert(inst->mlen == 3); - } else { - msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD; - assert(inst->mlen == 2); - } - break; - case SHADER_OPCODE_TXD: - /* There is no sample_d_c message; comparisons are done manually. 
*/ - msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS; - assert(inst->mlen == 4); - break; - case SHADER_OPCODE_TXF: - msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD; - assert(inst->mlen == 2); - break; - case SHADER_OPCODE_TXS: - msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO; - assert(inst->mlen == 2); - break; - default: - unreachable("should not get here: invalid vec4 texture opcode"); - } - } - - assert(msg_type != -1); - - assert(sampler_index.type == BRW_REGISTER_TYPE_UD); - - /* Load the message header if present. If there's a texture offset, we need - * to set it up explicitly and load the offset bitfield. Otherwise, we can - * use an implied move from g0 to the first message register. - */ - if (inst->header_size != 0) { - if (devinfo->ver < 6 && !inst->offset) { - /* Set up an implied move from g0 to the MRF. */ - src = brw_vec8_grf(0, 0); - } else { - struct brw_reg header = - retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD); - uint32_t dw2 = 0; - - /* Explicitly set up the message header by copying g0 to the MRF. */ - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - - brw_set_default_access_mode(p, BRW_ALIGN_1); - - if (inst->offset) - /* Set the texel offset bits in DWord 2. */ - dw2 = inst->offset; - - /* The VS, DS, and FS stages have the g0.2 payload delivered as 0, - * so header0.2 is 0 when g0 is copied. The HS and GS stages do - * not, so we must set to to 0 to avoid setting undesirable bits - * in the message header. 
- */ - if (dw2 || - stage == MESA_SHADER_TESS_CTRL || - stage == MESA_SHADER_GEOMETRY) { - brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2)); - } - - brw_adjust_sampler_state_pointer(p, header, sampler_index); - brw_pop_insn_state(p); - } - } - - uint32_t return_format; - - switch (dst.type) { - case BRW_REGISTER_TYPE_D: - return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32; - break; - case BRW_REGISTER_TYPE_UD: - return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; - break; - default: - return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32; - break; - } - - /* Stomp the resinfo output type to UINT32. On gens 4-5, the output type - * is set as part of the message descriptor. On gfx4, the PRM seems to - * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on - * later gens UINT32 is required. Once you hit Sandy Bridge, the bit is - * gone from the message descriptor entirely and you just get UINT32 all - * the time regasrdless. Since we can really only do non-UINT32 on gfx4, - * just stomp it to UINT32 all the time. - */ - if (inst->opcode == SHADER_OPCODE_TXS) - return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32; - - if (surface_index.file == BRW_IMMEDIATE_VALUE && - sampler_index.file == BRW_IMMEDIATE_VALUE) { - uint32_t surface = surface_index.ud; - uint32_t sampler = sampler_index.ud; - - brw_SAMPLE(p, - dst, - inst->base_mrf, - src, - surface, - sampler % 16, - msg_type, - 1, /* response length */ - inst->mlen, - inst->header_size != 0, - BRW_SAMPLER_SIMD_MODE_SIMD4X2, - return_format); - } else { - /* Non-constant sampler index. 
*/ - - struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); - struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD)); - struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD)); - - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_set_default_access_mode(p, BRW_ALIGN_1); - - if (brw_regs_equal(&surface_reg, &sampler_reg)) { - brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101)); - } else { - if (sampler_reg.file == BRW_IMMEDIATE_VALUE) { - brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8)); - } else { - brw_SHL(p, addr, sampler_reg, brw_imm_ud(8)); - brw_OR(p, addr, addr, surface_reg); - } - } - brw_AND(p, addr, addr, brw_imm_ud(0xfff)); - - brw_pop_insn_state(p); - - if (inst->base_mrf != -1) - gfx6_resolve_implied_move(p, &src, inst->base_mrf); - - /* dst = send(offset, a0.0 | ) */ - brw_send_indirect_message( - p, BRW_SFID_SAMPLER, dst, src, addr, - brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) | - brw_sampler_desc(devinfo, - 0 /* surface */, - 0 /* sampler */, - msg_type, - BRW_SAMPLER_SIMD_MODE_SIMD4X2, - return_format), - false /* EOT */); - - /* visitor knows more than we do about the surface limit required, - * so has already done marking. 
- */ - } -} - -static void -generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst) -{ - brw_urb_WRITE(p, - brw_null_reg(), /* dest */ - inst->base_mrf, /* starting mrf reg nr */ - brw_vec8_grf(0, 0), /* src */ - inst->urb_write_flags, - inst->mlen, - 0, /* response len */ - inst->offset, /* urb destination offset */ - BRW_URB_SWIZZLE_INTERLEAVE); -} - -static void -generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst) -{ - struct brw_reg src = brw_message_reg(inst->base_mrf); - brw_urb_WRITE(p, - brw_null_reg(), /* dest */ - inst->base_mrf, /* starting mrf reg nr */ - src, - inst->urb_write_flags, - inst->mlen, - 0, /* response len */ - inst->offset, /* urb destination offset */ - BRW_URB_SWIZZLE_INTERLEAVE); -} - -static void -generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst) -{ - struct brw_reg src = brw_message_reg(inst->base_mrf); - - /* We pass the temporary passed in src0 as the writeback register */ - brw_urb_WRITE(p, - inst->src[0].as_brw_reg(), /* dest */ - inst->base_mrf, /* starting mrf reg nr */ - src, - BRW_URB_WRITE_ALLOCATE_COMPLETE, - inst->mlen, - 1, /* response len */ - inst->offset, /* urb destination offset */ - BRW_URB_SWIZZLE_INTERLEAVE); - - /* Now put allocated urb handle in dst.0 */ - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, get_element_ud(inst->dst.as_brw_reg(), 0), - get_element_ud(inst->src[0].as_brw_reg(), 0)); - brw_pop_insn_state(p); -} - -static void -generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst) -{ - struct brw_reg src = brw_message_reg(inst->base_mrf); - brw_urb_WRITE(p, - brw_null_reg(), /* dest */ - inst->base_mrf, /* starting mrf reg nr */ - src, - BRW_URB_WRITE_EOT | inst->urb_write_flags, - inst->mlen, - 0, /* response len */ - 0, /* urb destination offset */ - BRW_URB_SWIZZLE_INTERLEAVE); -} - -static void -generate_gs_set_write_offset(struct 
brw_codegen *p, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1) -{ - /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message - * Header: M0.3): - * - * Slot 0 Offset. This field, after adding to the Global Offset field - * in the message descriptor, specifies the offset (in 256-bit units) - * from the start of the URB entry, as referenced by URB Handle 0, at - * which the data will be accessed. - * - * Similar text describes DWORD M0.4, which is slot 1 offset. - * - * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components - * of the register for geometry shader invocations 0 and 1) by the - * immediate value in src1, and store the result in DWORDs 3 and 4 of dst. - * - * We can do this with the following EU instruction: - * - * mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW { Align1 WE_all } - */ - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - assert(p->devinfo->ver >= 7 && - src1.file == BRW_IMMEDIATE_VALUE && - src1.type == BRW_REGISTER_TYPE_UD && - src1.ud <= USHRT_MAX); - if (src0.file == BRW_IMMEDIATE_VALUE) { - brw_MOV(p, suboffset(stride(dst, 2, 2, 1), 3), - brw_imm_ud(src0.ud * src1.ud)); - } else { - if (src1.file == BRW_IMMEDIATE_VALUE) { - src1 = brw_imm_uw(src1.ud); - } - brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4), - retype(src1, BRW_REGISTER_TYPE_UW)); - } - brw_pop_insn_state(p); -} - -static void -generate_gs_set_vertex_count(struct brw_codegen *p, - struct brw_reg dst, - struct brw_reg src) -{ - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - - /* If we think of the src and dst registers as composed of 8 DWORDs each, - * we want to pick up the contents of DWORDs 0 and 4 from src, truncate - * them to WORDs, and then pack them into DWORD 2 of dst. 
- * - * It's easier to get the EU to do this if we think of the src and dst - * registers as composed of 16 WORDS each; then, we want to pick up the - * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5 - * of dst. - * - * We can do that by the following EU instruction: - * - * mov (2) dst.4<1>:uw src<8;1,0>:uw { Align1, Q1, NoMask } - */ - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_MOV(p, - suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4), - stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0)); - - brw_pop_insn_state(p); -} - -static void -generate_gs_svb_write(struct brw_codegen *p, - vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1) -{ - int binding = inst->sol_binding; - bool final_write = inst->sol_final_write; - - brw_push_insn_state(p); - brw_set_default_exec_size(p, BRW_EXECUTE_4); - /* Copy Vertex data into M0.x */ - brw_MOV(p, stride(dst, 4, 4, 1), - stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1)); - brw_pop_insn_state(p); - - brw_push_insn_state(p); - /* Send SVB Write */ - brw_svb_write(p, - final_write ? src1 : brw_null_reg(), /* dest == src1 */ - 1, /* msg_reg_nr */ - dst, /* src0 == previous dst */ - BRW_GFX6_SOL_BINDING_START + binding, /* binding_table_index */ - final_write); /* send_commit_msg */ - - /* Finally, wait for the write commit to occur so that we can proceed to - * other things safely. - * - * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3: - * - * The write commit does not modify the destination register, but - * merely clears the dependency associated with the destination - * register. Thus, a simple “mov” instruction using the register as a - * source is sufficient to wait for the write commit to occur. 
- */ - if (final_write) { - brw_MOV(p, src1, src1); - } - brw_pop_insn_state(p); -} - -static void -generate_gs_svb_set_destination_index(struct brw_codegen *p, - vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src) -{ - int vertex = inst->sol_vertex; - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, vertex)); - brw_pop_insn_state(p); -} - -static void -generate_gs_set_dword_2(struct brw_codegen *p, - struct brw_reg dst, - struct brw_reg src) -{ - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0)); - brw_pop_insn_state(p); -} - -static void -generate_gs_prepare_channel_masks(struct brw_codegen *p, - struct brw_reg dst) -{ - /* We want to left shift just DWORD 4 (the x component belonging to the - * second geometry shader invocation) by 4 bits. So generate the - * instruction: - * - * shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all } - */ - dst = suboffset(vec1(dst), 4); - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_SHL(p, dst, dst, brw_imm_ud(4)); - brw_pop_insn_state(p); -} - -static void -generate_gs_set_channel_masks(struct brw_codegen *p, - struct brw_reg dst, - struct brw_reg src) -{ - /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message - * Header: M0.5): - * - * 15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask - * - * When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1 - * DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls - * Vertex 0 DATA[7]. This bit is ANDed with the corresponding - * channel enable to determine the final channel enable. 
For the - * URB_READ_OWORD & URB_READ_HWORD messages, when final channel - * enable is 1 it indicates that Vertex 1 DATA [3] will be included - * in the writeback message. For the URB_WRITE_OWORD & - * URB_WRITE_HWORD messages, when final channel enable is 1 it - * indicates that Vertex 1 DATA [3] will be written to the surface. - * - * 0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included - * 1: Vertex DATA [3] / Vertex 0 DATA[7] channel included - * - * 14 Vertex 1 DATA [2] Channel Mask - * 13 Vertex 1 DATA [1] Channel Mask - * 12 Vertex 1 DATA [0] Channel Mask - * 11 Vertex 0 DATA [3] Channel Mask - * 10 Vertex 0 DATA [2] Channel Mask - * 9 Vertex 0 DATA [1] Channel Mask - * 8 Vertex 0 DATA [0] Channel Mask - * - * (This is from a section of the PRM that is agnostic to the particular - * type of shader being executed, so "Vertex 0" and "Vertex 1" refer to - * geometry shader invocations 0 and 1, respectively). Since we have the - * enable flags for geometry shader invocation 0 in bits 3:0 of DWORD 0, - * and the enable flags for geometry shader invocation 1 in bits 7:0 of - * DWORD 4, we just need to OR them together and store the result in bits - * 15:8 of DWORD 5. - * - * It's easier to get the EU to do this if we think of the src and dst - * registers as composed of 32 bytes each; then, we want to pick up the - * contents of bytes 0 and 16 from src, OR them together, and store them in - * byte 21. - * - * We can do that by the following EU instruction: - * - * or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all } - * - * Note: this relies on the source register having zeros in (a) bits 7:4 of - * DWORD 0 and (b) bits 3:0 of DWORD 4. 
We can rely on (b) because the - * source register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which - * shifts DWORD 4 left by 4 bits), and we can rely on (a) because prior to - * the execution of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to - * contain valid channel mask values (which are in the range 0x0-0xf). - */ - dst = retype(dst, BRW_REGISTER_TYPE_UB); - src = retype(src, BRW_REGISTER_TYPE_UB); - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_OR(p, suboffset(vec1(dst), 21), vec1(src), suboffset(vec1(src), 16)); - brw_pop_insn_state(p); -} - -static void -generate_gs_get_instance_id(struct brw_codegen *p, - struct brw_reg dst) -{ - /* We want to right shift R0.0 & R0.1 by GFX7_GS_PAYLOAD_INSTANCE_ID_SHIFT - * and store into dst.0 & dst.4. So generate the instruction: - * - * shr(8) dst<1> R0<1,4,0> GFX7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q } - */ - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - dst = retype(dst, BRW_REGISTER_TYPE_UD); - struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - brw_SHR(p, dst, stride(r0, 1, 4, 0), - brw_imm_ud(GFX7_GS_PAYLOAD_INSTANCE_ID_SHIFT)); - brw_pop_insn_state(p); -} - -static void -generate_gs_ff_sync_set_primitives(struct brw_codegen *p, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1, - struct brw_reg src2) -{ - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - /* Save src0 data in 16:31 bits of dst.0 */ - brw_AND(p, suboffset(vec1(dst), 0), suboffset(vec1(src0), 0), - brw_imm_ud(0xffffu)); - brw_SHL(p, suboffset(vec1(dst), 0), suboffset(vec1(dst), 0), brw_imm_ud(16)); - /* Save src1 data in 0:15 bits of dst.0 */ - brw_AND(p, suboffset(vec1(src2), 0), suboffset(vec1(src1), 0), - brw_imm_ud(0xffffu)); - brw_OR(p, suboffset(vec1(dst), 0), - suboffset(vec1(dst), 0), - suboffset(vec1(src2), 0)); - brw_pop_insn_state(p); -} - 
-static void -generate_gs_ff_sync(struct brw_codegen *p, - vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src0, - struct brw_reg src1) -{ - /* This opcode uses an implied MRF register for: - * - the header of the ff_sync message. And as such it is expected to be - * initialized to r0 before calling here. - * - the destination where we will write the allocated URB handle. - */ - struct brw_reg header = - retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD); - - /* Overwrite dword 0 of the header (SO vertices to write) and - * dword 1 (number of primitives written). - */ - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_MOV(p, get_element_ud(header, 0), get_element_ud(src1, 0)); - brw_MOV(p, get_element_ud(header, 1), get_element_ud(src0, 0)); - brw_pop_insn_state(p); - - /* Allocate URB handle in dst */ - brw_ff_sync(p, - dst, - 0, - header, - 1, /* allocate */ - 1, /* response length */ - 0 /* eot */); - - /* Now put allocated urb handle in header.0 */ - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, get_element_ud(header, 0), get_element_ud(dst, 0)); - - /* src1 is not an immediate when we use transform feedback */ - if (src1.file != BRW_IMMEDIATE_VALUE) { - brw_set_default_exec_size(p, BRW_EXECUTE_4); - brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1)); - } - - brw_pop_insn_state(p); -} - -static void -generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst) -{ - /* In gfx6, PrimitiveID is delivered in R0.1 of the payload */ - struct brw_reg src = brw_vec8_grf(0, 0); - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_MOV(p, get_element_ud(dst, 0), get_element_ud(src, 1)); - brw_pop_insn_state(p); -} - -static void -generate_tcs_get_instance_id(struct brw_codegen 
*p, struct brw_reg dst) -{ - const struct intel_device_info *devinfo = p->devinfo; - const bool ivb = devinfo->platform == INTEL_PLATFORM_IVB || - devinfo->platform == INTEL_PLATFORM_BYT; - - /* "Instance Count" comes as part of the payload in r0.2 bits 23:17. - * - * Since we operate in SIMD4x2 mode, we need run half as many threads - * as necessary. So we assign (2i + 1, 2i) as the thread counts. We - * shift right by one less to accomplish the multiplication by two. - */ - dst = retype(dst, BRW_REGISTER_TYPE_UD); - struct brw_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - - const int mask = ivb ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17); - const int shift = ivb ? 16 : 17; - - brw_AND(p, get_element_ud(dst, 0), get_element_ud(r0, 2), brw_imm_ud(mask)); - brw_SHR(p, get_element_ud(dst, 0), get_element_ud(dst, 0), - brw_imm_ud(shift - 1)); - brw_ADD(p, get_element_ud(dst, 4), get_element_ud(dst, 0), brw_imm_ud(1)); - - brw_pop_insn_state(p); -} - -static void -generate_tcs_urb_write(struct brw_codegen *p, - vec4_instruction *inst, - struct brw_reg urb_header) -{ - const struct intel_device_info *devinfo = p->devinfo; - - brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); - brw_set_dest(p, send, brw_null_reg()); - brw_set_src0(p, send, urb_header); - brw_set_desc(p, send, brw_message_desc(devinfo, inst->mlen, 0, true)); - - brw_inst_set_sfid(devinfo, send, BRW_SFID_URB); - brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_WRITE_OWORD); - brw_inst_set_urb_global_offset(devinfo, send, inst->offset); - if (inst->urb_write_flags & BRW_URB_WRITE_EOT) { - brw_inst_set_eot(devinfo, send, 1); - } else { - brw_inst_set_urb_per_slot_offset(devinfo, send, 1); - brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE); - } - - /* what happens to swizzles? 
*/ -} - - -static void -generate_tcs_input_urb_offsets(struct brw_codegen *p, - struct brw_reg dst, - struct brw_reg vertex, - struct brw_reg offset) -{ - /* Generates an URB read/write message header for HS/DS operation. - * Inputs are a vertex index, and a byte offset from the beginning of - * the vertex. */ - - /* If `vertex` is not an immediate, we clobber a0.0 */ - - assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE); - assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D); - - assert(dst.file == BRW_GENERAL_REGISTER_FILE); - - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, dst, brw_imm_ud(0)); - - /* m0.5 bits 8-15 are channel enables */ - brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00)); - - /* m0.0-0.1: URB handles */ - if (vertex.file == BRW_IMMEDIATE_VALUE) { - uint32_t vertex_index = vertex.ud; - struct brw_reg index_reg = brw_vec1_grf( - 1 + (vertex_index >> 3), vertex_index & 7); - - brw_MOV(p, vec2(get_element_ud(dst, 0)), - retype(index_reg, BRW_REGISTER_TYPE_UD)); - } else { - /* Use indirect addressing. ICP Handles are DWords (single channels - * of a register) and start at g1.0. - * - * In order to start our region at g1.0, we add 8 to the vertex index, - * effectively skipping over the 8 channels in g0.0. This gives us a - * DWord offset to the ICP Handle. - * - * Indirect addressing works in terms of bytes, so we then multiply - * the DWord offset by 4 (by shifting left by 2). 
- */ - struct brw_reg addr = brw_address_reg(0); - - /* bottom half: m0.0 = g[1.0 + vertex.0]UD */ - brw_ADD(p, addr, retype(get_element_ud(vertex, 0), BRW_REGISTER_TYPE_UW), - brw_imm_uw(0x8)); - brw_SHL(p, addr, addr, brw_imm_uw(2)); - brw_MOV(p, get_element_ud(dst, 0), deref_1ud(brw_indirect(0, 0), 0)); - - /* top half: m0.1 = g[1.0 + vertex.4]UD */ - brw_ADD(p, addr, retype(get_element_ud(vertex, 4), BRW_REGISTER_TYPE_UW), - brw_imm_uw(0x8)); - brw_SHL(p, addr, addr, brw_imm_uw(2)); - brw_MOV(p, get_element_ud(dst, 1), deref_1ud(brw_indirect(0, 0), 0)); - } - - /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ - if (offset.file != ARF) - brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); - - brw_pop_insn_state(p); -} - - -static void -generate_tcs_output_urb_offsets(struct brw_codegen *p, - struct brw_reg dst, - struct brw_reg write_mask, - struct brw_reg offset) -{ - /* Generates an URB read/write message header for HS/DS operation, for the patch URB entry. 
*/ - assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE); - - assert(write_mask.file == BRW_IMMEDIATE_VALUE); - assert(write_mask.type == BRW_REGISTER_TYPE_UD); - - brw_push_insn_state(p); - - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, dst, brw_imm_ud(0)); - - unsigned mask = write_mask.ud; - - /* m0.5 bits 15:12 and 11:8 are channel enables */ - brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud((mask << 8) | (mask << 12))); - - /* HS patch URB handle is delivered in r0.0 */ - struct brw_reg urb_handle = brw_vec1_grf(0, 0); - - /* m0.0-0.1: URB handles */ - brw_MOV(p, vec2(get_element_ud(dst, 0)), - retype(urb_handle, BRW_REGISTER_TYPE_UD)); - - /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */ - if (offset.file != ARF) - brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0)); - - brw_pop_insn_state(p); -} - -static void -generate_tes_create_input_read_header(struct brw_codegen *p, - struct brw_reg dst) -{ - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - - /* Initialize the register to 0 */ - brw_MOV(p, dst, brw_imm_ud(0)); - - /* Enable all the channels in m0.5 bits 15:8 */ - brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00)); - - /* Copy g1.3 (the patch URB handle) to m0.0 and m0.1. For safety, - * mask out irrelevant "Reserved" bits, as they're not marked MBZ. 
- */ - brw_AND(p, vec2(get_element_ud(dst, 0)), - retype(brw_vec1_grf(1, 3), BRW_REGISTER_TYPE_UD), - brw_imm_ud(0x1fff)); - brw_pop_insn_state(p); -} - -static void -generate_tes_add_indirect_urb_offset(struct brw_codegen *p, - struct brw_reg dst, - struct brw_reg header, - struct brw_reg offset) -{ - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - - brw_MOV(p, dst, header); - - /* Uniforms will have a stride <0;4,1>, and we need to convert to <0;1,0>. - * Other values get <4;1,0>. - */ - struct brw_reg restrided_offset; - if (offset.vstride == BRW_VERTICAL_STRIDE_0 && - offset.width == BRW_WIDTH_4 && - offset.hstride == BRW_HORIZONTAL_STRIDE_1) { - restrided_offset = stride(offset, 0, 1, 0); - } else { - restrided_offset = stride(offset, 4, 1, 0); - } - - /* m0.3-0.4: 128-bit-granular offsets into the URB from the handles */ - brw_MOV(p, vec2(get_element_ud(dst, 3)), restrided_offset); - - brw_pop_insn_state(p); -} - -static void -generate_vec4_urb_read(struct brw_codegen *p, - vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg header) -{ - const struct intel_device_info *devinfo = p->devinfo; - - assert(header.file == BRW_GENERAL_REGISTER_FILE); - assert(header.type == BRW_REGISTER_TYPE_UD); - - brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); - brw_set_dest(p, send, dst); - brw_set_src0(p, send, header); - - brw_set_desc(p, send, brw_message_desc(devinfo, 1, 1, true)); - - brw_inst_set_sfid(devinfo, send, BRW_SFID_URB); - brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD); - brw_inst_set_urb_swizzle_control(devinfo, send, BRW_URB_SWIZZLE_INTERLEAVE); - brw_inst_set_urb_per_slot_offset(devinfo, send, 1); - - brw_inst_set_urb_global_offset(devinfo, send, inst->offset); -} - -static void -generate_tcs_release_input(struct brw_codegen *p, - struct brw_reg header, - struct brw_reg vertex, - struct brw_reg is_unpaired) -{ - const struct intel_device_info 
*devinfo = p->devinfo; - - assert(vertex.file == BRW_IMMEDIATE_VALUE); - assert(vertex.type == BRW_REGISTER_TYPE_UD); - - /* m0.0-0.1: URB handles */ - struct brw_reg urb_handles = - retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7), - BRW_REGISTER_TYPE_UD); - - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, header, brw_imm_ud(0)); - brw_MOV(p, vec2(get_element_ud(header, 0)), urb_handles); - brw_pop_insn_state(p); - - brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); - brw_set_dest(p, send, brw_null_reg()); - brw_set_src0(p, send, header); - brw_set_desc(p, send, brw_message_desc(devinfo, 1, 0, true)); - - brw_inst_set_sfid(devinfo, send, BRW_SFID_URB); - brw_inst_set_urb_opcode(devinfo, send, BRW_URB_OPCODE_READ_OWORD); - brw_inst_set_urb_complete(devinfo, send, 1); - brw_inst_set_urb_swizzle_control(devinfo, send, is_unpaired.ud ? - BRW_URB_SWIZZLE_NONE : - BRW_URB_SWIZZLE_INTERLEAVE); -} - -static void -generate_tcs_thread_end(struct brw_codegen *p, vec4_instruction *inst) -{ - struct brw_reg header = brw_message_reg(inst->base_mrf); - - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_MOV(p, header, brw_imm_ud(0)); - brw_MOV(p, get_element_ud(header, 5), brw_imm_ud(WRITEMASK_X << 8)); - brw_MOV(p, get_element_ud(header, 0), - retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)); - brw_MOV(p, brw_message_reg(inst->base_mrf + 1), brw_imm_ud(0u)); - brw_pop_insn_state(p); - - brw_urb_WRITE(p, - brw_null_reg(), /* dest */ - inst->base_mrf, /* starting mrf reg nr */ - header, - BRW_URB_WRITE_EOT | BRW_URB_WRITE_OWORD | - BRW_URB_WRITE_USE_CHANNEL_MASKS, - inst->mlen, - 0, /* response len */ - 0, /* urb destination offset */ - 0); -} - -static void -generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst) -{ - brw_push_insn_state(p); - brw_set_default_access_mode(p, 
BRW_ALIGN_1); - brw_MOV(p, dst, retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_D)); - brw_pop_insn_state(p); -} - -static void -generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst) -{ - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD)); - brw_pop_insn_state(p); -} - -static void -generate_tcs_create_barrier_header(struct brw_codegen *p, - struct brw_vue_prog_data *prog_data, - struct brw_reg dst) -{ - const struct intel_device_info *devinfo = p->devinfo; - const bool ivb = devinfo->platform == INTEL_PLATFORM_IVB || - devinfo->platform == INTEL_PLATFORM_BYT; - struct brw_reg m0_2 = get_element_ud(dst, 2); - unsigned instances = ((struct brw_tcs_prog_data *) prog_data)->instances; - - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - - /* Zero the message header */ - brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)); - - /* Copy "Barrier ID" from r0.2, bits 16:13 (Gfx7.5+) or 15:12 (Gfx7) */ - brw_AND(p, m0_2, - retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD), - brw_imm_ud(ivb ? INTEL_MASK(15, 12) : INTEL_MASK(16, 13))); - - /* Shift it up to bits 27:24. */ - brw_SHL(p, m0_2, get_element_ud(dst, 2), brw_imm_ud(ivb ? 12 : 11)); - - /* Set the Barrier Count and the enable bit */ - brw_OR(p, m0_2, m0_2, brw_imm_ud(instances << 9 | (1 << 15))); - - brw_pop_insn_state(p); -} - -static void -generate_oword_dual_block_offsets(struct brw_codegen *p, - struct brw_reg m1, - struct brw_reg index) -{ - int second_vertex_offset; - - if (p->devinfo->ver >= 6) - second_vertex_offset = 1; - else - second_vertex_offset = 16; - - m1 = retype(m1, BRW_REGISTER_TYPE_D); - - /* Set up M1 (message payload). Only the block offsets in M1.0 and - * M1.4 are used, and the rest are ignored. 
- */ - struct brw_reg m1_0 = suboffset(vec1(m1), 0); - struct brw_reg m1_4 = suboffset(vec1(m1), 4); - struct brw_reg index_0 = suboffset(vec1(index), 0); - struct brw_reg index_4 = suboffset(vec1(index), 4); - - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_set_default_access_mode(p, BRW_ALIGN_1); - - brw_MOV(p, m1_0, index_0); - - if (index.file == BRW_IMMEDIATE_VALUE) { - index_4.ud += second_vertex_offset; - brw_MOV(p, m1_4, index_4); - } else { - brw_ADD(p, m1_4, index_4, brw_imm_d(second_vertex_offset)); - } - - brw_pop_insn_state(p); -} - -static void -generate_unpack_flags(struct brw_codegen *p, - struct brw_reg dst) -{ - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_set_default_access_mode(p, BRW_ALIGN_1); - - struct brw_reg flags = brw_flag_reg(0, 0); - struct brw_reg dst_0 = suboffset(vec1(dst), 0); - struct brw_reg dst_4 = suboffset(vec1(dst), 4); - - brw_AND(p, dst_0, flags, brw_imm_ud(0x0f)); - brw_AND(p, dst_4, flags, brw_imm_ud(0xf0)); - brw_SHR(p, dst_4, dst_4, brw_imm_ud(4)); - - brw_pop_insn_state(p); -} - -static void -generate_scratch_read(struct brw_codegen *p, - vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg index) -{ - const struct intel_device_info *devinfo = p->devinfo; - struct brw_reg header = brw_vec8_grf(0, 0); - - gfx6_resolve_implied_move(p, &header, inst->base_mrf); - - generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1), - index); - - uint32_t msg_type; - - if (devinfo->ver >= 6) - msg_type = GFX6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; - else if (devinfo->verx10 >= 45) - msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; - else - msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; - - const unsigned target_cache = - devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE : - devinfo->ver >= 6 ? 
GFX6_SFID_DATAPORT_RENDER_CACHE : - BRW_SFID_DATAPORT_READ; - - /* Each of the 8 channel enables is considered for whether each - * dword is written. - */ - brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); - brw_inst_set_sfid(devinfo, send, target_cache); - brw_set_dest(p, send, dst); - brw_set_src0(p, send, header); - if (devinfo->ver < 6) - brw_inst_set_cond_modifier(devinfo, send, inst->base_mrf); - brw_set_desc(p, send, - brw_message_desc(devinfo, 2, 1, true) | - brw_dp_read_desc(devinfo, - brw_scratch_surface_idx(p), - BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, - msg_type, BRW_DATAPORT_READ_TARGET_RENDER_CACHE)); -} - -static void -generate_scratch_write(struct brw_codegen *p, - vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src, - struct brw_reg index) -{ - const struct intel_device_info *devinfo = p->devinfo; - const unsigned target_cache = - (devinfo->ver >= 7 ? GFX7_SFID_DATAPORT_DATA_CACHE : - devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_RENDER_CACHE : - BRW_SFID_DATAPORT_WRITE); - struct brw_reg header = brw_vec8_grf(0, 0); - bool write_commit; - - /* If the instruction is predicated, we'll predicate the send, not - * the header setup. 
- */ - brw_push_insn_state(p); - brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); - brw_set_default_flag_reg(p, 0, 0); - - gfx6_resolve_implied_move(p, &header, inst->base_mrf); - - generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1), - index); - - brw_MOV(p, - retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D), - retype(src, BRW_REGISTER_TYPE_D)); - - brw_pop_insn_state(p); - - uint32_t msg_type; - - if (devinfo->ver >= 7) - msg_type = GFX7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE; - else if (devinfo->ver == 6) - msg_type = GFX6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE; - else - msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE; - - brw_set_default_predicate_control(p, inst->predicate); - - /* Pre-gfx6, we have to specify write commits to ensure ordering - * between reads and writes within a thread. Afterwards, that's - * guaranteed and write commits only matter for inter-thread - * synchronization. - */ - if (devinfo->ver >= 6) { - write_commit = false; - } else { - /* The visitor set up our destination register to be g0. This - * means that when the next read comes along, we will end up - * reading from g0 and causing a block on the write commit. For - * write-after-read, we are relying on the value of the previous - * read being used (and thus blocking on completion) before our - * write is executed. This means we have to be careful in - * instruction scheduling to not violate this assumption. - */ - write_commit = true; - } - - /* Each of the 8 channel enables is considered for whether each - * dword is written. 
- */ - brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); - brw_inst_set_sfid(p->devinfo, send, target_cache); - brw_set_dest(p, send, dst); - brw_set_src0(p, send, header); - if (devinfo->ver < 6) - brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf); - brw_set_desc(p, send, - brw_message_desc(devinfo, 3, write_commit, true) | - brw_dp_write_desc(devinfo, - brw_scratch_surface_idx(p), - BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, - msg_type, - write_commit)); -} - -static void -generate_pull_constant_load(struct brw_codegen *p, - vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg index, - struct brw_reg offset) -{ - const struct intel_device_info *devinfo = p->devinfo; - const unsigned target_cache = - (devinfo->ver >= 6 ? GFX6_SFID_DATAPORT_SAMPLER_CACHE : - BRW_SFID_DATAPORT_READ); - assert(index.file == BRW_IMMEDIATE_VALUE && - index.type == BRW_REGISTER_TYPE_UD); - uint32_t surf_index = index.ud; - - struct brw_reg header = brw_vec8_grf(0, 0); - - gfx6_resolve_implied_move(p, &header, inst->base_mrf); - - if (devinfo->ver >= 6) { - if (offset.file == BRW_IMMEDIATE_VALUE) { - brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), - BRW_REGISTER_TYPE_D), - brw_imm_d(offset.ud >> 4)); - } else { - brw_SHR(p, retype(brw_message_reg(inst->base_mrf + 1), - BRW_REGISTER_TYPE_D), - offset, brw_imm_d(4)); - } - } else { - brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), - BRW_REGISTER_TYPE_D), - offset); - } - - uint32_t msg_type; - - if (devinfo->ver >= 6) - msg_type = GFX6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; - else if (devinfo->verx10 >= 45) - msg_type = G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; - else - msg_type = BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ; - - /* Each of the 8 channel enables is considered for whether each - * dword is written. 
- */ - brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND); - brw_inst_set_sfid(devinfo, send, target_cache); - brw_set_dest(p, send, dst); - brw_set_src0(p, send, header); - if (devinfo->ver < 6) - brw_inst_set_cond_modifier(p->devinfo, send, inst->base_mrf); - brw_set_desc(p, send, - brw_message_desc(devinfo, 2, 1, true) | - brw_dp_read_desc(devinfo, surf_index, - BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD, - msg_type, - BRW_DATAPORT_READ_TARGET_DATA_CACHE)); -} - -static void -generate_get_buffer_size(struct brw_codegen *p, - vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg src, - struct brw_reg surf_index) -{ - assert(p->devinfo->ver >= 7); - assert(surf_index.type == BRW_REGISTER_TYPE_UD && - surf_index.file == BRW_IMMEDIATE_VALUE); - - brw_SAMPLE(p, - dst, - inst->base_mrf, - src, - surf_index.ud, - 0, - GFX5_SAMPLER_MESSAGE_SAMPLE_RESINFO, - 1, /* response length */ - inst->mlen, - inst->header_size > 0, - BRW_SAMPLER_SIMD_MODE_SIMD4X2, - BRW_SAMPLER_RETURN_FORMAT_SINT32); -} - -static void -generate_pull_constant_load_gfx7(struct brw_codegen *p, - vec4_instruction *inst, - struct brw_reg dst, - struct brw_reg surf_index, - struct brw_reg offset) -{ - const struct intel_device_info *devinfo = p->devinfo; - assert(surf_index.type == BRW_REGISTER_TYPE_UD); - - if (surf_index.file == BRW_IMMEDIATE_VALUE) { - - brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND); - brw_inst_set_sfid(devinfo, insn, BRW_SFID_SAMPLER); - brw_set_dest(p, insn, dst); - brw_set_src0(p, insn, offset); - brw_set_desc(p, insn, - brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) | - brw_sampler_desc(devinfo, surf_index.ud, - 0, /* LD message ignores sampler unit */ - GFX5_SAMPLER_MESSAGE_SAMPLE_LD, - BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0)); - } else { - - struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD)); - - brw_push_insn_state(p); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - brw_set_default_access_mode(p, BRW_ALIGN_1); - - /* a0.0 = 
surf_index & 0xff */ - brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND); - brw_inst_set_exec_size(devinfo, insn_and, BRW_EXECUTE_1); - brw_set_dest(p, insn_and, addr); - brw_set_src0(p, insn_and, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD))); - brw_set_src1(p, insn_and, brw_imm_ud(0x0ff)); - - brw_pop_insn_state(p); - - /* dst = send(offset, a0.0 | ) */ - brw_send_indirect_message( - p, BRW_SFID_SAMPLER, dst, offset, addr, - brw_message_desc(devinfo, inst->mlen, 1, inst->header_size) | - brw_sampler_desc(devinfo, - 0 /* surface */, - 0 /* sampler */, - GFX5_SAMPLER_MESSAGE_SAMPLE_LD, - BRW_SAMPLER_SIMD_MODE_SIMD4X2, - 0), - false /* EOT */); - } -} - -static void -generate_mov_indirect(struct brw_codegen *p, - vec4_instruction *, - struct brw_reg dst, struct brw_reg reg, - struct brw_reg indirect) -{ - assert(indirect.type == BRW_REGISTER_TYPE_UD); - assert(p->devinfo->ver >= 6); - - unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2); - - /* This instruction acts in align1 mode */ - assert(dst.writemask == WRITEMASK_XYZW); - - if (indirect.file == BRW_IMMEDIATE_VALUE) { - imm_byte_offset += indirect.ud; - - reg.nr = imm_byte_offset / REG_SIZE; - reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2; - unsigned shift = (imm_byte_offset / 4) % 4; - reg.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift); - - brw_MOV(p, dst, reg); - } else { - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - - struct brw_reg addr = vec8(brw_address_reg(0)); - - /* We need to move the indirect value into the address register. In - * order to make things make some sense, we want to respect at least the - * X component of the swizzle. In order to do that, we need to convert - * the subnr (probably 0) to an align1 subnr and add in the swizzle. 
- */ - assert(brw_is_single_value_swizzle(indirect.swizzle)); - indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0)); - - /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of - * the indirect and splat it out to all four channels of the given half - * of a0. - */ - indirect.subnr *= 2; - indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0); - brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset)); - - /* Now we need to incorporate the swizzle from the source register */ - if (reg.swizzle != BRW_SWIZZLE_XXXX) { - uint32_t uv_swiz = BRW_GET_SWZ(reg.swizzle, 0) << 2 | - BRW_GET_SWZ(reg.swizzle, 1) << 6 | - BRW_GET_SWZ(reg.swizzle, 2) << 10 | - BRW_GET_SWZ(reg.swizzle, 3) << 14; - uv_swiz |= uv_swiz << 16; - - brw_ADD(p, addr, addr, brw_imm_uv(uv_swiz)); - } - - brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), reg.type)); - - brw_pop_insn_state(p); - } -} - -static void -generate_zero_oob_push_regs(struct brw_codegen *p, - struct brw_stage_prog_data *prog_data, - struct brw_reg scratch, - struct brw_reg bit_mask_in) -{ - const uint64_t want_zero = prog_data->zero_push_reg; - assert(want_zero); - - assert(bit_mask_in.file == BRW_GENERAL_REGISTER_FILE); - assert(BRW_GET_SWZ(bit_mask_in.swizzle, 1) == - BRW_GET_SWZ(bit_mask_in.swizzle, 0) + 1); - bit_mask_in.subnr += BRW_GET_SWZ(bit_mask_in.swizzle, 0) * 4; - bit_mask_in.type = BRW_REGISTER_TYPE_W; - - /* Scratch should be 3 registers in the GRF */ - assert(scratch.file == BRW_GENERAL_REGISTER_FILE); - scratch = vec8(scratch); - struct brw_reg mask_w16 = retype(scratch, BRW_REGISTER_TYPE_W); - struct brw_reg mask_d16 = retype(byte_offset(scratch, REG_SIZE), - BRW_REGISTER_TYPE_D); - - brw_push_insn_state(p); - brw_set_default_access_mode(p, BRW_ALIGN_1); - brw_set_default_mask_control(p, BRW_MASK_DISABLE); - - for (unsigned i = 0; i < 64; i++) { - if (i % 16 == 0 && (want_zero & BITFIELD64_RANGE(i, 16))) { - brw_set_default_exec_size(p, BRW_EXECUTE_8); - brw_SHL(p, 
suboffset(mask_w16, 8), - vec1(byte_offset(bit_mask_in, i / 8)), - brw_imm_v(0x01234567)); - brw_SHL(p, mask_w16, suboffset(mask_w16, 8), brw_imm_w(8)); - - brw_set_default_exec_size(p, BRW_EXECUTE_16); - brw_ASR(p, mask_d16, mask_w16, brw_imm_w(15)); - } - - if (want_zero & BITFIELD64_BIT(i)) { - unsigned push_start = prog_data->dispatch_grf_start_reg; - struct brw_reg push_reg = - retype(brw_vec8_grf(push_start + i, 0), BRW_REGISTER_TYPE_D); - - brw_set_default_exec_size(p, BRW_EXECUTE_8); - brw_AND(p, push_reg, push_reg, vec1(suboffset(mask_d16, i))); - } - } - - brw_pop_insn_state(p); -} - -static void -generate_code(struct brw_codegen *p, - const struct brw_compiler *compiler, - const struct brw_compile_params *params, - const nir_shader *nir, - struct brw_vue_prog_data *prog_data, - const struct cfg_t *cfg, - const performance &perf, - struct brw_compile_stats *stats, - bool debug_enabled) -{ - const struct intel_device_info *devinfo = p->devinfo; - const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->info.stage); - struct disasm_info *disasm_info = disasm_initialize(p->isa, cfg); - - /* `send_count` explicitly does not include spills or fills, as we'd - * like to use it as a metric for intentional memory access or other - * shared function use. Otherwise, subtle changes to scheduling or - * register allocation could cause it to fluctuate wildly - and that - * effect is already counted in spill/fill counts. 
- */ - int spill_count = 0, fill_count = 0; - int loop_count = 0, send_count = 0; - - foreach_block_and_inst (block, vec4_instruction, inst, cfg) { - struct brw_reg src[3], dst; - - if (unlikely(debug_enabled)) - disasm_annotate(disasm_info, inst, p->next_insn_offset); - - for (unsigned int i = 0; i < 3; i++) { - src[i] = inst->src[i].as_brw_reg(); - } - dst = inst->dst.as_brw_reg(); - - brw_set_default_predicate_control(p, inst->predicate); - brw_set_default_predicate_inverse(p, inst->predicate_inverse); - brw_set_default_flag_reg(p, inst->flag_subreg / 2, inst->flag_subreg % 2); - brw_set_default_saturate(p, inst->saturate); - brw_set_default_mask_control(p, inst->force_writemask_all); - brw_set_default_acc_write_control(p, inst->writes_accumulator); - - assert(inst->group % inst->exec_size == 0); - assert(inst->group % 4 == 0); - - /* There are some instructions where the destination is 64-bit - * but we retype it to a smaller type. In that case, we cannot - * double the exec_size. - */ - const bool is_df = (get_exec_type_size(inst) == 8 || - inst->dst.type == BRW_REGISTER_TYPE_DF) && - inst->opcode != VEC4_OPCODE_PICK_LOW_32BIT && - inst->opcode != VEC4_OPCODE_PICK_HIGH_32BIT && - inst->opcode != VEC4_OPCODE_SET_LOW_32BIT && - inst->opcode != VEC4_OPCODE_SET_HIGH_32BIT; - - unsigned exec_size = inst->exec_size; - if (devinfo->verx10 == 70 && is_df) - exec_size *= 2; - - brw_set_default_exec_size(p, cvt(exec_size) - 1); - - if (!inst->force_writemask_all) - brw_set_default_group(p, inst->group); - - assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->ver)); - assert(inst->mlen <= BRW_MAX_MSG_LENGTH); - - unsigned pre_emit_nr_insn = p->nr_insn; - - switch (inst->opcode) { - case VEC4_OPCODE_UNPACK_UNIFORM: - case BRW_OPCODE_MOV: - case VEC4_OPCODE_MOV_FOR_SCRATCH: - brw_MOV(p, dst, src[0]); - break; - case BRW_OPCODE_ADD: - brw_ADD(p, dst, src[0], src[1]); - break; - case BRW_OPCODE_MUL: - brw_MUL(p, dst, src[0], src[1]); - break; - case BRW_OPCODE_MACH: - 
brw_MACH(p, dst, src[0], src[1]); - break; - - case BRW_OPCODE_MAD: - assert(devinfo->ver >= 6); - brw_MAD(p, dst, src[0], src[1], src[2]); - break; - - case BRW_OPCODE_FRC: - brw_FRC(p, dst, src[0]); - break; - case BRW_OPCODE_RNDD: - brw_RNDD(p, dst, src[0]); - break; - case BRW_OPCODE_RNDE: - brw_RNDE(p, dst, src[0]); - break; - case BRW_OPCODE_RNDZ: - brw_RNDZ(p, dst, src[0]); - break; - - case BRW_OPCODE_AND: - brw_AND(p, dst, src[0], src[1]); - break; - case BRW_OPCODE_OR: - brw_OR(p, dst, src[0], src[1]); - break; - case BRW_OPCODE_XOR: - brw_XOR(p, dst, src[0], src[1]); - break; - case BRW_OPCODE_NOT: - brw_NOT(p, dst, src[0]); - break; - case BRW_OPCODE_ASR: - brw_ASR(p, dst, src[0], src[1]); - break; - case BRW_OPCODE_SHR: - brw_SHR(p, dst, src[0], src[1]); - break; - case BRW_OPCODE_SHL: - brw_SHL(p, dst, src[0], src[1]); - break; - - case BRW_OPCODE_CMP: - brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]); - break; - case BRW_OPCODE_CMPN: - brw_CMPN(p, dst, inst->conditional_mod, src[0], src[1]); - break; - case BRW_OPCODE_SEL: - brw_SEL(p, dst, src[0], src[1]); - break; - - case BRW_OPCODE_DPH: - brw_DPH(p, dst, src[0], src[1]); - break; - - case BRW_OPCODE_DP4: - brw_DP4(p, dst, src[0], src[1]); - break; - - case BRW_OPCODE_DP3: - brw_DP3(p, dst, src[0], src[1]); - break; - - case BRW_OPCODE_DP2: - brw_DP2(p, dst, src[0], src[1]); - break; - - case BRW_OPCODE_F32TO16: - assert(devinfo->ver >= 7); - brw_F32TO16(p, dst, src[0]); - break; - - case BRW_OPCODE_F16TO32: - assert(devinfo->ver >= 7); - brw_F16TO32(p, dst, src[0]); - break; - - case BRW_OPCODE_LRP: - assert(devinfo->ver >= 6); - brw_LRP(p, dst, src[0], src[1], src[2]); - break; - - case BRW_OPCODE_BFREV: - assert(devinfo->ver >= 7); - brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD), - retype(src[0], BRW_REGISTER_TYPE_UD)); - break; - case BRW_OPCODE_FBH: - assert(devinfo->ver >= 7); - brw_FBH(p, retype(dst, src[0].type), src[0]); - break; - case BRW_OPCODE_FBL: - assert(devinfo->ver >= 7); 
- brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), - retype(src[0], BRW_REGISTER_TYPE_UD)); - break; - case BRW_OPCODE_LZD: - brw_LZD(p, dst, src[0]); - break; - case BRW_OPCODE_CBIT: - assert(devinfo->ver >= 7); - brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), - retype(src[0], BRW_REGISTER_TYPE_UD)); - break; - case BRW_OPCODE_ADDC: - assert(devinfo->ver >= 7); - brw_ADDC(p, dst, src[0], src[1]); - break; - case BRW_OPCODE_SUBB: - assert(devinfo->ver >= 7); - brw_SUBB(p, dst, src[0], src[1]); - break; - case BRW_OPCODE_MAC: - brw_MAC(p, dst, src[0], src[1]); - break; - - case BRW_OPCODE_BFE: - assert(devinfo->ver >= 7); - brw_BFE(p, dst, src[0], src[1], src[2]); - break; - - case BRW_OPCODE_BFI1: - assert(devinfo->ver >= 7); - brw_BFI1(p, dst, src[0], src[1]); - break; - case BRW_OPCODE_BFI2: - assert(devinfo->ver >= 7); - brw_BFI2(p, dst, src[0], src[1], src[2]); - break; - - case BRW_OPCODE_IF: - if (!inst->src[0].is_null()) { - /* The instruction has an embedded compare (only allowed on gfx6) */ - assert(devinfo->ver == 6); - gfx6_IF(p, inst->conditional_mod, src[0], src[1]); - } else { - brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8); - brw_inst_set_pred_control(p->devinfo, if_inst, inst->predicate); - } - break; - - case BRW_OPCODE_ELSE: - brw_ELSE(p); - break; - case BRW_OPCODE_ENDIF: - brw_ENDIF(p); - break; - - case BRW_OPCODE_DO: - brw_DO(p, BRW_EXECUTE_8); - break; - - case BRW_OPCODE_BREAK: - brw_BREAK(p); - brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); - break; - case BRW_OPCODE_CONTINUE: - brw_CONT(p); - brw_set_default_predicate_control(p, BRW_PREDICATE_NONE); - break; - - case BRW_OPCODE_WHILE: - brw_WHILE(p); - loop_count++; - break; - - case SHADER_OPCODE_RCP: - case SHADER_OPCODE_RSQ: - case SHADER_OPCODE_SQRT: - case SHADER_OPCODE_EXP2: - case SHADER_OPCODE_LOG2: - case SHADER_OPCODE_SIN: - case SHADER_OPCODE_COS: - assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); - if (devinfo->ver >= 7) { - gfx6_math(p, dst, 
brw_math_function(inst->opcode), src[0], - brw_null_reg()); - } else if (devinfo->ver == 6) { - generate_math_gfx6(p, inst, dst, src[0], brw_null_reg()); - } else { - generate_math1_gfx4(p, inst, dst, src[0]); - send_count++; - } - break; - - case SHADER_OPCODE_POW: - case SHADER_OPCODE_INT_QUOTIENT: - case SHADER_OPCODE_INT_REMAINDER: - assert(inst->conditional_mod == BRW_CONDITIONAL_NONE); - if (devinfo->ver >= 7) { - gfx6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]); - } else if (devinfo->ver == 6) { - generate_math_gfx6(p, inst, dst, src[0], src[1]); - } else { - generate_math2_gfx4(p, inst, dst, src[0], src[1]); - send_count++; - } - break; - - case SHADER_OPCODE_TEX: - case SHADER_OPCODE_TXD: - case SHADER_OPCODE_TXF: - case SHADER_OPCODE_TXF_CMS: - case SHADER_OPCODE_TXF_CMS_W: - case SHADER_OPCODE_TXF_MCS: - case SHADER_OPCODE_TXL: - case SHADER_OPCODE_TXS: - case SHADER_OPCODE_TG4: - case SHADER_OPCODE_TG4_OFFSET: - case SHADER_OPCODE_SAMPLEINFO: - generate_tex(p, prog_data, nir->info.stage, - inst, dst, src[0], src[1], src[2]); - send_count++; - break; - - case SHADER_OPCODE_GET_BUFFER_SIZE: - generate_get_buffer_size(p, inst, dst, src[0], src[1]); - send_count++; - break; - - case VEC4_VS_OPCODE_URB_WRITE: - generate_vs_urb_write(p, inst); - send_count++; - break; - - case SHADER_OPCODE_GFX4_SCRATCH_READ: - generate_scratch_read(p, inst, dst, src[0]); - fill_count++; - break; - - case SHADER_OPCODE_GFX4_SCRATCH_WRITE: - generate_scratch_write(p, inst, dst, src[0], src[1]); - spill_count++; - break; - - case VS_OPCODE_PULL_CONSTANT_LOAD: - generate_pull_constant_load(p, inst, dst, src[0], src[1]); - send_count++; - break; - - case VS_OPCODE_PULL_CONSTANT_LOAD_GFX7: - generate_pull_constant_load_gfx7(p, inst, dst, src[0], src[1]); - send_count++; - break; - - case VEC4_GS_OPCODE_URB_WRITE: - generate_gs_urb_write(p, inst); - send_count++; - break; - - case VEC4_GS_OPCODE_URB_WRITE_ALLOCATE: - generate_gs_urb_write_allocate(p, inst); - 
send_count++; - break; - - case GS_OPCODE_SVB_WRITE: - generate_gs_svb_write(p, inst, dst, src[0], src[1]); - send_count++; - break; - - case GS_OPCODE_SVB_SET_DST_INDEX: - generate_gs_svb_set_destination_index(p, inst, dst, src[0]); - break; - - case GS_OPCODE_THREAD_END: - generate_gs_thread_end(p, inst); - send_count++; - break; - - case GS_OPCODE_SET_WRITE_OFFSET: - generate_gs_set_write_offset(p, dst, src[0], src[1]); - break; - - case GS_OPCODE_SET_VERTEX_COUNT: - generate_gs_set_vertex_count(p, dst, src[0]); - break; - - case GS_OPCODE_FF_SYNC: - generate_gs_ff_sync(p, inst, dst, src[0], src[1]); - send_count++; - break; - - case GS_OPCODE_FF_SYNC_SET_PRIMITIVES: - generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]); - break; - - case GS_OPCODE_SET_PRIMITIVE_ID: - generate_gs_set_primitive_id(p, dst); - break; - - case GS_OPCODE_SET_DWORD_2: - generate_gs_set_dword_2(p, dst, src[0]); - break; - - case GS_OPCODE_PREPARE_CHANNEL_MASKS: - generate_gs_prepare_channel_masks(p, dst); - break; - - case GS_OPCODE_SET_CHANNEL_MASKS: - generate_gs_set_channel_masks(p, dst, src[0]); - break; - - case GS_OPCODE_GET_INSTANCE_ID: - generate_gs_get_instance_id(p, dst); - break; - - case VEC4_OPCODE_UNTYPED_ATOMIC: - assert(src[2].file == BRW_IMMEDIATE_VALUE); - brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen, - !inst->dst.is_null(), inst->header_size); - send_count++; - break; - - case VEC4_OPCODE_UNTYPED_SURFACE_READ: - assert(!inst->header_size); - assert(src[2].file == BRW_IMMEDIATE_VALUE); - brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen, - src[2].ud); - send_count++; - break; - - case VEC4_OPCODE_UNTYPED_SURFACE_WRITE: - assert(src[2].file == BRW_IMMEDIATE_VALUE); - brw_untyped_surface_write(p, src[0], src[1], inst->mlen, - src[2].ud, inst->header_size); - send_count++; - break; - - case SHADER_OPCODE_MEMORY_FENCE: - brw_memory_fence(p, dst, src[0], BRW_OPCODE_SEND, - brw_message_target(inst->sfid), - inst->desc, - /* 
commit_enable */ false, - /* bti */ 0); - send_count++; - break; - - case SHADER_OPCODE_FIND_LIVE_CHANNEL: - brw_find_live_channel(p, dst, false); - break; - - case SHADER_OPCODE_BROADCAST: - assert(inst->force_writemask_all); - brw_broadcast(p, dst, src[0], src[1]); - break; - - case VS_OPCODE_UNPACK_FLAGS_SIMD4X2: - generate_unpack_flags(p, dst); - break; - - case VEC4_OPCODE_MOV_BYTES: { - /* Moves the low byte from each channel, using an Align1 access mode - * and a <4,1,0> source region. - */ - assert(src[0].type == BRW_REGISTER_TYPE_UB || - src[0].type == BRW_REGISTER_TYPE_B); - - brw_set_default_access_mode(p, BRW_ALIGN_1); - src[0].vstride = BRW_VERTICAL_STRIDE_4; - src[0].width = BRW_WIDTH_1; - src[0].hstride = BRW_HORIZONTAL_STRIDE_0; - brw_MOV(p, dst, src[0]); - brw_set_default_access_mode(p, BRW_ALIGN_16); - break; - } - - case VEC4_OPCODE_DOUBLE_TO_F32: - case VEC4_OPCODE_DOUBLE_TO_D32: - case VEC4_OPCODE_DOUBLE_TO_U32: { - assert(type_sz(src[0].type) == 8); - assert(type_sz(dst.type) == 8); - - brw_reg_type dst_type; - - switch (inst->opcode) { - case VEC4_OPCODE_DOUBLE_TO_F32: - dst_type = BRW_REGISTER_TYPE_F; - break; - case VEC4_OPCODE_DOUBLE_TO_D32: - dst_type = BRW_REGISTER_TYPE_D; - break; - case VEC4_OPCODE_DOUBLE_TO_U32: - dst_type = BRW_REGISTER_TYPE_UD; - break; - default: - unreachable("Not supported conversion"); - } - dst = retype(dst, dst_type); - - brw_set_default_access_mode(p, BRW_ALIGN_1); - - /* When converting from DF->F, we set destination's stride as 2 as an - * alignment requirement. But in IVB/BYT, each DF implicitly writes - * two floats, being the first one the converted value. So we don't - * need to explicitly set stride 2, but 1. 
- */ - struct brw_reg spread_dst; - if (devinfo->verx10 == 70) - spread_dst = stride(dst, 8, 4, 1); - else - spread_dst = stride(dst, 8, 4, 2); - - brw_MOV(p, spread_dst, src[0]); - - brw_set_default_access_mode(p, BRW_ALIGN_16); - break; - } - - case VEC4_OPCODE_TO_DOUBLE: { - assert(type_sz(src[0].type) == 4); - assert(type_sz(dst.type) == 8); - - brw_set_default_access_mode(p, BRW_ALIGN_1); - - brw_MOV(p, dst, src[0]); - - brw_set_default_access_mode(p, BRW_ALIGN_16); - break; - } - - case VEC4_OPCODE_PICK_LOW_32BIT: - case VEC4_OPCODE_PICK_HIGH_32BIT: { - /* Stores the low/high 32-bit of each 64-bit element in src[0] into - * dst using ALIGN1 mode and a <8,4,2>:UD region on the source. - */ - assert(type_sz(src[0].type) == 8); - assert(type_sz(dst.type) == 4); - - brw_set_default_access_mode(p, BRW_ALIGN_1); - - dst = retype(dst, BRW_REGISTER_TYPE_UD); - dst.hstride = BRW_HORIZONTAL_STRIDE_1; - - src[0] = retype(src[0], BRW_REGISTER_TYPE_UD); - if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT) - src[0] = suboffset(src[0], 1); - src[0] = spread(src[0], 2); - brw_MOV(p, dst, src[0]); - - brw_set_default_access_mode(p, BRW_ALIGN_16); - break; - } - - case VEC4_OPCODE_SET_LOW_32BIT: - case VEC4_OPCODE_SET_HIGH_32BIT: { - /* Reads consecutive 32-bit elements from src[0] and writes - * them to the low/high 32-bit of each 64-bit element in dst. 
- */ - assert(type_sz(src[0].type) == 4); - assert(type_sz(dst.type) == 8); - - brw_set_default_access_mode(p, BRW_ALIGN_1); - - dst = retype(dst, BRW_REGISTER_TYPE_UD); - if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT) - dst = suboffset(dst, 1); - dst.hstride = BRW_HORIZONTAL_STRIDE_2; - - src[0] = retype(src[0], BRW_REGISTER_TYPE_UD); - brw_MOV(p, dst, src[0]); - - brw_set_default_access_mode(p, BRW_ALIGN_16); - break; - } - - case VEC4_OPCODE_PACK_BYTES: { - /* Is effectively: - * - * mov(8) dst<16,4,1>:UB src<4,1,0>:UB - * - * but destinations' only regioning is horizontal stride, so instead we - * have to use two instructions: - * - * mov(4) dst<1>:UB src<4,1,0>:UB - * mov(4) dst.16<1>:UB src.16<4,1,0>:UB - * - * where they pack the four bytes from the low and high four DW. - */ - assert(util_is_power_of_two_nonzero(dst.writemask)); - unsigned offset = __builtin_ctz(dst.writemask); - - dst.type = BRW_REGISTER_TYPE_UB; - - brw_set_default_access_mode(p, BRW_ALIGN_1); - - src[0].type = BRW_REGISTER_TYPE_UB; - src[0].vstride = BRW_VERTICAL_STRIDE_4; - src[0].width = BRW_WIDTH_1; - src[0].hstride = BRW_HORIZONTAL_STRIDE_0; - dst.subnr = offset * 4; - struct brw_inst *insn = brw_MOV(p, dst, src[0]); - brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4); - brw_inst_set_no_dd_clear(p->devinfo, insn, true); - brw_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check); - - src[0].subnr = 16; - dst.subnr = 16 + offset * 4; - insn = brw_MOV(p, dst, src[0]); - brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4); - brw_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear); - brw_inst_set_no_dd_check(p->devinfo, insn, true); - - brw_set_default_access_mode(p, BRW_ALIGN_16); - break; - } - - case VEC4_OPCODE_ZERO_OOB_PUSH_REGS: - generate_zero_oob_push_regs(p, &prog_data->base, dst, src[0]); - break; - - case VEC4_TCS_OPCODE_URB_WRITE: - generate_tcs_urb_write(p, inst, src[0]); - send_count++; - break; - - case VEC4_OPCODE_URB_READ: - generate_vec4_urb_read(p, 
inst, dst, src[0]); - send_count++; - break; - - case VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS: - generate_tcs_input_urb_offsets(p, dst, src[0], src[1]); - break; - - case VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS: - generate_tcs_output_urb_offsets(p, dst, src[0], src[1]); - break; - - case TCS_OPCODE_GET_INSTANCE_ID: - generate_tcs_get_instance_id(p, dst); - break; - - case TCS_OPCODE_GET_PRIMITIVE_ID: - generate_tcs_get_primitive_id(p, dst); - break; - - case TCS_OPCODE_CREATE_BARRIER_HEADER: - generate_tcs_create_barrier_header(p, prog_data, dst); - break; - - case TES_OPCODE_CREATE_INPUT_READ_HEADER: - generate_tes_create_input_read_header(p, dst); - break; - - case TES_OPCODE_ADD_INDIRECT_URB_OFFSET: - generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]); - break; - - case TES_OPCODE_GET_PRIMITIVE_ID: - generate_tes_get_primitive_id(p, dst); - break; - - case TCS_OPCODE_SRC0_010_IS_ZERO: - /* If src_reg had stride like fs_reg, we wouldn't need this. */ - brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0)); - break; - - case TCS_OPCODE_RELEASE_INPUT: - generate_tcs_release_input(p, dst, src[0], src[1]); - send_count++; - break; - - case TCS_OPCODE_THREAD_END: - generate_tcs_thread_end(p, inst); - send_count++; - break; - - case SHADER_OPCODE_BARRIER: - brw_barrier(p, src[0]); - brw_WAIT(p); - send_count++; - break; - - case SHADER_OPCODE_MOV_INDIRECT: - generate_mov_indirect(p, inst, dst, src[0], src[1]); - break; - - case BRW_OPCODE_DIM: - assert(devinfo->verx10 == 75); - assert(src[0].type == BRW_REGISTER_TYPE_DF); - assert(dst.type == BRW_REGISTER_TYPE_DF); - brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F)); - break; - - case SHADER_OPCODE_RND_MODE: { - assert(src[0].file == BRW_IMMEDIATE_VALUE); - /* - * Changes the floating point rounding mode updating the control - * register field defined at cr0.0[5-6] bits. 
- */ - enum brw_rnd_mode mode = - (enum brw_rnd_mode) (src[0].d << BRW_CR0_RND_MODE_SHIFT); - brw_float_controls_mode(p, mode, BRW_CR0_RND_MODE_MASK); - } - break; - - default: - unreachable("Unsupported opcode"); - } - - if (inst->opcode == VEC4_OPCODE_PACK_BYTES) { - /* Handled dependency hints in the generator. */ - - assert(!inst->conditional_mod); - } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) { - assert(p->nr_insn == pre_emit_nr_insn + 1 || - !"conditional_mod, no_dd_check, or no_dd_clear set for IR " - "emitting more than 1 instruction"); - - brw_inst *last = &p->store[pre_emit_nr_insn]; - - if (inst->conditional_mod) - brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod); - brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear); - brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check); - } - } - - brw_set_uip_jip(p, 0); - - /* end of program sentinel */ - disasm_new_inst_group(disasm_info, p->next_insn_offset); - -#ifndef NDEBUG - bool validated = -#else - if (unlikely(debug_enabled)) -#endif - brw_validate_instructions(&compiler->isa, p->store, - 0, p->next_insn_offset, - disasm_info); - - int before_size = p->next_insn_offset; - brw_compact_instructions(p, 0, disasm_info); - int after_size = p->next_insn_offset; - - bool dump_shader_bin = brw_should_dump_shader_bin(); - unsigned char sha1[21]; - char sha1buf[41]; - - if (unlikely(debug_enabled || dump_shader_bin)) { - _mesa_sha1_compute(p->store, p->next_insn_offset, sha1); - _mesa_sha1_format(sha1buf, sha1); - } - - if (unlikely(dump_shader_bin)) - brw_dump_shader_bin(p->store, 0, p->next_insn_offset, sha1buf); - - if (unlikely(debug_enabled)) { - fprintf(stderr, "Native code for %s %s shader %s (src_hash 0x%08x) (sha1 %s):\n", - nir->info.label ? nir->info.label : "unnamed", - _mesa_shader_stage_to_string(nir->info.stage), nir->info.name, - params->source_hash, sha1buf); - - fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. 
%d:%d " - "spills:fills, %u sends. Compacted %d to %d bytes (%.0f%%)\n", - stage_abbrev, before_size / 16, loop_count, perf.latency, - spill_count, fill_count, send_count, before_size, after_size, - 100.0f * (before_size - after_size) / before_size); - - /* overriding the shader makes disasm_info invalid */ - if (!brw_try_override_assembly(p, 0, sha1buf)) { - dump_assembly(p->store, 0, p->next_insn_offset, - disasm_info, perf.block_latency); - } else { - fprintf(stderr, "Successfully overrode shader with sha1 %s\n\n", sha1buf); - } - } - ralloc_free(disasm_info); - assert(validated); - - brw_shader_debug_log(compiler, params->log_data, - "%s vec4 shader: %d inst, %d loops, %u cycles, " - "%d:%d spills:fills, %u sends, " - "compacted %d to %d bytes.\n", - stage_abbrev, before_size / 16, - loop_count, perf.latency, spill_count, - fill_count, send_count, before_size, after_size); - if (stats) { - stats->dispatch_width = 0; - stats->max_dispatch_width = 0; - stats->instructions = before_size / 16; - stats->sends = send_count; - stats->loops = loop_count; - stats->cycles = perf.latency; - stats->spills = spill_count; - stats->fills = fill_count; - } -} - -extern "C" const unsigned * -brw_vec4_generate_assembly(const struct brw_compiler *compiler, - const struct brw_compile_params *params, - const nir_shader *nir, - struct brw_vue_prog_data *prog_data, - const struct cfg_t *cfg, - const performance &perf, - bool debug_enabled) -{ - struct brw_codegen *p = rzalloc(params->mem_ctx, struct brw_codegen); - brw_init_codegen(&compiler->isa, p, params->mem_ctx); - brw_set_default_access_mode(p, BRW_ALIGN_16); - - generate_code(p, compiler, params, - nir, prog_data, cfg, perf, - params->stats, debug_enabled); - - assert(prog_data->base.const_data_size == 0); - if (nir->constant_data_size > 0) { - prog_data->base.const_data_size = nir->constant_data_size; - prog_data->base.const_data_offset = - brw_append_data(p, nir->constant_data, nir->constant_data_size, 32); - } - - return 
brw_get_program(p, &prog_data->base.program_size); -} diff --git a/src/intel/compiler/brw_vec4_gs_nir.cpp b/src/intel/compiler/brw_vec4_gs_nir.cpp deleted file mode 100644 index 60b42da87b9..00000000000 --- a/src/intel/compiler/brw_vec4_gs_nir.cpp +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright © 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "brw_vec4_gs_visitor.h" - -namespace brw { - -void -vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) -{ - dst_reg dest; - src_reg src; - - switch (instr->intrinsic) { - case nir_intrinsic_load_per_vertex_input: { - assert(instr->def.bit_size == 32); - /* The EmitNoIndirectInput flag guarantees our vertex index will - * be constant. We should handle indirects someday. 
- */ - const unsigned vertex = nir_src_as_uint(instr->src[0]); - const unsigned offset_reg = nir_src_as_uint(instr->src[1]); - - const unsigned input_array_stride = prog_data->urb_read_length * 2; - - /* Make up a type...we have no way of knowing... */ - const glsl_type *const type = glsl_ivec_type(instr->num_components); - - src = src_reg(ATTR, input_array_stride * vertex + - nir_intrinsic_base(instr) + offset_reg, - type); - src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr)); - - dest = get_nir_def(instr->def, src.type); - dest.writemask = brw_writemask_for_size(instr->num_components); - emit(MOV(dest, src)); - break; - } - - case nir_intrinsic_load_input: - unreachable("nir_lower_io should have produced per_vertex intrinsics"); - - case nir_intrinsic_emit_vertex_with_counter: - this->vertex_count = - retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD); - gs_emit_vertex(nir_intrinsic_stream_id(instr)); - break; - - case nir_intrinsic_end_primitive_with_counter: - this->vertex_count = - retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD); - gs_end_primitive(); - break; - - case nir_intrinsic_set_vertex_and_primitive_count: - this->vertex_count = - retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD); - break; - - case nir_intrinsic_load_primitive_id: - assert(gs_prog_data->include_primitive_id); - dest = get_nir_def(instr->def, BRW_REGISTER_TYPE_D); - emit(MOV(dest, retype(brw_vec4_grf(1, 0), BRW_REGISTER_TYPE_D))); - break; - - case nir_intrinsic_load_invocation_id: { - dest = get_nir_def(instr->def, BRW_REGISTER_TYPE_D); - if (gs_prog_data->invocations > 1) - emit(GS_OPCODE_GET_INSTANCE_ID, dest); - else - emit(MOV(dest, brw_imm_ud(0))); - break; - } - - default: - vec4_visitor::nir_emit_intrinsic(instr); - } -} -} diff --git a/src/intel/compiler/brw_vec4_gs_visitor.cpp b/src/intel/compiler/brw_vec4_gs_visitor.cpp deleted file mode 100644 index d611e50d544..00000000000 --- a/src/intel/compiler/brw_vec4_gs_visitor.cpp +++ 
/dev/null @@ -1,560 +0,0 @@ -/* - * Copyright © 2013 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -/** - * \file brw_vec4_gs_visitor.cpp - * - * Geometry-shader-specific code derived from the vec4_visitor class. 
- */ - -#include "brw_vec4_gs_visitor.h" -#include "brw_cfg.h" -#include "brw_fs.h" - -namespace brw { - -vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler, - const struct brw_compile_params *params, - struct brw_gs_compile *c, - struct brw_gs_prog_data *prog_data, - const nir_shader *shader, - bool no_spills, - bool debug_enabled) - : vec4_visitor(compiler, params, &c->key.base.tex, - &prog_data->base, shader, - no_spills, debug_enabled), - c(c), - gs_prog_data(prog_data) -{ -} - - -static inline struct brw_reg -attribute_to_hw_reg(int attr, brw_reg_type type, bool interleaved) -{ - struct brw_reg reg; - - unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(type)); - if (interleaved) { - reg = stride(brw_vecn_grf(width, attr / 2, (attr % 2) * 4), 0, width, 1); - } else { - reg = brw_vecn_grf(width, attr, 0); - } - - reg.type = type; - return reg; -} - -/** - * Replace each register of type ATTR in this->instructions with a reference - * to a fixed HW register. - * - * If interleaved is true, then each attribute takes up half a register, with - * register N containing attribute 2*N in its first half and attribute 2*N+1 - * in its second half (this corresponds to the payload setup used by geometry - * shaders in "single" or "dual instanced" dispatch mode). If interleaved is - * false, then each attribute takes up a whole register, with register N - * containing attribute N (this corresponds to the payload setup used by - * vertex shaders, and by geometry shaders in "dual object" dispatch mode). - */ -int -vec4_gs_visitor::setup_varying_inputs(int payload_reg, - int attributes_per_reg) -{ - /* For geometry shaders there are N copies of the input attributes, where N - * is the number of input vertices. attribute_map[BRW_VARYING_SLOT_COUNT * - * i + j] represents attribute j for vertex i. 
- * - * Note that GS inputs are read from the VUE 256 bits (2 vec4's) at a time, - * so the total number of input slots that will be delivered to the GS (and - * thus the stride of the input arrays) is urb_read_length * 2. - */ - const unsigned num_input_vertices = nir->info.gs.vertices_in; - assert(num_input_vertices <= MAX_GS_INPUT_VERTICES); - unsigned input_array_stride = prog_data->urb_read_length * 2; - - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - for (int i = 0; i < 3; i++) { - if (inst->src[i].file != ATTR) - continue; - - assert(inst->src[i].offset % REG_SIZE == 0); - int grf = payload_reg * attributes_per_reg + - inst->src[i].nr + inst->src[i].offset / REG_SIZE; - - struct brw_reg reg = - attribute_to_hw_reg(grf, inst->src[i].type, attributes_per_reg > 1); - reg.swizzle = inst->src[i].swizzle; - if (inst->src[i].abs) - reg = brw_abs(reg); - if (inst->src[i].negate) - reg = negate(reg); - - inst->src[i] = reg; - } - } - - int regs_used = ALIGN(input_array_stride * num_input_vertices, - attributes_per_reg) / attributes_per_reg; - return payload_reg + regs_used; -} - -void -vec4_gs_visitor::setup_payload() -{ - /* If we are in dual instanced or single mode, then attributes are going - * to be interleaved, so one register contains two attribute slots. - */ - int attributes_per_reg = - prog_data->dispatch_mode == INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2; - - int reg = 0; - - /* The payload always contains important data in r0, which contains - * the URB handles that are passed on to the URB write at the end - * of the thread. - */ - reg++; - - /* If the shader uses gl_PrimitiveIDIn, that goes in r1. */ - if (gs_prog_data->include_primitive_id) - reg++; - - reg = setup_uniforms(reg); - - reg = setup_varying_inputs(reg, attributes_per_reg); - - this->first_non_payload_grf = reg; -} - - -void -vec4_gs_visitor::emit_prolog() -{ - /* In vertex shaders, r0.2 is guaranteed to be initialized to zero. 
In - * geometry shaders, it isn't (it contains a bunch of information we don't - * need, like the input primitive type). We need r0.2 to be zero in order - * to build scratch read/write messages correctly (otherwise this value - * will be interpreted as a global offset, causing us to do our scratch - * reads/writes to garbage memory). So just set it to zero at the top of - * the shader. - */ - this->current_annotation = "clear r0.2"; - dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD)); - vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, brw_imm_ud(0u)); - inst->force_writemask_all = true; - - /* Create a virtual register to hold the vertex count */ - this->vertex_count = src_reg(this, glsl_uint_type()); - - /* Initialize the vertex_count register to 0 */ - this->current_annotation = "initialize vertex_count"; - inst = emit(MOV(dst_reg(this->vertex_count), brw_imm_ud(0u))); - inst->force_writemask_all = true; - - if (c->control_data_header_size_bits > 0) { - /* Create a virtual register to hold the current set of control data - * bits. - */ - this->control_data_bits = src_reg(this, glsl_uint_type()); - - /* If we're outputting more than 32 control data bits, then EmitVertex() - * will set control_data_bits to 0 after emitting the first vertex. - * Otherwise, we need to initialize it to 0 here. - */ - if (c->control_data_header_size_bits <= 32) { - this->current_annotation = "initialize control data bits"; - inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u))); - inst->force_writemask_all = true; - } - } - - this->current_annotation = NULL; -} - -void -vec4_gs_visitor::emit_thread_end() -{ - if (c->control_data_header_size_bits > 0) { - /* During shader execution, we only ever call emit_control_data_bits() - * just prior to outputting a vertex. Therefore, the control data bits - * corresponding to the most recently output vertex still need to be - * emitted. 
- */ - current_annotation = "thread end: emit control data bits"; - emit_control_data_bits(); - } - - /* MRF 0 is reserved for the debugger, so start with message header - * in MRF 1. - */ - int base_mrf = 1; - - current_annotation = "thread end"; - dst_reg mrf_reg(MRF, base_mrf); - src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - vec4_instruction *inst = emit(MOV(mrf_reg, r0)); - inst->force_writemask_all = true; - emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count); - inst = emit(GS_OPCODE_THREAD_END); - inst->base_mrf = base_mrf; - inst->mlen = 1; -} - - -void -vec4_gs_visitor::emit_urb_write_header(int mrf) -{ - /* The SEND instruction that writes the vertex data to the VUE will use - * per_slot_offset=true, which means that DWORDs 3 and 4 of the message - * header specify an offset (in multiples of 256 bits) into the URB entry - * at which the write should take place. - * - * So we have to prepare a message header with the appropriate offset - * values. - */ - dst_reg mrf_reg(MRF, mrf); - src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - this->current_annotation = "URB write header"; - vec4_instruction *inst = emit(MOV(mrf_reg, r0)); - inst->force_writemask_all = true; - emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count, - brw_imm_ud(gs_prog_data->output_vertex_size_hwords)); -} - - -vec4_instruction * -vec4_gs_visitor::emit_urb_write_opcode(bool complete) -{ - /* We don't care whether the vertex is complete, because in general - * geometry shaders output multiple vertices, and we don't terminate the - * thread until all vertices are complete. - */ - (void) complete; - - vec4_instruction *inst = emit(VEC4_GS_OPCODE_URB_WRITE); - inst->offset = gs_prog_data->control_data_header_size_hwords; - - inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET; - return inst; -} - - -/** - * Write out a batch of 32 control data bits from the control_data_bits - * register to the URB. 
- * - * The current value of the vertex_count register determines which DWORD in - * the URB receives the control data bits. The control_data_bits register is - * assumed to contain the correct data for the vertex that was most recently - * output, and all previous vertices that share the same DWORD. - * - * This function takes care of ensuring that if no vertices have been output - * yet, no control bits are emitted. - */ -void -vec4_gs_visitor::emit_control_data_bits() -{ - assert(c->control_data_bits_per_vertex != 0); - - /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized) - * granularity, we need to use two tricks to ensure that the batch of 32 - * control data bits is written to the appropriate DWORD in the URB. To - * select which vec4 we are writing to, we use the "slot {0,1} offset" - * fields of the message header. To select which DWORD in the vec4 we are - * writing to, we use the channel mask fields of the message header. To - * avoid penalizing geometry shaders that emit a small number of vertices - * with extra bookkeeping, we only do each of these tricks when - * c->prog_data.control_data_header_size_bits is large enough to make it - * necessary. - * - * Note: this means that if we're outputting just a single DWORD of control - * data bits, we'll actually replicate it four times since we won't do any - * channel masking. But that's not a problem since in this case the - * hardware only pays attention to the first DWORD. 
- */ - enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD; - if (c->control_data_header_size_bits > 32) - urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS; - if (c->control_data_header_size_bits > 128) - urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET; - - /* If we are using either channel masks or a per-slot offset, then we - * need to figure out which DWORD we are trying to write to, using the - * formula: - * - * dword_index = (vertex_count - 1) * bits_per_vertex / 32 - * - * Since bits_per_vertex is a power of two, and is known at compile - * time, this can be optimized to: - * - * dword_index = (vertex_count - 1) >> (6 - log2(bits_per_vertex)) - */ - src_reg dword_index(this, glsl_uint_type()); - if (urb_write_flags) { - src_reg prev_count(this, glsl_uint_type()); - emit(ADD(dst_reg(prev_count), this->vertex_count, - brw_imm_ud(0xffffffffu))); - unsigned log2_bits_per_vertex = - util_last_bit(c->control_data_bits_per_vertex); - emit(SHR(dst_reg(dword_index), prev_count, - brw_imm_ud(6 - log2_bits_per_vertex))); - } - - /* Start building the URB write message. The first MRF gets a copy of - * R0. - */ - int base_mrf = 1; - dst_reg mrf_reg(MRF, base_mrf); - src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)); - vec4_instruction *inst = emit(MOV(mrf_reg, r0)); - inst->force_writemask_all = true; - - if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) { - /* Set the per-slot offset to dword_index / 4, to that we'll write to - * the appropriate OWORD within the control data header. - */ - src_reg per_slot_offset(this, glsl_uint_type()); - emit(SHR(dst_reg(per_slot_offset), dword_index, brw_imm_ud(2u))); - emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset, - brw_imm_ud(1u)); - } - - if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) { - /* Set the channel masks to 1 << (dword_index % 4), so that we'll - * write to the appropriate DWORD within the OWORD. 
We need to do - * this computation with force_writemask_all, otherwise garbage data - * from invocation 0 might clobber the mask for invocation 1 when - * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks - * together. - */ - src_reg channel(this, glsl_uint_type()); - inst = emit(AND(dst_reg(channel), dword_index, brw_imm_ud(3u))); - inst->force_writemask_all = true; - src_reg one(this, glsl_uint_type()); - inst = emit(MOV(dst_reg(one), brw_imm_ud(1u))); - inst->force_writemask_all = true; - src_reg channel_mask(this, glsl_uint_type()); - inst = emit(SHL(dst_reg(channel_mask), one, channel)); - inst->force_writemask_all = true; - emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask), - channel_mask); - emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask); - } - - /* Store the control data bits in the message payload and send it. */ - dst_reg mrf_reg2(MRF, base_mrf + 1); - inst = emit(MOV(mrf_reg2, this->control_data_bits)); - inst->force_writemask_all = true; - inst = emit(VEC4_GS_OPCODE_URB_WRITE); - inst->urb_write_flags = urb_write_flags; - inst->base_mrf = base_mrf; - inst->mlen = 2; -} - -void -vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id) -{ - /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */ - - /* Note: we are calling this *before* increasing vertex_count, so - * this->vertex_count == vertex_count - 1 in the formula above. - */ - - /* Stream mode uses 2 bits per vertex */ - assert(c->control_data_bits_per_vertex == 2); - - /* Must be a valid stream */ - assert(stream_id < 4); /* MAX_VERTEX_STREAMS */ - - /* Control data bits are initialized to 0 so we don't have to set any - * bits when sending vertices to stream 0. 
- */ - if (stream_id == 0) - return; - - /* reg::sid = stream_id */ - src_reg sid(this, glsl_uint_type()); - emit(MOV(dst_reg(sid), brw_imm_ud(stream_id))); - - /* reg:shift_count = 2 * (vertex_count - 1) */ - src_reg shift_count(this, glsl_uint_type()); - emit(SHL(dst_reg(shift_count), this->vertex_count, brw_imm_ud(1u))); - - /* Note: we're relying on the fact that the GEN SHL instruction only pays - * attention to the lower 5 bits of its second source argument, so on this - * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to - * stream_id << ((2 * (vertex_count - 1)) % 32). - */ - src_reg mask(this, glsl_uint_type()); - emit(SHL(dst_reg(mask), sid, shift_count)); - emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask)); -} - -void -vec4_gs_visitor::gs_emit_vertex(int stream_id) -{ - this->current_annotation = "emit vertex: safety check"; - - /* Haswell and later hardware ignores the "Render Stream Select" bits - * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled, - * and instead sends all primitives down the pipeline for rasterization. - * If the SOL stage is enabled, "Render Stream Select" is honored and - * primitives bound to non-zero streams are discarded after stream output. - * - * Since the only purpose of primives sent to non-zero streams is to - * be recorded by transform feedback, we can simply discard all geometry - * bound to these streams when transform feedback is disabled. - */ - if (stream_id > 0 && !nir->info.has_transform_feedback_varyings) - return; - - /* If we're outputting 32 control data bits or less, then we can wait - * until the shader is over to output them all. Otherwise we need to - * output them as we go. Now is the time to do it, since we're about to - * output the vertex_count'th vertex, so it's guaranteed that the - * control data bits associated with the (vertex_count - 1)th vertex are - * correct. 
- */ - if (c->control_data_header_size_bits > 32) { - this->current_annotation = "emit vertex: emit control data bits"; - /* Only emit control data bits if we've finished accumulating a batch - * of 32 bits. This is the case when: - * - * (vertex_count * bits_per_vertex) % 32 == 0 - * - * (in other words, when the last 5 bits of vertex_count * - * bits_per_vertex are 0). Assuming bits_per_vertex == 2^n for some - * integer n (which is always the case, since bits_per_vertex is - * always 1 or 2), this is equivalent to requiring that the last 5-n - * bits of vertex_count are 0: - * - * vertex_count & (2^(5-n) - 1) == 0 - * - * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is - * equivalent to: - * - * vertex_count & (32 / bits_per_vertex - 1) == 0 - */ - vec4_instruction *inst = - emit(AND(dst_null_ud(), this->vertex_count, - brw_imm_ud(32 / c->control_data_bits_per_vertex - 1))); - inst->conditional_mod = BRW_CONDITIONAL_Z; - - emit(IF(BRW_PREDICATE_NORMAL)); - { - /* If vertex_count is 0, then no control data bits have been - * accumulated yet, so we skip emitting them. - */ - emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), - BRW_CONDITIONAL_NEQ)); - emit(IF(BRW_PREDICATE_NORMAL)); - emit_control_data_bits(); - emit(BRW_OPCODE_ENDIF); - - /* Reset control_data_bits to 0 so we can start accumulating a new - * batch. - * - * Note: in the case where vertex_count == 0, this neutralizes the - * effect of any call to EndPrimitive() that the shader may have - * made before outputting its first vertex. - */ - inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u))); - inst->force_writemask_all = true; - } - emit(BRW_OPCODE_ENDIF); - } - - this->current_annotation = "emit vertex: vertex data"; - emit_vertex(); - - /* In stream mode we have to set control data bits for all vertices - * unless we have disabled control data bits completely (which we do - * do for MESA_PRIM_POINTS outputs that don't use streams). 
- */ - if (c->control_data_header_size_bits > 0 && - gs_prog_data->control_data_format == - GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) { - this->current_annotation = "emit vertex: Stream control data bits"; - set_stream_control_data_bits(stream_id); - } - - this->current_annotation = NULL; -} - -void -vec4_gs_visitor::gs_end_primitive() -{ - /* We can only do EndPrimitive() functionality when the control data - * consists of cut bits. Fortunately, the only time it isn't is when the - * output type is points, in which case EndPrimitive() is a no-op. - */ - if (gs_prog_data->control_data_format != - GFX7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) { - return; - } - - if (c->control_data_header_size_bits == 0) - return; - - /* Cut bits use one bit per vertex. */ - assert(c->control_data_bits_per_vertex == 1); - - /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting - * vertex n, 0 otherwise. So all we need to do here is mark bit - * (vertex_count - 1) % 32 in the cut_bits register to indicate that - * EndPrimitive() was called after emitting vertex (vertex_count - 1); - * vec4_gs_visitor::emit_control_data_bits() will take care of the rest. - * - * Note that if EndPrimitve() is called before emitting any vertices, this - * will cause us to set bit 31 of the control_data_bits register to 1. - * That's fine because: - * - * - If max_vertices < 32, then vertex number 31 (zero-based) will never be - * output, so the hardware will ignore cut bit 31. - * - * - If max_vertices == 32, then vertex number 31 is guaranteed to be the - * last vertex, so setting cut bit 31 has no effect (since the primitive - * is automatically ended when the GS terminates). - * - * - If max_vertices > 32, then the ir_emit_vertex visitor will reset the - * control_data_bits register to 0 when the first vertex is emitted. 
- */ - - /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */ - src_reg one(this, glsl_uint_type()); - emit(MOV(dst_reg(one), brw_imm_ud(1u))); - src_reg prev_count(this, glsl_uint_type()); - emit(ADD(dst_reg(prev_count), this->vertex_count, brw_imm_ud(0xffffffffu))); - src_reg mask(this, glsl_uint_type()); - /* Note: we're relying on the fact that the GEN SHL instruction only pays - * attention to the lower 5 bits of its second source argument, so on this - * architecture, 1 << (vertex_count - 1) is equivalent to 1 << - * ((vertex_count - 1) % 32). - */ - emit(SHL(dst_reg(mask), one, prev_count)); - emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask)); -} - -} /* namespace brw */ - diff --git a/src/intel/compiler/brw_vec4_gs_visitor.h b/src/intel/compiler/brw_vec4_gs_visitor.h deleted file mode 100644 index ec8b6f7fa8b..00000000000 --- a/src/intel/compiler/brw_vec4_gs_visitor.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Copyright © 2013 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -/** - * \file brw_vec4_gs_visitor.h - * - * Geometry-shader-specific code derived from the vec4_visitor class. - */ - -#ifndef BRW_VEC4_GS_VISITOR_H -#define BRW_VEC4_GS_VISITOR_H - -#include "brw_vec4.h" - -#define MAX_GS_INPUT_VERTICES 6 - -#ifdef __cplusplus -namespace brw { - -class vec4_gs_visitor : public vec4_visitor -{ -public: - vec4_gs_visitor(const struct brw_compiler *compiler, - const struct brw_compile_params *params, - struct brw_gs_compile *c, - struct brw_gs_prog_data *prog_data, - const nir_shader *shader, - bool no_spills, - bool debug_enabled); - -protected: - virtual void setup_payload(); - virtual void emit_prolog(); - virtual void emit_thread_end(); - virtual void emit_urb_write_header(int mrf); - virtual vec4_instruction *emit_urb_write_opcode(bool complete); - virtual void gs_emit_vertex(int stream_id); - virtual void gs_end_primitive(); - virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr); - -protected: - int setup_varying_inputs(int payload_reg, int attributes_per_reg); - void emit_control_data_bits(); - void set_stream_control_data_bits(unsigned stream_id); - - src_reg vertex_count; - src_reg control_data_bits; - const struct brw_gs_compile * const c; - struct brw_gs_prog_data * const gs_prog_data; -}; - -} /* namespace brw */ -#endif /* __cplusplus */ - -#endif /* BRW_VEC4_GS_VISITOR_H */ diff --git a/src/intel/compiler/brw_vec4_live_variables.cpp b/src/intel/compiler/brw_vec4_live_variables.cpp deleted file mode 100644 index 88fa179d0f5..00000000000 --- a/src/intel/compiler/brw_vec4_live_variables.cpp +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Copyright © 2012 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person 
obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * Authors: - * Eric Anholt - * - */ - -#include "brw_vec4.h" -#include "brw_vec4_live_variables.h" - -using namespace brw; - -#define MAX_INSTRUCTION (1 << 30) - -/** @file brw_vec4_live_variables.cpp - * - * Support for computing at the basic block level which variables - * (virtual GRFs in our case) are live at entry and exit. - * - * See Muchnick's Advanced Compiler Design and Implementation, section - * 14.1 (p444). - */ - -/** - * Sets up the use/def arrays and block-local approximation of the live ranges. - * - * The basic-block-level live variable analysis needs to know which - * variables get used before they're completely defined, and which - * variables are completely defined before they're used. - * - * We independently track each channel of a vec4. This is because we need to - * be able to recognize a sequence like: - * - * ... - * DP4 tmp.x a b; - * DP4 tmp.y c d; - * MUL result.xy tmp.xy e.xy - * ... 
- * - * as having tmp live only across that sequence (assuming it's used nowhere - * else), because it's a common pattern. A more conservative approach that - * doesn't get tmp marked a deffed in this block will tend to result in - * spilling. - */ -void -vec4_live_variables::setup_def_use() -{ - int ip = 0; - - foreach_block (block, cfg) { - assert(ip == block->start_ip); - if (block->num > 0) - assert(cfg->blocks[block->num - 1]->end_ip == ip - 1); - - foreach_inst_in_block(vec4_instruction, inst, block) { - struct block_data *bd = &block_data[block->num]; - - /* Set up the instruction uses. */ - for (unsigned int i = 0; i < 3; i++) { - if (inst->src[i].file == VGRF) { - for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) { - for (int c = 0; c < 4; c++) { - const unsigned v = var_from_reg(alloc, inst->src[i], c, j); - - start[v] = MIN2(start[v], ip); - end[v] = ip; - - if (!BITSET_TEST(bd->def, v)) - BITSET_SET(bd->use, v); - } - } - } - } - for (unsigned c = 0; c < 4; c++) { - if (inst->reads_flag(c) && - !BITSET_TEST(bd->flag_def, c)) { - BITSET_SET(bd->flag_use, c); - } - } - - /* Set up the instruction defs. */ - if (inst->dst.file == VGRF) { - for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) { - for (int c = 0; c < 4; c++) { - if (inst->dst.writemask & (1 << c)) { - const unsigned v = var_from_reg(alloc, inst->dst, c, i); - - start[v] = MIN2(start[v], ip); - end[v] = ip; - - /* Check for unconditional register writes, these are the - * things that screen off preceding definitions of a - * variable, and thus qualify for being in def[]. 
- */ - if ((!inst->predicate || inst->opcode == BRW_OPCODE_SEL) && - !BITSET_TEST(bd->use, v)) - BITSET_SET(bd->def, v); - } - } - } - } - if (inst->writes_flag(devinfo)) { - for (unsigned c = 0; c < 4; c++) { - if ((inst->dst.writemask & (1 << c)) && - !BITSET_TEST(bd->flag_use, c)) { - BITSET_SET(bd->flag_def, c); - } - } - } - - ip++; - } - } -} - -/** - * The algorithm incrementally sets bits in liveout and livein, - * propagating it through control flow. It will eventually terminate - * because it only ever adds bits, and stops when no bits are added in - * a pass. - */ -void -vec4_live_variables::compute_live_variables() -{ - bool cont = true; - - while (cont) { - cont = false; - - foreach_block_reverse (block, cfg) { - struct block_data *bd = &block_data[block->num]; - - /* Update liveout */ - foreach_list_typed(bblock_link, child_link, link, &block->children) { - struct block_data *child_bd = &block_data[child_link->block->num]; - - for (int i = 0; i < bitset_words; i++) { - BITSET_WORD new_liveout = (child_bd->livein[i] & - ~bd->liveout[i]); - if (new_liveout) { - bd->liveout[i] |= new_liveout; - cont = true; - } - } - BITSET_WORD new_liveout = (child_bd->flag_livein[0] & - ~bd->flag_liveout[0]); - if (new_liveout) { - bd->flag_liveout[0] |= new_liveout; - cont = true; - } - } - - /* Update livein */ - for (int i = 0; i < bitset_words; i++) { - BITSET_WORD new_livein = (bd->use[i] | - (bd->liveout[i] & - ~bd->def[i])); - if (new_livein & ~bd->livein[i]) { - bd->livein[i] |= new_livein; - cont = true; - } - } - BITSET_WORD new_livein = (bd->flag_use[0] | - (bd->flag_liveout[0] & - ~bd->flag_def[0])); - if (new_livein & ~bd->flag_livein[0]) { - bd->flag_livein[0] |= new_livein; - cont = true; - } - } - } -} - -/** - * Extend the start/end ranges for each variable to account for the - * new information calculated from control flow. 
- */ -void -vec4_live_variables::compute_start_end() -{ - foreach_block (block, cfg) { - const struct block_data &bd = block_data[block->num]; - - for (int i = 0; i < num_vars; i++) { - if (BITSET_TEST(bd.livein, i)) { - start[i] = MIN2(start[i], block->start_ip); - end[i] = MAX2(end[i], block->start_ip); - } - - if (BITSET_TEST(bd.liveout, i)) { - start[i] = MIN2(start[i], block->end_ip); - end[i] = MAX2(end[i], block->end_ip); - } - } - } -} - -vec4_live_variables::vec4_live_variables(const backend_shader *s) - : alloc(s->alloc), cfg(s->cfg) -{ - mem_ctx = ralloc_context(NULL); - - num_vars = alloc.total_size * 8; - start = ralloc_array(mem_ctx, int, num_vars); - end = ralloc_array(mem_ctx, int, num_vars); - - for (int i = 0; i < num_vars; i++) { - start[i] = MAX_INSTRUCTION; - end[i] = -1; - } - - devinfo = s->compiler->devinfo; - - block_data = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks); - - bitset_words = BITSET_WORDS(num_vars); - for (int i = 0; i < cfg->num_blocks; i++) { - block_data[i].def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); - block_data[i].use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); - block_data[i].livein = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); - block_data[i].liveout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words); - - block_data[i].flag_def[0] = 0; - block_data[i].flag_use[0] = 0; - block_data[i].flag_livein[0] = 0; - block_data[i].flag_liveout[0] = 0; - } - - setup_def_use(); - compute_live_variables(); - compute_start_end(); -} - -vec4_live_variables::~vec4_live_variables() -{ - ralloc_free(mem_ctx); -} - -static bool -check_register_live_range(const vec4_live_variables *live, int ip, - unsigned var, unsigned n) -{ - for (unsigned j = 0; j < n; j += 4) { - if (var + j >= unsigned(live->num_vars) || - live->start[var + j] > ip || live->end[var + j] < ip) - return false; - } - - return true; -} - -bool -vec4_live_variables::validate(const backend_shader *s) const -{ - unsigned ip = 0; - - 
foreach_block_and_inst(block, vec4_instruction, inst, s->cfg) { - for (unsigned c = 0; c < 4; c++) { - if (inst->dst.writemask & (1 << c)) { - for (unsigned i = 0; i < 3; i++) { - if (inst->src[i].file == VGRF && - !check_register_live_range(this, ip, - var_from_reg(alloc, inst->src[i], c), - regs_read(inst, i))) - return false; - } - - if (inst->dst.file == VGRF && - !check_register_live_range(this, ip, - var_from_reg(alloc, inst->dst, c), - regs_written(inst))) - return false; - } - } - - ip++; - } - - return true; -} - -int -vec4_live_variables::var_range_start(unsigned v, unsigned n) const -{ - int ip = INT_MAX; - - for (unsigned i = 0; i < n; i++) - ip = MIN2(ip, start[v + i]); - - return ip; -} - -int -vec4_live_variables::var_range_end(unsigned v, unsigned n) const -{ - int ip = INT_MIN; - - for (unsigned i = 0; i < n; i++) - ip = MAX2(ip, end[v + i]); - - return ip; -} - -bool -vec4_live_variables::vgrfs_interfere(int a, int b) const -{ - return !((var_range_end(8 * alloc.offsets[a], 8 * alloc.sizes[a]) <= - var_range_start(8 * alloc.offsets[b], 8 * alloc.sizes[b])) || - (var_range_end(8 * alloc.offsets[b], 8 * alloc.sizes[b]) <= - var_range_start(8 * alloc.offsets[a], 8 * alloc.sizes[a]))); -} diff --git a/src/intel/compiler/brw_vec4_live_variables.h b/src/intel/compiler/brw_vec4_live_variables.h deleted file mode 100644 index 39d97c8a521..00000000000 --- a/src/intel/compiler/brw_vec4_live_variables.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright © 2012 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright 
notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * Authors: - * Eric Anholt - * - */ - -#ifndef BRW_VEC4_LIVE_VARIABLES_H -#define BRW_VEC4_LIVE_VARIABLES_H - -#include "brw_ir_vec4.h" -#include "brw_ir_analysis.h" -#include "util/bitset.h" - -struct backend_shader; - -namespace brw { - -class vec4_live_variables { -public: - struct block_data { - /** - * Which variables are defined before being used in the block. - * - * Note that for our purposes, "defined" means unconditionally, completely - * defined. - */ - BITSET_WORD *def; - - /** - * Which variables are used before being defined in the block. - */ - BITSET_WORD *use; - - /** Which defs reach the entry point of the block. */ - BITSET_WORD *livein; - - /** Which defs reach the exit point of the block. 
*/ - BITSET_WORD *liveout; - - BITSET_WORD flag_def[1]; - BITSET_WORD flag_use[1]; - BITSET_WORD flag_livein[1]; - BITSET_WORD flag_liveout[1]; - }; - - vec4_live_variables(const backend_shader *s); - ~vec4_live_variables(); - - bool - validate(const backend_shader *s) const; - - analysis_dependency_class - dependency_class() const - { - return (DEPENDENCY_INSTRUCTION_IDENTITY | - DEPENDENCY_INSTRUCTION_DATA_FLOW | - DEPENDENCY_VARIABLES); - } - - int num_vars; - int bitset_words; - - const struct intel_device_info *devinfo; - - /** Per-basic-block information on live variables */ - struct block_data *block_data; - - /** @{ - * Final computed live ranges for each variable. - */ - int *start; - int *end; - /** @} */ - - int var_range_start(unsigned v, unsigned n) const; - int var_range_end(unsigned v, unsigned n) const; - bool vgrfs_interfere(int a, int b) const; - -protected: - void setup_def_use(); - void compute_live_variables(); - void compute_start_end(); - - const simple_allocator &alloc; - cfg_t *cfg; - void *mem_ctx; -}; - -/* Returns the variable index for the k-th dword of the c-th component of - * register reg. 
- */ -inline unsigned -var_from_reg(const simple_allocator &alloc, const src_reg ®, - unsigned c = 0, unsigned k = 0) -{ - assert(reg.file == VGRF && reg.nr < alloc.count && c < 4); - const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4); - unsigned result = - 8 * alloc.offsets[reg.nr] + reg.offset / 4 + - (BRW_GET_SWZ(reg.swizzle, c) + k / csize * 4) * csize + k % csize; - /* Do not exceed the limit for this register */ - assert(result < 8 * (alloc.offsets[reg.nr] + alloc.sizes[reg.nr])); - return result; -} - -inline unsigned -var_from_reg(const simple_allocator &alloc, const dst_reg ®, - unsigned c = 0, unsigned k = 0) -{ - assert(reg.file == VGRF && reg.nr < alloc.count && c < 4); - const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4); - unsigned result = - 8 * alloc.offsets[reg.nr] + reg.offset / 4 + - (c + k / csize * 4) * csize + k % csize; - /* Do not exceed the limit for this register */ - assert(result < 8 * (alloc.offsets[reg.nr] + alloc.sizes[reg.nr])); - return result; -} - -} /* namespace brw */ - -#endif /* BRW_VEC4_LIVE_VARIABLES_H */ diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp deleted file mode 100644 index 9121f8e10f2..00000000000 --- a/src/intel/compiler/brw_vec4_nir.cpp +++ /dev/null @@ -1,2307 +0,0 @@ -/* - * Copyright © 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "brw_nir.h" -#include "brw_vec4.h" -#include "brw_vec4_builder.h" -#include "brw_vec4_surface_builder.h" -#include "brw_eu.h" -#include "nir.h" -#include "nir_intrinsics.h" -#include "nir_intrinsics_indices.h" - -using namespace brw; -using namespace brw::surface_access; - -namespace brw { - -void -vec4_visitor::emit_nir_code() -{ - /* Globally set the rounding mode based on the float controls. gen7 doesn't - * support 16-bit floats, and gen8 switches to scalar VS. So we don't need - * to do any per-instruction mode switching the way the scalar FS handles. 
- */ - emit_shader_float_controls_execution_mode(); - if (nir->num_uniforms > 0) - nir_setup_uniforms(); - - nir_emit_impl(nir_shader_get_entrypoint((nir_shader *)nir)); -} - -void -vec4_visitor::nir_setup_uniforms() -{ - uniforms = nir->num_uniforms / 16; -} - -void -vec4_visitor::nir_emit_impl(nir_function_impl *impl) -{ - nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc); - - nir_emit_cf_list(&impl->body); -} - -void -vec4_visitor::nir_emit_cf_list(exec_list *list) -{ - exec_list_validate(list); - foreach_list_typed(nir_cf_node, node, node, list) { - switch (node->type) { - case nir_cf_node_if: - nir_emit_if(nir_cf_node_as_if(node)); - break; - - case nir_cf_node_loop: - nir_emit_loop(nir_cf_node_as_loop(node)); - break; - - case nir_cf_node_block: - nir_emit_block(nir_cf_node_as_block(node)); - break; - - default: - unreachable("Invalid CFG node block"); - } - } -} - -void -vec4_visitor::nir_emit_if(nir_if *if_stmt) -{ - /* First, put the condition in f0 */ - src_reg condition = get_nir_src(if_stmt->condition, BRW_REGISTER_TYPE_D, 1); - vec4_instruction *inst = emit(MOV(dst_null_d(), condition)); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - - /* We can just predicate based on the X channel, as the condition only - * goes on its own line */ - emit(IF(BRW_PREDICATE_ALIGN16_REPLICATE_X)); - - nir_emit_cf_list(&if_stmt->then_list); - - if (!nir_cf_list_is_empty_block(&if_stmt->else_list)) { - emit(BRW_OPCODE_ELSE); - nir_emit_cf_list(&if_stmt->else_list); - } - - emit(BRW_OPCODE_ENDIF); -} - -void -vec4_visitor::nir_emit_loop(nir_loop *loop) -{ - assert(!nir_loop_has_continue_construct(loop)); - emit(BRW_OPCODE_DO); - - nir_emit_cf_list(&loop->body); - - emit(BRW_OPCODE_WHILE); -} - -void -vec4_visitor::nir_emit_block(nir_block *block) -{ - nir_foreach_instr(instr, block) { - nir_emit_instr(instr); - } -} - -void -vec4_visitor::nir_emit_instr(nir_instr *instr) -{ - base_ir = instr; - - switch (instr->type) { - case nir_instr_type_load_const: - 
nir_emit_load_const(nir_instr_as_load_const(instr)); - break; - - case nir_instr_type_intrinsic: - nir_emit_intrinsic(nir_instr_as_intrinsic(instr)); - break; - - case nir_instr_type_alu: - nir_emit_alu(nir_instr_as_alu(instr)); - break; - - case nir_instr_type_jump: - nir_emit_jump(nir_instr_as_jump(instr)); - break; - - case nir_instr_type_tex: - nir_emit_texture(nir_instr_as_tex(instr)); - break; - - case nir_instr_type_undef: - nir_emit_undef(nir_instr_as_undef(instr)); - break; - - default: - unreachable("VS instruction not yet implemented by NIR->vec4"); - } -} - -static dst_reg -dst_reg_for_nir_reg(vec4_visitor *v, nir_def *handle, - unsigned base_offset, nir_src *indirect) -{ - nir_intrinsic_instr *decl = nir_reg_get_decl(handle); - dst_reg reg = v->nir_ssa_values[handle->index]; - if (nir_intrinsic_bit_size(decl) == 64) - reg.type = BRW_REGISTER_TYPE_DF; - - reg = offset(reg, 8, base_offset); - if (indirect) { - reg.reladdr = - new(v->mem_ctx) src_reg(v->get_nir_src(*indirect, - BRW_REGISTER_TYPE_D, - 1)); - } - return reg; -} - -dst_reg -vec4_visitor::get_nir_def(const nir_def &def) -{ - nir_intrinsic_instr *store_reg = nir_store_reg_for_def(&def); - if (!store_reg) { - dst_reg dst = - dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(def.bit_size, 32))); - if (def.bit_size == 64) - dst.type = BRW_REGISTER_TYPE_DF; - nir_ssa_values[def.index] = dst; - return dst; - } else { - nir_src *indirect = - (store_reg->intrinsic == nir_intrinsic_store_reg_indirect) ? 
- &store_reg->src[2] : NULL; - - dst_reg dst = dst_reg_for_nir_reg(this, store_reg->src[1].ssa, - nir_intrinsic_base(store_reg), - indirect); - dst.writemask = nir_intrinsic_write_mask(store_reg); - return dst; - } -} - -dst_reg -vec4_visitor::get_nir_def(const nir_def &def, enum brw_reg_type type) -{ - return retype(get_nir_def(def), type); -} - -dst_reg -vec4_visitor::get_nir_def(const nir_def &def, nir_alu_type type) -{ - return get_nir_def(def, brw_type_for_nir_type(devinfo, type)); -} - -src_reg -vec4_visitor::get_nir_src(const nir_src &src, enum brw_reg_type type, - unsigned num_components) -{ - nir_intrinsic_instr *load_reg = nir_load_reg_for_def(src.ssa); - - dst_reg reg; - if (load_reg) { - nir_src *indirect = - (load_reg->intrinsic == nir_intrinsic_load_reg_indirect) ? - &load_reg->src[1] : NULL; - - reg = dst_reg_for_nir_reg(this, load_reg->src[0].ssa, - nir_intrinsic_base(load_reg), - indirect); - } else { - reg = nir_ssa_values[src.ssa->index]; - } - - reg = retype(reg, type); - - src_reg reg_as_src = src_reg(reg); - reg_as_src.swizzle = brw_swizzle_for_size(num_components); - return reg_as_src; -} - -src_reg -vec4_visitor::get_nir_src(const nir_src &src, nir_alu_type type, - unsigned num_components) -{ - return get_nir_src(src, brw_type_for_nir_type(devinfo, type), - num_components); -} - -src_reg -vec4_visitor::get_nir_src(const nir_src &src, unsigned num_components) -{ - /* if type is not specified, default to signed int */ - return get_nir_src(src, nir_type_int32, num_components); -} - -src_reg -vec4_visitor::get_nir_src_imm(const nir_src &src) -{ - assert(nir_src_num_components(src) == 1); - assert(nir_src_bit_size(src) == 32); - return nir_src_is_const(src) ? src_reg(brw_imm_d(nir_src_as_int(src))) : - get_nir_src(src, 1); -} - -src_reg -vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr) -{ - nir_src *offset_src = nir_get_io_offset_src(instr); - - if (nir_src_is_const(*offset_src)) { - /* The only constant offset we should find is 0. 
brw_nir.c's - * add_const_offset_to_base() will fold other constant offsets - * into the base index. - */ - assert(nir_src_as_uint(*offset_src) == 0); - return src_reg(); - } - - return get_nir_src(*offset_src, BRW_REGISTER_TYPE_UD, 1); -} - -static src_reg -setup_imm_df(const vec4_builder &bld, double v) -{ - const intel_device_info *devinfo = bld.shader->devinfo; - assert(devinfo->ver == 7); - - /* gfx7.5 does not support DF immediates straightforward but the DIM - * instruction allows to set the 64-bit immediate value. - */ - if (devinfo->verx10 == 75) { - const vec4_builder ubld = bld.exec_all(); - const dst_reg dst = bld.vgrf(BRW_REGISTER_TYPE_DF); - ubld.DIM(dst, brw_imm_df(v)); - return swizzle(src_reg(dst), BRW_SWIZZLE_XXXX); - } - - /* gfx7 does not support DF immediates */ - union { - double d; - struct { - uint32_t i1; - uint32_t i2; - }; - } di; - - di.d = v; - - /* Write the low 32-bit of the constant to the X:UD channel and the - * high 32-bit to the Y:UD channel to build the constant in a VGRF. - * We have to do this twice (offset 0 and offset 1), since a DF VGRF takes - * two SIMD8 registers in SIMD4x2 execution. Finally, return a swizzle - * XXXX so any access to the VGRF only reads the constant data in these - * channels. 
- */ - const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2); - for (unsigned n = 0; n < 2; n++) { - const vec4_builder ubld = bld.exec_all().group(4, n); - ubld.MOV(writemask(offset(tmp, 8, n), WRITEMASK_X), brw_imm_ud(di.i1)); - ubld.MOV(writemask(offset(tmp, 8, n), WRITEMASK_Y), brw_imm_ud(di.i2)); - } - - return swizzle(src_reg(retype(tmp, BRW_REGISTER_TYPE_DF)), BRW_SWIZZLE_XXXX); -} - -void -vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr) -{ - dst_reg reg; - - if (instr->def.bit_size == 64) { - reg = dst_reg(VGRF, alloc.allocate(2)); - reg.type = BRW_REGISTER_TYPE_DF; - } else { - reg = dst_reg(VGRF, alloc.allocate(1)); - reg.type = BRW_REGISTER_TYPE_D; - } - - const vec4_builder ibld = vec4_builder(this).at_end(); - unsigned remaining = brw_writemask_for_size(instr->def.num_components); - - /* @FIXME: consider emitting vector operations to save some MOVs in - * cases where the components are representable in 8 bits. - * For now, we emit a MOV for each distinct value. - */ - for (unsigned i = 0; i < instr->def.num_components; i++) { - unsigned writemask = 1 << i; - - if ((remaining & writemask) == 0) - continue; - - for (unsigned j = i; j < instr->def.num_components; j++) { - if ((instr->def.bit_size == 32 && - instr->value[i].u32 == instr->value[j].u32) || - (instr->def.bit_size == 64 && - instr->value[i].f64 == instr->value[j].f64)) { - writemask |= 1 << j; - } - } - - reg.writemask = writemask; - if (instr->def.bit_size == 64) { - emit(MOV(reg, setup_imm_df(ibld, instr->value[i].f64))); - } else { - emit(MOV(reg, brw_imm_d(instr->value[i].i32))); - } - - remaining &= ~writemask; - } - - /* Set final writemask */ - reg.writemask = brw_writemask_for_size(instr->def.num_components); - - nir_ssa_values[instr->def.index] = reg; -} - -src_reg -vec4_visitor::get_nir_ssbo_intrinsic_index(nir_intrinsic_instr *instr) -{ - /* SSBO stores are weird in that their index is in src[1] */ - const unsigned src = instr->intrinsic == nir_intrinsic_store_ssbo ? 
1 : 0; - - if (nir_src_is_const(instr->src[src])) { - return brw_imm_ud(nir_src_as_uint(instr->src[src])); - } else { - return emit_uniformize(get_nir_src(instr->src[src])); - } -} - -void -vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) -{ - dst_reg dest; - src_reg src; - - switch (instr->intrinsic) { - case nir_intrinsic_decl_reg: { - unsigned bit_size = nir_intrinsic_bit_size(instr); - unsigned array_elems = nir_intrinsic_num_array_elems(instr); - if (array_elems == 0) - array_elems = 1; - - const unsigned num_regs = array_elems * DIV_ROUND_UP(bit_size, 32); - dst_reg reg(VGRF, alloc.allocate(num_regs)); - if (bit_size == 64) - reg.type = BRW_REGISTER_TYPE_DF; - - nir_ssa_values[instr->def.index] = reg; - break; - } - - case nir_intrinsic_load_reg: - case nir_intrinsic_load_reg_indirect: - case nir_intrinsic_store_reg: - case nir_intrinsic_store_reg_indirect: - /* Nothing to do with these. */ - break; - - case nir_intrinsic_load_input: { - assert(instr->def.bit_size == 32); - /* We set EmitNoIndirectInput for VS */ - unsigned load_offset = nir_src_as_uint(instr->src[0]); - - dest = get_nir_def(instr->def); - - src = src_reg(ATTR, nir_intrinsic_base(instr) + load_offset, - glsl_uvec4_type()); - src = retype(src, dest.type); - - /* Swizzle source based on component layout qualifier */ - src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr)); - emit(MOV(dest, src)); - break; - } - - case nir_intrinsic_store_output: { - assert(nir_src_bit_size(instr->src[0]) == 32); - unsigned store_offset = nir_src_as_uint(instr->src[1]); - int varying = nir_intrinsic_base(instr) + store_offset; - src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, - instr->num_components); - - unsigned c = nir_intrinsic_component(instr); - output_reg[varying][c] = dst_reg(src); - output_num_components[varying][c] = instr->num_components; - break; - } - - case nir_intrinsic_get_ssbo_size: { - assert(nir_src_num_components(instr->src[0]) == 1); - unsigned ssbo_index = 
nir_src_is_const(instr->src[0]) ? - nir_src_as_uint(instr->src[0]) : 0; - - dst_reg result_dst = get_nir_def(instr->def); - vec4_instruction *inst = new(mem_ctx) - vec4_instruction(SHADER_OPCODE_GET_BUFFER_SIZE, result_dst); - - inst->base_mrf = 2; - inst->mlen = 1; /* always at least one */ - inst->src[1] = brw_imm_ud(ssbo_index); - - /* MRF for the first parameter */ - src_reg lod = brw_imm_d(0); - int param_base = inst->base_mrf; - int writemask = WRITEMASK_X; - emit(MOV(dst_reg(MRF, param_base, glsl_int_type(), writemask), lod)); - - emit(inst); - break; - } - - case nir_intrinsic_store_ssbo: { - assert(devinfo->ver == 7); - - /* brw_nir_lower_mem_access_bit_sizes takes care of this */ - assert(nir_src_bit_size(instr->src[0]) == 32); - assert(nir_intrinsic_write_mask(instr) == - (1u << instr->num_components) - 1); - - src_reg surf_index = get_nir_ssbo_intrinsic_index(instr); - src_reg offset_reg = retype(get_nir_src_imm(instr->src[2]), - BRW_REGISTER_TYPE_UD); - - /* Value */ - src_reg val_reg = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, 4); - - /* IvyBridge does not have a native SIMD4x2 untyped write message so untyped - * writes will use SIMD8 mode. In order to hide this and keep symmetry across - * typed and untyped messages and across hardware platforms, the - * current implementation of the untyped messages will transparently convert - * the SIMD4x2 payload into an equivalent SIMD8 payload by transposing it - * and enabling only channel X on the SEND instruction. - * - * The above, works well for full vector writes, but not for partial writes - * where we want to write some channels and not others, like when we have - * code such as v.xyw = vec3(1,2,4). 
Because the untyped write messages are - * quite restrictive with regards to the channel enables we can configure in - * the message descriptor (not all combinations are allowed) we cannot simply - * implement these scenarios with a single message while keeping the - * aforementioned symmetry in the implementation. For now we de decided that - * it is better to keep the symmetry to reduce complexity, so in situations - * such as the one described we end up emitting two untyped write messages - * (one for xy and another for w). - * - * The code below packs consecutive channels into a single write message, - * detects gaps in the vector write and if needed, sends a second message - * with the remaining channels. If in the future we decide that we want to - * emit a single message at the expense of losing the symmetry in the - * implementation we can: - * - * 1) For IvyBridge: Only use the red channel of the untyped write SIMD8 - * message payload. In this mode we can write up to 8 offsets and dwords - * to the red channel only (for the two vec4s in the SIMD4x2 execution) - * and select which of the 8 channels carry data to write by setting the - * appropriate writemask in the dst register of the SEND instruction. - * It would require to write a new generator opcode specifically for - * IvyBridge since we would need to prepare a SIMD8 payload that could - * use any channel, not just X. - * - * 2) For Haswell+: Simply send a single write message but set the writemask - * on the dst of the SEND instruction to select the channels we want to - * write. It would require to modify the current messages to receive - * and honor the writemask provided. 
- */ - const vec4_builder bld = vec4_builder(this).at_end() - .annotate(current_annotation, base_ir); - - emit_untyped_write(bld, surf_index, offset_reg, val_reg, - 1 /* dims */, instr->num_components /* size */, - BRW_PREDICATE_NONE); - break; - } - - case nir_intrinsic_load_ssbo: { - assert(devinfo->ver == 7); - - /* brw_nir_lower_mem_access_bit_sizes takes care of this */ - assert(instr->def.bit_size == 32); - - src_reg surf_index = get_nir_ssbo_intrinsic_index(instr); - src_reg offset_reg = retype(get_nir_src_imm(instr->src[1]), - BRW_REGISTER_TYPE_UD); - - /* Read the vector */ - const vec4_builder bld = vec4_builder(this).at_end() - .annotate(current_annotation, base_ir); - - src_reg read_result = emit_untyped_read(bld, surf_index, offset_reg, - 1 /* dims */, 4 /* size*/, - BRW_PREDICATE_NONE); - dst_reg dest = get_nir_def(instr->def); - read_result.type = dest.type; - read_result.swizzle = brw_swizzle_for_size(instr->num_components); - emit(MOV(dest, read_result)); - break; - } - - case nir_intrinsic_ssbo_atomic: - case nir_intrinsic_ssbo_atomic_swap: - nir_emit_ssbo_atomic(lsc_op_to_legacy_atomic(lsc_aop_for_nir_intrinsic(instr)), instr); - break; - - case nir_intrinsic_load_vertex_id: - unreachable("should be lowered by vertex_id_zero_based"); - - case nir_intrinsic_load_vertex_id_zero_base: - case nir_intrinsic_load_base_vertex: - case nir_intrinsic_load_instance_id: - case nir_intrinsic_load_base_instance: - case nir_intrinsic_load_draw_id: - case nir_intrinsic_load_invocation_id: - unreachable("should be lowered by brw_nir_lower_vs_inputs()"); - - case nir_intrinsic_load_uniform: { - /* Offsets are in bytes but they should always be multiples of 4 */ - assert(nir_intrinsic_base(instr) % 4 == 0); - - dest = get_nir_def(instr->def); - - src = src_reg(dst_reg(UNIFORM, nir_intrinsic_base(instr) / 16)); - src.type = dest.type; - - /* Uniforms don't actually have to be vec4 aligned. 
In the case that - * it isn't, we have to use a swizzle to shift things around. They - * do still have the std140 alignment requirement that vec2's have to - * be vec2-aligned and vec3's and vec4's have to be vec4-aligned. - * - * The swizzle also works in the indirect case as the generator adds - * the swizzle to the offset for us. - */ - const int type_size = type_sz(src.type); - unsigned shift = (nir_intrinsic_base(instr) % 16) / type_size; - assert(shift + instr->num_components <= 4); - - if (nir_src_is_const(instr->src[0])) { - const unsigned load_offset = nir_src_as_uint(instr->src[0]); - /* Offsets are in bytes but they should always be multiples of 4 */ - assert(load_offset % 4 == 0); - - src.swizzle = brw_swizzle_for_size(instr->num_components); - dest.writemask = brw_writemask_for_size(instr->num_components); - unsigned offset = load_offset + shift * type_size; - src.offset = ROUND_DOWN_TO(offset, 16); - shift = (offset % 16) / type_size; - assert(shift + instr->num_components <= 4); - src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift); - - emit(MOV(dest, src)); - } else { - /* Uniform arrays are vec4 aligned, because of std140 alignment - * rules. - */ - assert(shift == 0); - - src_reg indirect = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1); - - /* MOV_INDIRECT is going to stomp the whole thing anyway */ - dest.writemask = WRITEMASK_XYZW; - - emit(SHADER_OPCODE_MOV_INDIRECT, dest, src, - indirect, brw_imm_ud(nir_intrinsic_range(instr))); - } - break; - } - - case nir_intrinsic_load_ubo: { - src_reg surf_index; - - dest = get_nir_def(instr->def); - - if (nir_src_is_const(instr->src[0])) { - /* The block index is a constant, so just emit the binding table entry - * as an immediate. - */ - const unsigned index = nir_src_as_uint(instr->src[0]); - surf_index = brw_imm_ud(index); - } else { - /* The block index is not a constant. 
Evaluate the index expression - * per-channel and add the base UBO index; we have to select a value - * from any live channel. - */ - surf_index = src_reg(this, glsl_uint_type()); - emit(MOV(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int32, - instr->num_components))); - surf_index = emit_uniformize(surf_index); - } - - src_reg push_reg; - src_reg offset_reg; - if (nir_src_is_const(instr->src[1])) { - unsigned load_offset = nir_src_as_uint(instr->src[1]); - unsigned aligned_offset = load_offset & ~15; - offset_reg = brw_imm_ud(aligned_offset); - - /* See if we've selected this as a push constant candidate */ - if (nir_src_is_const(instr->src[0])) { - const unsigned ubo_block = nir_src_as_uint(instr->src[0]); - const unsigned offset_256b = aligned_offset / 32; - - for (int i = 0; i < 4; i++) { - const struct brw_ubo_range *range = &prog_data->base.ubo_ranges[i]; - if (range->block == ubo_block && - offset_256b >= range->start && - offset_256b < range->start + range->length) { - - push_reg = src_reg(dst_reg(UNIFORM, UBO_START + i)); - push_reg.type = dest.type; - push_reg.offset = aligned_offset - 32 * range->start; - break; - } - } - } - } else { - offset_reg = src_reg(this, glsl_uint_type()); - emit(MOV(dst_reg(offset_reg), - get_nir_src(instr->src[1], nir_type_uint32, 1))); - } - - src_reg packed_consts; - if (push_reg.file != BAD_FILE) { - packed_consts = push_reg; - } else if (instr->def.bit_size == 32) { - packed_consts = src_reg(this, glsl_vec4_type()); - emit_pull_constant_load_reg(dst_reg(packed_consts), - surf_index, - offset_reg, - NULL, NULL /* before_block/inst */); - prog_data->base.has_ubo_pull = true; - } else { - src_reg temp = src_reg(this, glsl_dvec4_type()); - src_reg temp_float = retype(temp, BRW_REGISTER_TYPE_F); - - emit_pull_constant_load_reg(dst_reg(temp_float), - surf_index, offset_reg, NULL, NULL); - if (offset_reg.file == IMM) - offset_reg.ud += 16; - else - emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16u))); - 
emit_pull_constant_load_reg(dst_reg(byte_offset(temp_float, REG_SIZE)), - surf_index, offset_reg, NULL, NULL); - prog_data->base.has_ubo_pull = true; - - packed_consts = src_reg(this, glsl_dvec4_type()); - shuffle_64bit_data(dst_reg(packed_consts), temp, false); - } - - packed_consts.swizzle = brw_swizzle_for_size(instr->num_components); - if (nir_src_is_const(instr->src[1])) { - unsigned load_offset = nir_src_as_uint(instr->src[1]); - unsigned type_size = type_sz(dest.type); - packed_consts.swizzle += - BRW_SWIZZLE4(load_offset % 16 / type_size, - load_offset % 16 / type_size, - load_offset % 16 / type_size, - load_offset % 16 / type_size); - } - - emit(MOV(dest, retype(packed_consts, dest.type))); - - break; - } - - case nir_intrinsic_barrier: { - if (nir_intrinsic_memory_scope(instr) == SCOPE_NONE) - break; - const vec4_builder bld = - vec4_builder(this).at_end().annotate(current_annotation, base_ir); - const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD); - vec4_instruction *fence = - bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp, brw_vec8_grf(0, 0)); - fence->sfid = GFX7_SFID_DATAPORT_DATA_CACHE; - break; - } - - case nir_intrinsic_shader_clock: { - /* We cannot do anything if there is an event, so ignore it for now */ - const src_reg shader_clock = get_timestamp(); - const enum brw_reg_type type = brw_type_for_base_type(glsl_uvec2_type()); - - dest = get_nir_def(instr->def, type); - emit(MOV(dest, shader_clock)); - break; - } - - default: - unreachable("Unknown intrinsic"); - } -} - -void -vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr) -{ - dst_reg dest; - if (nir_intrinsic_infos[instr->intrinsic].has_dest) - dest = get_nir_def(instr->def); - - src_reg surface = get_nir_ssbo_intrinsic_index(instr); - src_reg offset = get_nir_src(instr->src[1], 1); - src_reg data1; - if (op != BRW_AOP_INC && op != BRW_AOP_DEC && op != BRW_AOP_PREDEC) - data1 = get_nir_src(instr->src[2], 1); - src_reg data2; - if (op == BRW_AOP_CMPWR) - data2 = 
get_nir_src(instr->src[3], 1); - - /* Emit the actual atomic operation operation */ - const vec4_builder bld = - vec4_builder(this).at_end().annotate(current_annotation, base_ir); - - src_reg atomic_result = emit_untyped_atomic(bld, surface, offset, - data1, data2, - 1 /* dims */, 1 /* rsize */, - op, - BRW_PREDICATE_NONE); - dest.type = atomic_result.type; - bld.MOV(dest, atomic_result); -} - -static unsigned -brw_swizzle_for_nir_swizzle(uint8_t swizzle[4]) -{ - return BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]); -} - -bool -vec4_visitor::optimize_predicate(nir_alu_instr *instr, - enum brw_predicate *predicate) -{ - if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) - return false; - - nir_alu_instr *cmp_instr = - nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); - - switch (cmp_instr->op) { - case nir_op_b32any_fnequal2: - case nir_op_b32any_inequal2: - case nir_op_b32any_fnequal3: - case nir_op_b32any_inequal3: - case nir_op_b32any_fnequal4: - case nir_op_b32any_inequal4: - *predicate = BRW_PREDICATE_ALIGN16_ANY4H; - break; - case nir_op_b32all_fequal2: - case nir_op_b32all_iequal2: - case nir_op_b32all_fequal3: - case nir_op_b32all_iequal3: - case nir_op_b32all_fequal4: - case nir_op_b32all_iequal4: - *predicate = BRW_PREDICATE_ALIGN16_ALL4H; - break; - default: - return false; - } - - unsigned size_swizzle = - brw_swizzle_for_size(nir_op_infos[cmp_instr->op].input_sizes[0]); - - src_reg op[2]; - assert(nir_op_infos[cmp_instr->op].num_inputs == 2); - for (unsigned i = 0; i < 2; i++) { - nir_alu_type type = nir_op_infos[cmp_instr->op].input_types[i]; - unsigned bit_size = nir_src_bit_size(cmp_instr->src[i].src); - type = (nir_alu_type) (((unsigned) type) | bit_size); - op[i] = get_nir_src(cmp_instr->src[i].src, type, 4); - unsigned base_swizzle = - brw_swizzle_for_nir_swizzle(cmp_instr->src[i].swizzle); - op[i].swizzle = brw_compose_swizzle(size_swizzle, base_swizzle); - } - - emit(CMP(dst_null_d(), op[0], op[1], - 
brw_cmod_for_nir_comparison(cmp_instr->op))); - - return true; -} - -void -vec4_visitor::emit_conversion_from_double(dst_reg dst, src_reg src) -{ - enum opcode op; - switch (dst.type) { - case BRW_REGISTER_TYPE_D: - op = VEC4_OPCODE_DOUBLE_TO_D32; - break; - case BRW_REGISTER_TYPE_UD: - op = VEC4_OPCODE_DOUBLE_TO_U32; - break; - case BRW_REGISTER_TYPE_F: - op = VEC4_OPCODE_DOUBLE_TO_F32; - break; - default: - unreachable("Unknown conversion"); - } - - dst_reg temp = dst_reg(this, glsl_dvec4_type()); - emit(MOV(temp, src)); - dst_reg temp2 = dst_reg(this, glsl_dvec4_type()); - emit(op, temp2, src_reg(temp)); - - emit(VEC4_OPCODE_PICK_LOW_32BIT, retype(temp2, dst.type), src_reg(temp2)); - emit(MOV(dst, src_reg(retype(temp2, dst.type)))); -} - -void -vec4_visitor::emit_conversion_to_double(dst_reg dst, src_reg src) -{ - dst_reg tmp_dst = dst_reg(src_reg(this, glsl_dvec4_type())); - src_reg tmp_src = retype(src_reg(this, glsl_vec4_type()), src.type); - emit(MOV(dst_reg(tmp_src), src)); - emit(VEC4_OPCODE_TO_DOUBLE, tmp_dst, tmp_src); - emit(MOV(dst, src_reg(tmp_dst))); -} - -/** - * Try to use an immediate value for a source - * - * In cases of flow control, constant propagation is sometimes unable to - * determine that a register contains a constant value. To work around this, - * try to emit a literal as one of the sources. If \c try_src0_also is set, - * \c op[0] will also be tried for an immediate value. - * - * If \c op[0] is modified, the operands will be exchanged so that \c op[1] - * will always be the immediate value. - * - * \return The index of the source that was modified, 0 or 1, if successful. - * Otherwise, -1. - * - * \param op - Operands to the instruction - * \param try_src0_also - True if \c op[0] should also be a candidate for - * getting an immediate value. This should only be set - * for commutative operations. 
- */ -static int -try_immediate_source(const nir_alu_instr *instr, src_reg *op, - bool try_src0_also) -{ - unsigned idx; - - /* MOV should be the only single-source instruction passed to this - * function. Any other unary instruction with a constant source should - * have been constant-folded away! - */ - assert(nir_op_infos[instr->op].num_inputs > 1 || - instr->op == nir_op_mov); - - if (instr->op != nir_op_mov && - nir_src_bit_size(instr->src[1].src) == 32 && - nir_src_is_const(instr->src[1].src)) { - idx = 1; - } else if (try_src0_also && - nir_src_bit_size(instr->src[0].src) == 32 && - nir_src_is_const(instr->src[0].src)) { - idx = 0; - } else { - return -1; - } - - const enum brw_reg_type old_type = op[idx].type; - - switch (old_type) { - case BRW_REGISTER_TYPE_D: - case BRW_REGISTER_TYPE_UD: { - int first_comp = -1; - int d = 0; - - for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) { - if (nir_alu_instr_channel_used(instr, idx, i)) { - if (first_comp < 0) { - first_comp = i; - d = nir_src_comp_as_int(instr->src[idx].src, - instr->src[idx].swizzle[i]); - } else if (d != nir_src_comp_as_int(instr->src[idx].src, - instr->src[idx].swizzle[i])) { - return -1; - } - } - } - - assert(first_comp >= 0); - - if (op[idx].abs) - d = MAX2(-d, d); - - if (op[idx].negate) - d = -d; - - op[idx] = retype(src_reg(brw_imm_d(d)), old_type); - break; - } - - case BRW_REGISTER_TYPE_F: { - int first_comp = -1; - float f[NIR_MAX_VEC_COMPONENTS] = { 0.0f }; - bool is_scalar = true; - - for (unsigned i = 0; i < NIR_MAX_VEC_COMPONENTS; i++) { - if (nir_alu_instr_channel_used(instr, idx, i)) { - f[i] = nir_src_comp_as_float(instr->src[idx].src, - instr->src[idx].swizzle[i]); - if (first_comp < 0) { - first_comp = i; - } else if (f[first_comp] != f[i]) { - is_scalar = false; - } - } - } - - if (is_scalar) { - if (op[idx].abs) - f[first_comp] = fabs(f[first_comp]); - - if (op[idx].negate) - f[first_comp] = -f[first_comp]; - - op[idx] = src_reg(brw_imm_f(f[first_comp])); - 
assert(op[idx].type == old_type); - } else { - uint8_t vf_values[4] = { 0, 0, 0, 0 }; - - for (unsigned i = 0; i < ARRAY_SIZE(vf_values); i++) { - - if (op[idx].abs) - f[i] = fabs(f[i]); - - if (op[idx].negate) - f[i] = -f[i]; - - const int vf = brw_float_to_vf(f[i]); - if (vf == -1) - return -1; - - vf_values[i] = vf; - } - - op[idx] = src_reg(brw_imm_vf4(vf_values[0], vf_values[1], - vf_values[2], vf_values[3])); - } - break; - } - - default: - unreachable("Non-32bit type."); - } - - /* If the instruction has more than one source, the instruction format only - * allows source 1 to be an immediate value. If the immediate value was - * source 0, then the sources must be exchanged. - */ - if (idx == 0 && instr->op != nir_op_mov) { - src_reg tmp = op[0]; - op[0] = op[1]; - op[1] = tmp; - } - - return idx; -} - -void -vec4_visitor::fix_float_operands(src_reg op[3], nir_alu_instr *instr) -{ - bool fixed[3] = { false, false, false }; - - for (unsigned i = 0; i < 2; i++) { - if (!nir_src_is_const(instr->src[i].src)) - continue; - - for (unsigned j = i + 1; j < 3; j++) { - if (fixed[j]) - continue; - - if (!nir_src_is_const(instr->src[j].src)) - continue; - - if (nir_alu_srcs_equal(instr, instr, i, j)) { - if (!fixed[i]) - op[i] = fix_3src_operand(op[i]); - - op[j] = op[i]; - - fixed[i] = true; - fixed[j] = true; - } else if (nir_alu_srcs_negative_equal(instr, instr, i, j)) { - if (!fixed[i]) - op[i] = fix_3src_operand(op[i]); - - op[j] = op[i]; - op[j].negate = !op[j].negate; - - fixed[i] = true; - fixed[j] = true; - } - } - } - - for (unsigned i = 0; i < 3; i++) { - if (!fixed[i]) - op[i] = fix_3src_operand(op[i]); - } -} - -static bool -const_src_fits_in_16_bits(const nir_src &src, brw_reg_type type) -{ - assert(nir_src_is_const(src)); - if (brw_reg_type_is_unsigned_integer(type)) { - return nir_src_comp_as_uint(src, 0) <= UINT16_MAX; - } else { - const int64_t c = nir_src_comp_as_int(src, 0); - return c <= INT16_MAX && c >= INT16_MIN; - } -} - -void 
-vec4_visitor::nir_emit_alu(nir_alu_instr *instr) -{ - vec4_instruction *inst; - - nir_alu_type dst_type = (nir_alu_type) (nir_op_infos[instr->op].output_type | - instr->def.bit_size); - dst_reg dst = get_nir_def(instr->def, dst_type); - dst.writemask &= nir_component_mask(instr->def.num_components); - - src_reg op[4]; - for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { - nir_alu_type src_type = (nir_alu_type) - (nir_op_infos[instr->op].input_types[i] | - nir_src_bit_size(instr->src[i].src)); - op[i] = get_nir_src(instr->src[i].src, src_type, 4); - op[i].swizzle = brw_swizzle_for_nir_swizzle(instr->src[i].swizzle); - } - -#ifndef NDEBUG - /* On Gen7 and earlier, no functionality is exposed that should allow 8-bit - * integer types to ever exist. - */ - for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) - assert(type_sz(op[i].type) > 1); -#endif - - switch (instr->op) { - case nir_op_mov: - try_immediate_source(instr, &op[0], true); - inst = emit(MOV(dst, op[0])); - break; - - case nir_op_vec2: - case nir_op_vec3: - case nir_op_vec4: - unreachable("not reached: should be handled by lower_vec_to_movs()"); - - case nir_op_i2f32: - case nir_op_u2f32: - inst = emit(MOV(dst, op[0])); - break; - - case nir_op_f2f32: - case nir_op_f2i32: - case nir_op_f2u32: - if (nir_src_bit_size(instr->src[0].src) == 64) - emit_conversion_from_double(dst, op[0]); - else - inst = emit(MOV(dst, op[0])); - break; - - case nir_op_f2f64: - case nir_op_i2f64: - case nir_op_u2f64: - emit_conversion_to_double(dst, op[0]); - break; - - case nir_op_fsat: - inst = emit(MOV(dst, op[0])); - inst->saturate = true; - break; - - case nir_op_fneg: - case nir_op_ineg: - op[0].negate = true; - inst = emit(MOV(dst, op[0])); - break; - - case nir_op_fabs: - case nir_op_iabs: - op[0].negate = false; - op[0].abs = true; - inst = emit(MOV(dst, op[0])); - break; - - case nir_op_iadd: - assert(instr->def.bit_size < 64); - FALLTHROUGH; - case nir_op_fadd: - 
try_immediate_source(instr, op, true); - inst = emit(ADD(dst, op[0], op[1])); - break; - - case nir_op_uadd_sat: - assert(instr->def.bit_size < 64); - inst = emit(ADD(dst, op[0], op[1])); - inst->saturate = true; - break; - - case nir_op_fmul: - try_immediate_source(instr, op, true); - inst = emit(MUL(dst, op[0], op[1])); - break; - - case nir_op_imul: { - assert(instr->def.bit_size < 64); - - /* For integer multiplication, the MUL uses the low 16 bits of one of - * the operands (src0 through SNB, src1 on IVB and later). The MACH - * accumulates in the contribution of the upper 16 bits of that - * operand. If we can determine that one of the args is in the low - * 16 bits, though, we can just emit a single MUL. - */ - if (nir_src_is_const(instr->src[0].src) && - nir_alu_instr_src_read_mask(instr, 0) == 1 && - const_src_fits_in_16_bits(instr->src[0].src, op[0].type)) { - if (devinfo->ver < 7) - emit(MUL(dst, op[0], op[1])); - else - emit(MUL(dst, op[1], op[0])); - } else if (nir_src_is_const(instr->src[1].src) && - nir_alu_instr_src_read_mask(instr, 1) == 1 && - const_src_fits_in_16_bits(instr->src[1].src, op[1].type)) { - if (devinfo->ver < 7) - emit(MUL(dst, op[1], op[0])); - else - emit(MUL(dst, op[0], op[1])); - } else { - struct brw_reg acc = retype(brw_acc_reg(8), dst.type); - - emit(MUL(acc, op[0], op[1])); - emit(MACH(dst_null_d(), op[0], op[1])); - emit(MOV(dst, src_reg(acc))); - } - break; - } - - case nir_op_imul_high: - case nir_op_umul_high: { - assert(instr->def.bit_size < 64); - struct brw_reg acc = retype(brw_acc_reg(8), dst.type); - - emit(MUL(acc, op[0], op[1])); - emit(MACH(dst, op[0], op[1])); - break; - } - - case nir_op_frcp: - inst = emit_math(SHADER_OPCODE_RCP, dst, op[0]); - break; - - case nir_op_fexp2: - inst = emit_math(SHADER_OPCODE_EXP2, dst, op[0]); - break; - - case nir_op_flog2: - inst = emit_math(SHADER_OPCODE_LOG2, dst, op[0]); - break; - - case nir_op_fsin: - inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]); - break; - - case 
nir_op_fcos: - inst = emit_math(SHADER_OPCODE_COS, dst, op[0]); - break; - - case nir_op_idiv: - case nir_op_udiv: - assert(instr->def.bit_size < 64); - emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, op[0], op[1]); - break; - - case nir_op_umod: - case nir_op_irem: - /* According to the sign table for INT DIV in the Ivy Bridge PRM, it - * appears that our hardware just does the right thing for signed - * remainder. - */ - assert(instr->def.bit_size < 64); - emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]); - break; - - case nir_op_imod: { - /* Get a regular C-style remainder. If a % b == 0, set the predicate. */ - inst = emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]); - - /* Math instructions don't support conditional mod */ - inst = emit(MOV(dst_null_d(), src_reg(dst))); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - - /* Now, we need to determine if signs of the sources are different. - * When we XOR the sources, the top bit is 0 if they are the same and 1 - * if they are different. We can then use a conditional modifier to - * turn that into a predicate. This leads us to an XOR.l instruction. - * - * Technically, according to the PRM, you're not allowed to use .l on a - * XOR instruction. However, empirical experiments and Curro's reading - * of the simulator source both indicate that it's safe. - */ - src_reg tmp = src_reg(this, glsl_ivec4_type()); - inst = emit(XOR(dst_reg(tmp), op[0], op[1])); - inst->predicate = BRW_PREDICATE_NORMAL; - inst->conditional_mod = BRW_CONDITIONAL_L; - - /* If the result of the initial remainder operation is non-zero and the - * two sources have different signs, add in a copy of op[1] to get the - * final integer modulus value. 
- */ - inst = emit(ADD(dst, src_reg(dst), op[1])); - inst->predicate = BRW_PREDICATE_NORMAL; - break; - } - - case nir_op_ldexp: - unreachable("not reached: should be handled by ldexp_to_arith()"); - - case nir_op_fsqrt: - inst = emit_math(SHADER_OPCODE_SQRT, dst, op[0]); - break; - - case nir_op_frsq: - inst = emit_math(SHADER_OPCODE_RSQ, dst, op[0]); - break; - - case nir_op_fpow: - inst = emit_math(SHADER_OPCODE_POW, dst, op[0], op[1]); - break; - - case nir_op_uadd_carry: { - assert(instr->def.bit_size < 64); - struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD); - - emit(ADDC(dst_null_ud(), op[0], op[1])); - emit(MOV(dst, src_reg(acc))); - break; - } - - case nir_op_usub_borrow: { - assert(instr->def.bit_size < 64); - struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD); - - emit(SUBB(dst_null_ud(), op[0], op[1])); - emit(MOV(dst, src_reg(acc))); - break; - } - - case nir_op_ftrunc: - inst = emit(RNDZ(dst, op[0])); - if (devinfo->ver < 6) { - inst->conditional_mod = BRW_CONDITIONAL_R; - inst = emit(ADD(dst, src_reg(dst), brw_imm_f(1.0f))); - inst->predicate = BRW_PREDICATE_NORMAL; - inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */ - } - break; - - case nir_op_fceil: { - src_reg tmp = src_reg(this, glsl_float_type()); - tmp.swizzle = brw_swizzle_for_size(nir_src_num_components(instr->src[0].src)); - - op[0].negate = !op[0].negate; - emit(RNDD(dst_reg(tmp), op[0])); - tmp.negate = true; - inst = emit(MOV(dst, tmp)); - break; - } - - case nir_op_ffloor: - inst = emit(RNDD(dst, op[0])); - break; - - case nir_op_ffract: - inst = emit(FRC(dst, op[0])); - break; - - case nir_op_fround_even: - inst = emit(RNDE(dst, op[0])); - if (devinfo->ver < 6) { - inst->conditional_mod = BRW_CONDITIONAL_R; - inst = emit(ADD(dst, src_reg(dst), brw_imm_f(1.0f))); - inst->predicate = BRW_PREDICATE_NORMAL; - inst = emit(MOV(dst, src_reg(dst))); /* for potential saturation */ - } - break; - - case nir_op_fquantize2f16: { - /* See also 
vec4_visitor::emit_pack_half_2x16() */ - src_reg tmp16 = src_reg(this, glsl_uvec4_type()); - src_reg tmp32 = src_reg(this, glsl_vec4_type()); - src_reg zero = src_reg(this, glsl_vec4_type()); - - /* Check for denormal */ - src_reg abs_src0 = op[0]; - abs_src0.abs = true; - emit(CMP(dst_null_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)), - BRW_CONDITIONAL_L)); - /* Get the appropriately signed zero */ - emit(AND(retype(dst_reg(zero), BRW_REGISTER_TYPE_UD), - retype(op[0], BRW_REGISTER_TYPE_UD), - brw_imm_ud(0x80000000))); - /* Do the actual F32 -> F16 -> F32 conversion */ - emit(F32TO16(dst_reg(tmp16), op[0])); - emit(F16TO32(dst_reg(tmp32), tmp16)); - /* Select that or zero based on normal status */ - inst = emit(BRW_OPCODE_SEL, dst, zero, tmp32); - inst->predicate = BRW_PREDICATE_NORMAL; - break; - } - - case nir_op_imin: - case nir_op_umin: - assert(instr->def.bit_size < 64); - FALLTHROUGH; - case nir_op_fmin: - try_immediate_source(instr, op, true); - inst = emit_minmax(BRW_CONDITIONAL_L, dst, op[0], op[1]); - break; - - case nir_op_imax: - case nir_op_umax: - assert(instr->def.bit_size < 64); - FALLTHROUGH; - case nir_op_fmax: - try_immediate_source(instr, op, true); - inst = emit_minmax(BRW_CONDITIONAL_GE, dst, op[0], op[1]); - break; - - case nir_op_fddx: - case nir_op_fddx_coarse: - case nir_op_fddx_fine: - case nir_op_fddy: - case nir_op_fddy_coarse: - case nir_op_fddy_fine: - unreachable("derivatives are not valid in vertex shaders"); - - case nir_op_ilt32: - case nir_op_ult32: - case nir_op_ige32: - case nir_op_uge32: - case nir_op_ieq32: - case nir_op_ine32: - assert(instr->def.bit_size < 64); - FALLTHROUGH; - case nir_op_flt32: - case nir_op_fge32: - case nir_op_feq32: - case nir_op_fneu32: { - enum brw_conditional_mod conditional_mod = - brw_cmod_for_nir_comparison(instr->op); - - if (nir_src_bit_size(instr->src[0].src) < 64) { - /* If the order of the sources is changed due to an immediate value, - * then the condition must also be changed. 
- */ - if (try_immediate_source(instr, op, true) == 0) - conditional_mod = brw_swap_cmod(conditional_mod); - - emit(CMP(dst, op[0], op[1], conditional_mod)); - } else { - /* Produce a 32-bit boolean result from the DF comparison by selecting - * only the low 32-bit in each DF produced. Do this in a temporary - * so we can then move from there to the result using align16 again - * to honor the original writemask. - */ - dst_reg temp = dst_reg(this, glsl_dvec4_type()); - emit(CMP(temp, op[0], op[1], conditional_mod)); - dst_reg result = dst_reg(this, glsl_bvec4_type()); - emit(VEC4_OPCODE_PICK_LOW_32BIT, result, src_reg(temp)); - emit(MOV(dst, src_reg(result))); - } - break; - } - - case nir_op_b32all_iequal2: - case nir_op_b32all_iequal3: - case nir_op_b32all_iequal4: - assert(instr->def.bit_size < 64); - FALLTHROUGH; - case nir_op_b32all_fequal2: - case nir_op_b32all_fequal3: - case nir_op_b32all_fequal4: { - unsigned swiz = - brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]); - - emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz), - brw_cmod_for_nir_comparison(instr->op))); - emit(MOV(dst, brw_imm_d(0))); - inst = emit(MOV(dst, brw_imm_d(~0))); - inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H; - break; - } - - case nir_op_b32any_inequal2: - case nir_op_b32any_inequal3: - case nir_op_b32any_inequal4: - assert(instr->def.bit_size < 64); - FALLTHROUGH; - case nir_op_b32any_fnequal2: - case nir_op_b32any_fnequal3: - case nir_op_b32any_fnequal4: { - unsigned swiz = - brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]); - - emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz), - brw_cmod_for_nir_comparison(instr->op))); - - emit(MOV(dst, brw_imm_d(0))); - inst = emit(MOV(dst, brw_imm_d(~0))); - inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H; - break; - } - - case nir_op_inot: - assert(instr->def.bit_size < 64); - emit(NOT(dst, op[0])); - break; - - case nir_op_ixor: - assert(instr->def.bit_size < 64); - 
try_immediate_source(instr, op, true); - emit(XOR(dst, op[0], op[1])); - break; - - case nir_op_ior: - assert(instr->def.bit_size < 64); - try_immediate_source(instr, op, true); - emit(OR(dst, op[0], op[1])); - break; - - case nir_op_iand: - assert(instr->def.bit_size < 64); - try_immediate_source(instr, op, true); - emit(AND(dst, op[0], op[1])); - break; - - case nir_op_b2i32: - case nir_op_b2f32: - case nir_op_b2f64: - if (instr->def.bit_size > 32) { - assert(dst.type == BRW_REGISTER_TYPE_DF); - emit_conversion_to_double(dst, negate(op[0])); - } else { - emit(MOV(dst, negate(op[0]))); - } - break; - - case nir_op_unpack_half_2x16_split_x: - case nir_op_unpack_half_2x16_split_y: - case nir_op_pack_half_2x16_split: - unreachable("not reached: should not occur in vertex shader"); - - case nir_op_unpack_snorm_2x16: - case nir_op_unpack_unorm_2x16: - case nir_op_pack_snorm_2x16: - case nir_op_pack_unorm_2x16: - unreachable("not reached: should be handled by lower_packing_builtins"); - - case nir_op_pack_uvec4_to_uint: - unreachable("not reached"); - - case nir_op_pack_uvec2_to_uint: { - dst_reg tmp1 = dst_reg(this, glsl_uint_type()); - tmp1.writemask = WRITEMASK_X; - op[0].swizzle = BRW_SWIZZLE_YYYY; - emit(SHL(tmp1, op[0], src_reg(brw_imm_ud(16u)))); - - dst_reg tmp2 = dst_reg(this, glsl_uint_type()); - tmp2.writemask = WRITEMASK_X; - op[0].swizzle = BRW_SWIZZLE_XXXX; - emit(AND(tmp2, op[0], src_reg(brw_imm_ud(0xffffu)))); - - emit(OR(dst, src_reg(tmp1), src_reg(tmp2))); - break; - } - - case nir_op_pack_64_2x32_split: { - dst_reg result = dst_reg(this, glsl_dvec4_type()); - dst_reg tmp = dst_reg(this, glsl_uvec4_type()); - emit(MOV(tmp, retype(op[0], BRW_REGISTER_TYPE_UD))); - emit(VEC4_OPCODE_SET_LOW_32BIT, result, src_reg(tmp)); - emit(MOV(tmp, retype(op[1], BRW_REGISTER_TYPE_UD))); - emit(VEC4_OPCODE_SET_HIGH_32BIT, result, src_reg(tmp)); - emit(MOV(dst, src_reg(result))); - break; - } - - case nir_op_unpack_64_2x32_split_x: - case nir_op_unpack_64_2x32_split_y: 
{ - enum opcode oper = (instr->op == nir_op_unpack_64_2x32_split_x) ? - VEC4_OPCODE_PICK_LOW_32BIT : VEC4_OPCODE_PICK_HIGH_32BIT; - dst_reg tmp = dst_reg(this, glsl_dvec4_type()); - emit(MOV(tmp, op[0])); - dst_reg tmp2 = dst_reg(this, glsl_uvec4_type()); - emit(oper, tmp2, src_reg(tmp)); - emit(MOV(dst, src_reg(tmp2))); - break; - } - - case nir_op_unpack_half_2x16: - /* As NIR does not guarantee that we have a correct swizzle outside the - * boundaries of a vector, and the implementation of emit_unpack_half_2x16 - * uses the source operand in an operation with WRITEMASK_Y while our - * source operand has only size 1, it accessed incorrect data producing - * regressions in Piglit. We repeat the swizzle of the first component on the - * rest of components to avoid regressions. In the vec4_visitor IR code path - * this is not needed because the operand has already the correct swizzle. - */ - op[0].swizzle = brw_compose_swizzle(BRW_SWIZZLE_XXXX, op[0].swizzle); - emit_unpack_half_2x16(dst, op[0]); - break; - - case nir_op_pack_half_2x16: - emit_pack_half_2x16(dst, op[0]); - break; - - case nir_op_unpack_unorm_4x8: - assert(instr->def.bit_size < 64); - emit_unpack_unorm_4x8(dst, op[0]); - break; - - case nir_op_pack_unorm_4x8: - assert(instr->def.bit_size < 64); - emit_pack_unorm_4x8(dst, op[0]); - break; - - case nir_op_unpack_snorm_4x8: - assert(instr->def.bit_size < 64); - emit_unpack_snorm_4x8(dst, op[0]); - break; - - case nir_op_pack_snorm_4x8: - assert(instr->def.bit_size < 64); - emit_pack_snorm_4x8(dst, op[0]); - break; - - case nir_op_bitfield_reverse: - assert(instr->def.bit_size == 32); - assert(nir_src_bit_size(instr->src[0].src) == 32); - emit(BFREV(dst, op[0])); - break; - - case nir_op_bit_count: - assert(instr->def.bit_size == 32); - assert(nir_src_bit_size(instr->src[0].src) < 64); - emit(CBIT(dst, op[0])); - break; - - case nir_op_ifind_msb: { - assert(instr->def.bit_size == 32); - assert(nir_src_bit_size(instr->src[0].src) == 32); - 
assert(devinfo->ver >= 7); - - vec4_builder bld = vec4_builder(this).at_end(); - src_reg src(dst); - - emit(FBH(retype(dst, BRW_REGISTER_TYPE_UD), op[0])); - - /* FBH counts from the MSB side, while GLSL's findMSB() wants the count - * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then - * subtract the result from 31 to convert the MSB count into an LSB - * count. - */ - bld.CMP(dst_null_d(), src, brw_imm_d(-1), BRW_CONDITIONAL_NZ); - - inst = bld.ADD(dst, src, brw_imm_d(31)); - inst->predicate = BRW_PREDICATE_NORMAL; - inst->src[0].negate = true; - break; - } - - case nir_op_uclz: - assert(instr->def.bit_size == 32); - assert(nir_src_bit_size(instr->src[0].src) == 32); - emit(LZD(dst, op[0])); - break; - - case nir_op_find_lsb: - assert(instr->def.bit_size == 32); - assert(nir_src_bit_size(instr->src[0].src) == 32); - assert(devinfo->ver >= 7); - emit(FBL(dst, op[0])); - break; - - case nir_op_ubitfield_extract: - case nir_op_ibitfield_extract: - unreachable("should have been lowered"); - case nir_op_ubfe: - case nir_op_ibfe: - assert(instr->def.bit_size < 64); - op[0] = fix_3src_operand(op[0]); - op[1] = fix_3src_operand(op[1]); - op[2] = fix_3src_operand(op[2]); - - emit(BFE(dst, op[2], op[1], op[0])); - break; - - case nir_op_bfm: - assert(instr->def.bit_size < 64); - emit(BFI1(dst, op[0], op[1])); - break; - - case nir_op_bfi: - assert(instr->def.bit_size < 64); - op[0] = fix_3src_operand(op[0]); - op[1] = fix_3src_operand(op[1]); - op[2] = fix_3src_operand(op[2]); - - emit(BFI2(dst, op[0], op[1], op[2])); - break; - - case nir_op_bitfield_insert: - unreachable("not reached: should have been lowered"); - - case nir_op_fsign: - if (type_sz(op[0].type) < 8) { - /* AND(val, 0x80000000) gives the sign bit. - * - * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not - * zero. 
- */ - emit(CMP(dst_null_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ)); - - op[0].type = BRW_REGISTER_TYPE_UD; - dst.type = BRW_REGISTER_TYPE_UD; - emit(AND(dst, op[0], brw_imm_ud(0x80000000u))); - - inst = emit(OR(dst, src_reg(dst), brw_imm_ud(0x3f800000u))); - inst->predicate = BRW_PREDICATE_NORMAL; - dst.type = BRW_REGISTER_TYPE_F; - } else { - /* For doubles we do the same but we need to consider: - * - * - We use a MOV with conditional_mod instead of a CMP so that we can - * skip loading a 0.0 immediate. We use a source modifier on the - * source of the MOV so that we flush denormalized values to 0. - * Since we want to compare against 0, this won't alter the result. - * - We need to extract the high 32-bit of each DF where the sign - * is stored. - * - We need to produce a DF result. - */ - - /* Check for zero */ - src_reg value = op[0]; - value.abs = true; - inst = emit(MOV(dst_null_df(), value)); - inst->conditional_mod = BRW_CONDITIONAL_NZ; - - /* AND each high 32-bit channel with 0x80000000u */ - dst_reg tmp = dst_reg(this, glsl_uvec4_type()); - emit(VEC4_OPCODE_PICK_HIGH_32BIT, tmp, op[0]); - emit(AND(tmp, src_reg(tmp), brw_imm_ud(0x80000000u))); - - /* Add 1.0 to each channel, predicated to skip the cases where the - * channel's value was 0 - */ - inst = emit(OR(tmp, src_reg(tmp), brw_imm_ud(0x3f800000u))); - inst->predicate = BRW_PREDICATE_NORMAL; - - /* Now convert the result from float to double */ - emit_conversion_to_double(dst, retype(src_reg(tmp), - BRW_REGISTER_TYPE_F)); - } - break; - - case nir_op_ishl: - assert(instr->def.bit_size < 64); - try_immediate_source(instr, op, false); - emit(SHL(dst, op[0], op[1])); - break; - - case nir_op_ishr: - assert(instr->def.bit_size < 64); - try_immediate_source(instr, op, false); - emit(ASR(dst, op[0], op[1])); - break; - - case nir_op_ushr: - assert(instr->def.bit_size < 64); - try_immediate_source(instr, op, false); - emit(SHR(dst, op[0], op[1])); - break; - - case nir_op_ffma: - if 
(type_sz(dst.type) == 8) { - dst_reg mul_dst = dst_reg(this, glsl_dvec4_type()); - emit(MUL(mul_dst, op[1], op[0])); - inst = emit(ADD(dst, src_reg(mul_dst), op[2])); - } else { - fix_float_operands(op, instr); - inst = emit(MAD(dst, op[2], op[1], op[0])); - } - break; - - case nir_op_flrp: - fix_float_operands(op, instr); - inst = emit(LRP(dst, op[2], op[1], op[0])); - break; - - case nir_op_b32csel: - enum brw_predicate predicate; - if (!optimize_predicate(instr, &predicate)) { - emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ)); - switch (dst.writemask) { - case WRITEMASK_X: - predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X; - break; - case WRITEMASK_Y: - predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y; - break; - case WRITEMASK_Z: - predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z; - break; - case WRITEMASK_W: - predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W; - break; - default: - predicate = BRW_PREDICATE_NORMAL; - break; - } - } - inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]); - inst->predicate = predicate; - break; - - case nir_op_fdot2_replicated: - try_immediate_source(instr, op, true); - inst = emit(BRW_OPCODE_DP2, dst, op[0], op[1]); - break; - - case nir_op_fdot3_replicated: - try_immediate_source(instr, op, true); - inst = emit(BRW_OPCODE_DP3, dst, op[0], op[1]); - break; - - case nir_op_fdot4_replicated: - try_immediate_source(instr, op, true); - inst = emit(BRW_OPCODE_DP4, dst, op[0], op[1]); - break; - - case nir_op_fdph_replicated: - try_immediate_source(instr, op, false); - inst = emit(BRW_OPCODE_DPH, dst, op[0], op[1]); - break; - - case nir_op_fdiv: - unreachable("not reached: should be lowered by lower_fdiv in the compiler"); - - case nir_op_fmod: - unreachable("not reached: should be lowered by lower_fmod in the compiler"); - - case nir_op_fsub: - case nir_op_isub: - unreachable("not reached: should be handled by ir_sub_to_add_neg"); - - default: - unreachable("Unimplemented ALU operation"); - } - - /* If we need to do a boolean 
resolve, replace the result with -(x & 1) - * to sign extend the low bit to 0/~0 - */ - if (devinfo->ver <= 5 && - (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == - BRW_NIR_BOOLEAN_NEEDS_RESOLVE) { - dst_reg masked = dst_reg(this, glsl_int_type()); - masked.writemask = dst.writemask; - emit(AND(masked, src_reg(dst), brw_imm_d(1))); - src_reg masked_neg = src_reg(masked); - masked_neg.negate = true; - emit(MOV(retype(dst, BRW_REGISTER_TYPE_D), masked_neg)); - } -} - -void -vec4_visitor::nir_emit_jump(nir_jump_instr *instr) -{ - switch (instr->type) { - case nir_jump_break: - emit(BRW_OPCODE_BREAK); - break; - - case nir_jump_continue: - emit(BRW_OPCODE_CONTINUE); - break; - - case nir_jump_return: - FALLTHROUGH; - default: - unreachable("unknown jump"); - } -} - -static bool -is_high_sampler(const struct intel_device_info *devinfo, src_reg sampler) -{ - if (devinfo->verx10 != 75) - return false; - - return sampler.file != IMM || sampler.ud >= 16; -} - -void -vec4_visitor::nir_emit_texture(nir_tex_instr *instr) -{ - unsigned texture = instr->texture_index; - unsigned sampler = instr->sampler_index; - src_reg texture_reg = brw_imm_ud(texture); - src_reg sampler_reg = brw_imm_ud(sampler); - src_reg coordinate; - const glsl_type *coord_type = NULL; - src_reg shadow_comparator; - src_reg offset_value; - src_reg lod, lod2; - src_reg sample_index; - src_reg mcs; - - dst_reg dest = get_nir_def(instr->def, instr->dest_type); - - /* The hardware requires a LOD for buffer textures */ - if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) - lod = brw_imm_d(0); - - /* Load the texture operation sources */ - uint32_t constant_offset = 0; - for (unsigned i = 0; i < instr->num_srcs; i++) { - switch (instr->src[i].src_type) { - case nir_tex_src_comparator: - shadow_comparator = get_nir_src(instr->src[i].src, - BRW_REGISTER_TYPE_F, 1); - break; - - case nir_tex_src_coord: { - unsigned src_size = nir_tex_instr_src_size(instr, i); - - switch (instr->op) { - case nir_texop_txf: - case 
nir_texop_txf_ms: - case nir_texop_samples_identical: - coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, - src_size); - coord_type = glsl_ivec_type(src_size); - break; - - default: - coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, - src_size); - coord_type = glsl_vec_type(src_size); - break; - } - break; - } - - case nir_tex_src_ddx: - lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, - nir_tex_instr_src_size(instr, i)); - break; - - case nir_tex_src_ddy: - lod2 = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, - nir_tex_instr_src_size(instr, i)); - break; - - case nir_tex_src_lod: - switch (instr->op) { - case nir_texop_txs: - case nir_texop_txf: - lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1); - break; - - default: - lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, 1); - break; - } - break; - - case nir_tex_src_ms_index: { - sample_index = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1); - break; - } - - case nir_tex_src_offset: - if (!brw_texture_offset(instr, i, &constant_offset)) { - offset_value = - get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2); - } - break; - - case nir_tex_src_texture_offset: { - assert(texture_reg.is_zero()); - texture_reg = emit_uniformize(get_nir_src(instr->src[i].src, - BRW_REGISTER_TYPE_UD, 1)); - break; - } - - case nir_tex_src_sampler_offset: { - assert(sampler_reg.is_zero()); - sampler_reg = emit_uniformize(get_nir_src(instr->src[i].src, - BRW_REGISTER_TYPE_UD, 1)); - break; - } - - case nir_tex_src_projector: - unreachable("Should be lowered by nir_lower_tex"); - - case nir_tex_src_bias: - unreachable("LOD bias is not valid for vertex shaders.\n"); - - default: - unreachable("unknown texture source"); - } - } - - if (instr->op == nir_texop_txf_ms || - instr->op == nir_texop_samples_identical) { - assert(coord_type != NULL); - if (devinfo->ver >= 7) { - mcs = emit_mcs_fetch(coord_type, coordinate, texture_reg); - } else { - mcs = brw_imm_ud(0u); - } 
- } - - /* Stuff the channel select bits in the top of the texture offset */ - if (instr->op == nir_texop_tg4) { - if (instr->component == 1 && - (key_tex->gather_channel_quirk_mask & (1 << texture))) { - /* gather4 sampler is broken for green channel on RG32F -- - * we must ask for blue instead. - */ - constant_offset |= 2 << 16; - } else { - constant_offset |= instr->component << 16; - } - } - - enum opcode opcode; - switch (instr->op) { - case nir_texop_tex: opcode = SHADER_OPCODE_TXL; break; - case nir_texop_txl: opcode = SHADER_OPCODE_TXL; break; - case nir_texop_txd: opcode = SHADER_OPCODE_TXD; break; - case nir_texop_txf: opcode = SHADER_OPCODE_TXF; break; - case nir_texop_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break; - case nir_texop_txs: opcode = SHADER_OPCODE_TXS; break; - case nir_texop_query_levels: opcode = SHADER_OPCODE_TXS; break; - case nir_texop_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break; - case nir_texop_tg4: - opcode = offset_value.file != BAD_FILE ? SHADER_OPCODE_TG4_OFFSET - : SHADER_OPCODE_TG4; - break; - case nir_texop_samples_identical: { - /* There are some challenges implementing this for vec4, and it seems - * unlikely to be used anyway. For now, just return false ways. - */ - emit(MOV(dest, brw_imm_ud(0u))); - return; - } - case nir_texop_txb: - case nir_texop_lod: - unreachable("Implicit LOD is only valid inside fragment shaders."); - default: - unreachable("Unrecognized tex op"); - } - - vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest); - - inst->offset = constant_offset; - - /* The message header is necessary for: - * - Gfx4 (always) - * - Texel offsets - * - Gather channel selection - * - Sampler indices too large to fit in a 4-bit value. 
- * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal - */ - inst->header_size = - (devinfo->ver < 5 || - inst->offset != 0 || - opcode == SHADER_OPCODE_TG4 || - opcode == SHADER_OPCODE_TG4_OFFSET || - opcode == SHADER_OPCODE_SAMPLEINFO || - is_high_sampler(devinfo, sampler_reg)) ? 1 : 0; - inst->base_mrf = 2; - inst->mlen = inst->header_size; - inst->dst.writemask = WRITEMASK_XYZW; - inst->shadow_compare = shadow_comparator.file != BAD_FILE; - - inst->src[1] = texture_reg; - inst->src[2] = sampler_reg; - - /* MRF for the first parameter */ - int param_base = inst->base_mrf + inst->header_size; - - if (opcode == SHADER_OPCODE_TXS) { - int writemask = devinfo->ver == 4 ? WRITEMASK_W : WRITEMASK_X; - emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod)); - inst->mlen++; - } else if (opcode == SHADER_OPCODE_SAMPLEINFO) { - inst->dst.writemask = WRITEMASK_X; - } else { - /* Load the coordinate */ - /* FINISHME: gl_clamp_mask and saturate */ - int coord_mask = (1 << instr->coord_components) - 1; - int zero_mask = 0xf & ~coord_mask; - - emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask), - coordinate)); - inst->mlen++; - - if (zero_mask != 0) { - emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask), - brw_imm_d(0))); - } - /* Load the shadow comparator */ - if (shadow_comparator.file != BAD_FILE && - opcode != SHADER_OPCODE_TXD && - opcode != SHADER_OPCODE_TG4_OFFSET) { - emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type, - WRITEMASK_X), - shadow_comparator)); - inst->mlen++; - } - - /* Load the LOD info */ - switch (opcode) { - case SHADER_OPCODE_TXL: { - int mrf, writemask; - if (devinfo->ver >= 5) { - mrf = param_base + 1; - if (shadow_comparator.file != BAD_FILE) { - writemask = WRITEMASK_Y; - /* mlen already incremented */ - } else { - writemask = WRITEMASK_X; - inst->mlen++; - } - } else /* devinfo->ver == 4 */ { - mrf = param_base; - writemask = WRITEMASK_W; - } - emit(MOV(dst_reg(MRF, mrf, lod.type, 
writemask), lod)); - break; - } - - case SHADER_OPCODE_TXF: - emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod)); - break; - - case SHADER_OPCODE_TXF_CMS: - emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X), - sample_index)); - if (devinfo->ver >= 7) { - /* MCS data is in the first channel of `mcs`, but we need to get it into - * the .y channel of the second vec4 of params, so replicate .x across - * the whole vec4 and then mask off everything except .y - */ - mcs.swizzle = BRW_SWIZZLE_XXXX; - emit(MOV(dst_reg(MRF, param_base + 1, glsl_uint_type(), WRITEMASK_Y), - mcs)); - } - inst->mlen++; - break; - - case SHADER_OPCODE_TXD: { - const brw_reg_type type = lod.type; - - if (devinfo->ver >= 5) { - lod.swizzle = BRW_SWIZZLE4(BRW_SWIZZLE_X,BRW_SWIZZLE_X,BRW_SWIZZLE_Y,BRW_SWIZZLE_Y); - lod2.swizzle = BRW_SWIZZLE4(BRW_SWIZZLE_X,BRW_SWIZZLE_X,BRW_SWIZZLE_Y,BRW_SWIZZLE_Y); - emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod)); - emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2)); - inst->mlen++; - - if (nir_tex_instr_dest_size(instr) == 3 || - shadow_comparator.file != BAD_FILE) { - lod.swizzle = BRW_SWIZZLE_ZZZZ; - lod2.swizzle = BRW_SWIZZLE_ZZZZ; - emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod)); - emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2)); - inst->mlen++; - - if (shadow_comparator.file != BAD_FILE) { - emit(MOV(dst_reg(MRF, param_base + 2, - shadow_comparator.type, WRITEMASK_Z), - shadow_comparator)); - } - } - } else /* devinfo->ver == 4 */ { - emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod)); - emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2)); - inst->mlen += 2; - } - break; - } - - case SHADER_OPCODE_TG4_OFFSET: - if (shadow_comparator.file != BAD_FILE) { - emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W), - shadow_comparator)); - } - - emit(MOV(dst_reg(MRF, param_base + 1, glsl_ivec2_type(), WRITEMASK_XY), - 
offset_value)); - inst->mlen++; - break; - - default: - break; - } - } - - emit(inst); - - /* fixup num layers (z) for cube arrays: hardware returns faces * layers; - * spec requires layers. - */ - if (instr->op == nir_texop_txs && devinfo->ver < 7) { - /* Gfx4-6 return 0 instead of 1 for single layer surfaces. */ - emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z), - src_reg(inst->dst), brw_imm_d(1)); - } - - if (instr->op == nir_texop_query_levels) { - /* # levels is in .w */ - src_reg swizzled(dest); - swizzled.swizzle = BRW_SWIZZLE4(BRW_SWIZZLE_W, BRW_SWIZZLE_W, - BRW_SWIZZLE_W, BRW_SWIZZLE_W); - emit(MOV(dest, swizzled)); - } -} - -src_reg -vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type, - src_reg coordinate, src_reg surface) -{ - vec4_instruction *inst = - new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS, - dst_reg(this, glsl_uvec4_type())); - inst->base_mrf = 2; - inst->src[1] = surface; - inst->src[2] = brw_imm_ud(0); /* sampler */ - inst->mlen = 1; - - const int param_base = inst->base_mrf; - - /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */ - int coord_mask = (1 << coordinate_type->vector_elements) - 1; - int zero_mask = 0xf & ~coord_mask; - - emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask), - coordinate)); - - emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask), - brw_imm_d(0))); - - emit(inst); - return src_reg(inst->dst); -} - -void -vec4_visitor::nir_emit_undef(nir_undef_instr *instr) -{ - nir_ssa_values[instr->def.index] = - dst_reg(VGRF, alloc.allocate(DIV_ROUND_UP(instr->def.bit_size, 32))); -} - -/* SIMD4x2 64bit data is stored in register space like this: - * - * r0.0:DF x0 y0 z0 w0 - * r1.0:DF x1 y1 z1 w1 - * - * When we need to write data such as this to memory using 32-bit write - * messages we need to shuffle it in this fashion: - * - * r0.0:DF x0 y0 x1 y1 (to be written at base offset) - * r0.0:DF z0 w0 z1 w1 (to be written at base offset + 16) - * 
- * We need to do the inverse operation when we read using 32-bit messages, - * which we can do by applying the same exact shuffling on the 64-bit data - * read, only that because the data for each vertex is positioned differently - * we need to apply different channel enables. - * - * This function takes 64bit data and shuffles it as explained above. - * - * The @for_write parameter is used to specify if the shuffling is being done - * for proper SIMD4x2 64-bit data that needs to be shuffled prior to a 32-bit - * write message (for_write = true), or instead we are doing the inverse - * operation and we have just read 64-bit data using a 32-bit messages that we - * need to shuffle to create valid SIMD4x2 64-bit data (for_write = false). - * - * If @block and @ref are non-NULL, then the shuffling is done after @ref, - * otherwise the instructions are emitted normally at the end. The function - * returns the last instruction inserted. - * - * Notice that @src and @dst cannot be the same register. - */ -vec4_instruction * -vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write, - bool for_scratch, - bblock_t *block, vec4_instruction *ref) -{ - assert(type_sz(src.type) == 8); - assert(type_sz(dst.type) == 8); - assert(!regions_overlap(dst, 2 * REG_SIZE, src, 2 * REG_SIZE)); - assert(!ref == !block); - - opcode mov_op = for_scratch ? VEC4_OPCODE_MOV_FOR_SCRATCH : BRW_OPCODE_MOV; - - const vec4_builder bld = !ref ? vec4_builder(this).at_end() : - vec4_builder(this).at(block, ref->next); - - /* Resolve swizzle in src */ - if (src.swizzle != BRW_SWIZZLE_XYZW) { - dst_reg data = dst_reg(this, glsl_dvec4_type()); - bld.emit(mov_op, data, src); - src = src_reg(data); - } - - /* dst+0.XY = src+0.XY */ - bld.group(4, 0).emit(mov_op, writemask(dst, WRITEMASK_XY), src); - - /* dst+0.ZW = src+1.XY */ - bld.group(4, for_write ? 
1 : 0) - .emit(mov_op, writemask(dst, WRITEMASK_ZW), - swizzle(byte_offset(src, REG_SIZE), BRW_SWIZZLE_XYXY)); - - /* dst+1.XY = src+0.ZW */ - bld.group(4, for_write ? 0 : 1) - .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_XY), - swizzle(src, BRW_SWIZZLE_ZWZW)); - - /* dst+1.ZW = src+1.ZW */ - return bld.group(4, 1) - .emit(mov_op, writemask(byte_offset(dst, REG_SIZE), WRITEMASK_ZW), - byte_offset(src, REG_SIZE)); -} - -} diff --git a/src/intel/compiler/brw_vec4_reg_allocate.cpp b/src/intel/compiler/brw_vec4_reg_allocate.cpp deleted file mode 100644 index 8ba1e80b9a5..00000000000 --- a/src/intel/compiler/brw_vec4_reg_allocate.cpp +++ /dev/null @@ -1,512 +0,0 @@ -/* - * Copyright © 2011 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include "util/register_allocate.h" -#include "brw_vec4.h" -#include "brw_cfg.h" - -using namespace brw; - -#define REG_CLASS_COUNT 20 - -namespace brw { - -static void -assign(unsigned int *reg_hw_locations, backend_reg *reg) -{ - if (reg->file == VGRF) { - reg->nr = reg_hw_locations[reg->nr] + reg->offset / REG_SIZE; - reg->offset %= REG_SIZE; - } -} - -bool -vec4_visitor::reg_allocate_trivial() -{ - unsigned int hw_reg_mapping[this->alloc.count]; - bool virtual_grf_used[this->alloc.count]; - int next; - - /* Calculate which virtual GRFs are actually in use after whatever - * optimization passes have occurred. - */ - for (unsigned i = 0; i < this->alloc.count; i++) { - virtual_grf_used[i] = false; - } - - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - if (inst->dst.file == VGRF) - virtual_grf_used[inst->dst.nr] = true; - - for (unsigned i = 0; i < 3; i++) { - if (inst->src[i].file == VGRF) - virtual_grf_used[inst->src[i].nr] = true; - } - } - - hw_reg_mapping[0] = this->first_non_payload_grf; - next = hw_reg_mapping[0] + this->alloc.sizes[0]; - for (unsigned i = 1; i < this->alloc.count; i++) { - if (virtual_grf_used[i]) { - hw_reg_mapping[i] = next; - next += this->alloc.sizes[i]; - } - } - prog_data->total_grf = next; - - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - assign(hw_reg_mapping, &inst->dst); - assign(hw_reg_mapping, &inst->src[0]); - assign(hw_reg_mapping, &inst->src[1]); - assign(hw_reg_mapping, &inst->src[2]); - } - - if (prog_data->total_grf > max_grf) { - fail("Ran out of regs on trivial allocator (%d/%d)\n", - prog_data->total_grf, max_grf); - return false; - } - - return true; -} - -extern "C" void -brw_vec4_alloc_reg_set(struct brw_compiler *compiler) -{ - int base_reg_count = - compiler->devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF; - - assert(compiler->devinfo->ver < 8); - - /* After running split_virtual_grfs(), almost all VGRFs will be of size 1. 
- * SEND-from-GRF sources cannot be split, so we also need classes for each - * potential message length. - */ - assert(REG_CLASS_COUNT == MAX_VGRF_SIZE(compiler->devinfo)); - int class_sizes[REG_CLASS_COUNT]; - - for (int i = 0; i < REG_CLASS_COUNT; i++) - class_sizes[i] = i + 1; - - - ralloc_free(compiler->vec4_reg_set.regs); - compiler->vec4_reg_set.regs = ra_alloc_reg_set(compiler, base_reg_count, false); - if (compiler->devinfo->ver >= 6) - ra_set_allocate_round_robin(compiler->vec4_reg_set.regs); - ralloc_free(compiler->vec4_reg_set.classes); - compiler->vec4_reg_set.classes = ralloc_array(compiler, struct ra_class *, REG_CLASS_COUNT); - - /* Now, add the registers to their classes, and add the conflicts - * between them and the base GRF registers (and also each other). - */ - for (int i = 0; i < REG_CLASS_COUNT; i++) { - int class_reg_count = base_reg_count - (class_sizes[i] - 1); - compiler->vec4_reg_set.classes[i] = - ra_alloc_contig_reg_class(compiler->vec4_reg_set.regs, class_sizes[i]); - - for (int j = 0; j < class_reg_count; j++) - ra_class_add_reg(compiler->vec4_reg_set.classes[i], j); - } - - ra_set_finalize(compiler->vec4_reg_set.regs, NULL); -} - -void -vec4_visitor::setup_payload_interference(struct ra_graph *g, - int first_payload_node, - int reg_node_count) -{ - int payload_node_count = this->first_non_payload_grf; - - for (int i = 0; i < payload_node_count; i++) { - /* Mark each payload reg node as being allocated to its physical register. - * - * The alternative would be to have per-physical register classes, which - * would just be silly. - */ - ra_set_node_reg(g, first_payload_node + i, i); - - /* For now, just mark each payload node as interfering with every other - * node to be allocated. 
- */ - for (int j = 0; j < reg_node_count; j++) { - ra_add_node_interference(g, first_payload_node + i, j); - } - } -} - -bool -vec4_visitor::reg_allocate() -{ - unsigned int hw_reg_mapping[alloc.count]; - int payload_reg_count = this->first_non_payload_grf; - - /* Using the trivial allocator can be useful in debugging undefined - * register access as a result of broken optimization passes. - */ - if (0) - return reg_allocate_trivial(); - - assert(devinfo->ver < 8); - - const vec4_live_variables &live = live_analysis.require(); - int node_count = alloc.count; - int first_payload_node = node_count; - node_count += payload_reg_count; - struct ra_graph *g = - ra_alloc_interference_graph(compiler->vec4_reg_set.regs, node_count); - - for (unsigned i = 0; i < alloc.count; i++) { - int size = this->alloc.sizes[i]; - assert(size >= 1 && size <= MAX_VGRF_SIZE(devinfo)); - ra_set_node_class(g, i, compiler->vec4_reg_set.classes[size - 1]); - - for (unsigned j = 0; j < i; j++) { - if (live.vgrfs_interfere(i, j)) { - ra_add_node_interference(g, i, j); - } - } - } - - /* Certain instructions can't safely use the same register for their - * sources and destination. Add interference. - */ - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) { - for (unsigned i = 0; i < 3; i++) { - if (inst->src[i].file == VGRF) { - ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr); - } - } - } - } - - setup_payload_interference(g, first_payload_node, node_count); - - if (!ra_allocate(g)) { - /* Failed to allocate registers. Spill a reg, and the caller will - * loop back into here to try again. - */ - int reg = choose_spill_reg(g); - if (this->no_spills) { - fail("Failure to register allocate. 
Reduce number of live " - "values to avoid this."); - } else if (reg == -1) { - fail("no register to spill\n"); - } else { - spill_reg(reg); - } - ralloc_free(g); - return false; - } - - /* Get the chosen virtual registers for each node, and map virtual - * regs in the register classes back down to real hardware reg - * numbers. - */ - prog_data->total_grf = payload_reg_count; - for (unsigned i = 0; i < alloc.count; i++) { - hw_reg_mapping[i] = ra_get_node_reg(g, i); - prog_data->total_grf = MAX2(prog_data->total_grf, - hw_reg_mapping[i] + alloc.sizes[i]); - } - - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - assign(hw_reg_mapping, &inst->dst); - assign(hw_reg_mapping, &inst->src[0]); - assign(hw_reg_mapping, &inst->src[1]); - assign(hw_reg_mapping, &inst->src[2]); - } - - ralloc_free(g); - - return true; -} - -/** - * When we decide to spill a register, instead of blindly spilling every use, - * save unspills when the spill register is used (read) in consecutive - * instructions. This can potentially save a bunch of unspills that would - * have very little impact in register allocation anyway. - * - * Notice that we need to account for this behavior when spilling a register - * and when evaluating spilling costs. This function is designed so it can - * be called from both places and avoid repeating the logic. - * - * - When we call this function from spill_reg(), we pass in scratch_reg the - * actual unspill/spill register that we want to reuse in the current - * instruction. - * - * - When we call this from evaluate_spill_costs(), we pass the register for - * which we are evaluating spilling costs. - * - * In either case, we check if the previous instructions read scratch_reg until - * we find one that writes to it with a compatible mask or does not read/write - * scratch_reg at all. 
- */ -static bool -can_use_scratch_for_source(const vec4_instruction *inst, unsigned i, - unsigned scratch_reg) -{ - assert(inst->src[i].file == VGRF); - bool prev_inst_read_scratch_reg = false; - - /* See if any previous source in the same instructions reads scratch_reg */ - for (unsigned n = 0; n < i; n++) { - if (inst->src[n].file == VGRF && inst->src[n].nr == scratch_reg) - prev_inst_read_scratch_reg = true; - } - - /* Now check if previous instructions read/write scratch_reg */ - for (vec4_instruction *prev_inst = (vec4_instruction *) inst->prev; - !prev_inst->is_head_sentinel(); - prev_inst = (vec4_instruction *) prev_inst->prev) { - - /* If the previous instruction writes to scratch_reg then we can reuse - * it if the write is not conditional and the channels we write are - * compatible with our read mask - */ - if (prev_inst->dst.file == VGRF && prev_inst->dst.nr == scratch_reg) { - return (!prev_inst->predicate || prev_inst->opcode == BRW_OPCODE_SEL) && - (brw_mask_for_swizzle(inst->src[i].swizzle) & - ~prev_inst->dst.writemask) == 0; - } - - /* Skip scratch read/writes so that instructions generated by spilling - * other registers (that won't read/write scratch_reg) do not stop us from - * reusing scratch_reg for this instruction. - */ - if (prev_inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_WRITE || - prev_inst->opcode == SHADER_OPCODE_GFX4_SCRATCH_READ) - continue; - - /* If the previous instruction does not write to scratch_reg, then check - * if it reads it - */ - int n; - for (n = 0; n < 3; n++) { - if (prev_inst->src[n].file == VGRF && - prev_inst->src[n].nr == scratch_reg) { - prev_inst_read_scratch_reg = true; - break; - } - } - if (n == 3) { - /* The previous instruction does not read scratch_reg. At this point, - * if no previous instruction has read scratch_reg it means that we - * will need to unspill it here and we can't reuse it (so we return - * false). 
Otherwise, if we found at least one consecutive instruction - * that read scratch_reg, then we know that we got here from - * evaluate_spill_costs (since for the spill_reg path any block of - * consecutive instructions using scratch_reg must start with a write - * to that register, so we would've exited the loop in the check for - * the write that we have at the start of this loop), and in that case - * it means that we found the point at which the scratch_reg would be - * unspilled. Since we always unspill a full vec4, it means that we - * have all the channels available and we can just return true to - * signal that we can reuse the register in the current instruction - * too. - */ - return prev_inst_read_scratch_reg; - } - } - - return prev_inst_read_scratch_reg; -} - -static inline float -spill_cost_for_type(enum brw_reg_type type) -{ - /* Spilling of a 64-bit register involves emitting 2 32-bit scratch - * messages plus the 64b/32b shuffling code. - */ - return type_sz(type) == 8 ? 2.25f : 1.0f; -} - -void -vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill) -{ - float loop_scale = 1.0; - - unsigned *reg_type_size = (unsigned *) - ralloc_size(NULL, this->alloc.count * sizeof(unsigned)); - - for (unsigned i = 0; i < this->alloc.count; i++) { - spill_costs[i] = 0.0; - no_spill[i] = alloc.sizes[i] != 1 && alloc.sizes[i] != 2; - reg_type_size[i] = 0; - } - - /* Calculate costs for spilling nodes. Call it a cost of 1 per - * spill/unspill we'll have to do, and guess that the insides of - * loops run 10 times. - */ - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - for (unsigned int i = 0; i < 3; i++) { - if (inst->src[i].file == VGRF && !no_spill[inst->src[i].nr]) { - /* We will only unspill src[i] it it wasn't unspilled for the - * previous instruction, in which case we'll just reuse the scratch - * reg for this instruction. 
- */ - if (!can_use_scratch_for_source(inst, i, inst->src[i].nr)) { - spill_costs[inst->src[i].nr] += - loop_scale * spill_cost_for_type(inst->src[i].type); - if (inst->src[i].reladdr || - inst->src[i].offset >= REG_SIZE) - no_spill[inst->src[i].nr] = true; - - /* We don't support unspills of partial DF reads. - * - * Our 64-bit unspills are implemented with two 32-bit scratch - * messages, each one reading that for both SIMD4x2 threads that - * we need to shuffle into correct 64-bit data. Ensure that we - * are reading data for both threads. - */ - if (type_sz(inst->src[i].type) == 8 && inst->exec_size != 8) - no_spill[inst->src[i].nr] = true; - } - - /* We can't spill registers that mix 32-bit and 64-bit access (that - * contain 64-bit data that is operated on via 32-bit instructions) - */ - unsigned type_size = type_sz(inst->src[i].type); - if (reg_type_size[inst->src[i].nr] == 0) - reg_type_size[inst->src[i].nr] = type_size; - else if (reg_type_size[inst->src[i].nr] != type_size) - no_spill[inst->src[i].nr] = true; - } - } - - if (inst->dst.file == VGRF && !no_spill[inst->dst.nr]) { - spill_costs[inst->dst.nr] += - loop_scale * spill_cost_for_type(inst->dst.type); - if (inst->dst.reladdr || inst->dst.offset >= REG_SIZE) - no_spill[inst->dst.nr] = true; - - /* We don't support spills of partial DF writes. - * - * Our 64-bit spills are implemented with two 32-bit scratch messages, - * each one writing that for both SIMD4x2 threads. Ensure that we - * are writing data for both threads. 
- */ - if (type_sz(inst->dst.type) == 8 && inst->exec_size != 8) - no_spill[inst->dst.nr] = true; - - /* We can't spill registers that mix 32-bit and 64-bit access (that - * contain 64-bit data that is operated on via 32-bit instructions) - */ - unsigned type_size = type_sz(inst->dst.type); - if (reg_type_size[inst->dst.nr] == 0) - reg_type_size[inst->dst.nr] = type_size; - else if (reg_type_size[inst->dst.nr] != type_size) - no_spill[inst->dst.nr] = true; - } - - switch (inst->opcode) { - - case BRW_OPCODE_DO: - loop_scale *= 10; - break; - - case BRW_OPCODE_WHILE: - loop_scale /= 10; - break; - - case SHADER_OPCODE_GFX4_SCRATCH_READ: - case SHADER_OPCODE_GFX4_SCRATCH_WRITE: - case VEC4_OPCODE_MOV_FOR_SCRATCH: - for (int i = 0; i < 3; i++) { - if (inst->src[i].file == VGRF) - no_spill[inst->src[i].nr] = true; - } - if (inst->dst.file == VGRF) - no_spill[inst->dst.nr] = true; - break; - - default: - break; - } - } - - ralloc_free(reg_type_size); -} - -int -vec4_visitor::choose_spill_reg(struct ra_graph *g) -{ - float spill_costs[this->alloc.count]; - bool no_spill[this->alloc.count]; - - evaluate_spill_costs(spill_costs, no_spill); - - for (unsigned i = 0; i < this->alloc.count; i++) { - if (!no_spill[i]) - ra_set_node_spill_cost(g, i, spill_costs[i]); - } - - return ra_get_best_spill_node(g); -} - -void -vec4_visitor::spill_reg(unsigned spill_reg_nr) -{ - assert(alloc.sizes[spill_reg_nr] == 1 || alloc.sizes[spill_reg_nr] == 2); - unsigned spill_offset = last_scratch; - last_scratch += alloc.sizes[spill_reg_nr]; - - /* Generate spill/unspill instructions for the objects being spilled. */ - unsigned scratch_reg = ~0u; - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - for (unsigned i = 0; i < 3; i++) { - if (inst->src[i].file == VGRF && inst->src[i].nr == spill_reg_nr) { - if (scratch_reg == ~0u || - !can_use_scratch_for_source(inst, i, scratch_reg)) { - /* We need to unspill anyway so make sure we read the full vec4 - * in any case. 
This way, the cached register can be reused - * for consecutive instructions that read different channels of - * the same vec4. - */ - scratch_reg = alloc.allocate(alloc.sizes[spill_reg_nr]); - src_reg temp = inst->src[i]; - temp.nr = scratch_reg; - temp.offset = 0; - temp.swizzle = BRW_SWIZZLE_XYZW; - emit_scratch_read(block, inst, - dst_reg(temp), inst->src[i], spill_offset); - temp.offset = inst->src[i].offset; - } - assert(scratch_reg != ~0u); - inst->src[i].nr = scratch_reg; - } - } - - if (inst->dst.file == VGRF && inst->dst.nr == spill_reg_nr) { - emit_scratch_write(block, inst, spill_offset); - scratch_reg = inst->dst.nr; - } - } - - invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); -} - -} /* namespace brw */ diff --git a/src/intel/compiler/brw_vec4_surface_builder.cpp b/src/intel/compiler/brw_vec4_surface_builder.cpp deleted file mode 100644 index fce3133bef8..00000000000 --- a/src/intel/compiler/brw_vec4_surface_builder.cpp +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright © 2013-2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "brw_vec4_surface_builder.h" - -using namespace brw; - -namespace { - namespace array_utils { - /** - * Copy one every \p src_stride logical components of the argument into - * one every \p dst_stride logical components of the result. - */ - static src_reg - emit_stride(const vec4_builder &bld, const src_reg &src, unsigned size, - unsigned dst_stride, unsigned src_stride) - { - if (src_stride == 1 && dst_stride == 1) { - return src; - } else { - const dst_reg dst = bld.vgrf(src.type, - DIV_ROUND_UP(size * dst_stride, 4)); - - for (unsigned i = 0; i < size; ++i) - bld.MOV(writemask(offset(dst, 8, i * dst_stride / 4), - 1 << (i * dst_stride % 4)), - swizzle(offset(src, 8, i * src_stride / 4), - brw_swizzle_for_mask(1 << (i * src_stride % 4)))); - - return src_reg(dst); - } - } - - /** - * Convert a VEC4 into an array of registers with the layout expected by - * the recipient shared unit. If \p has_simd4x2 is true the argument is - * left unmodified in SIMD4x2 form, otherwise it will be rearranged into - * a SIMD8 vector. - */ - static src_reg - emit_insert(const vec4_builder &bld, const src_reg &src, - unsigned n, bool has_simd4x2) - { - if (src.file == BAD_FILE || n == 0) { - return src_reg(); - - } else { - /* Pad unused components with zeroes. */ - const unsigned mask = (1 << n) - 1; - const dst_reg tmp = bld.vgrf(src.type); - - bld.MOV(writemask(tmp, mask), src); - if (n < 4) - bld.MOV(writemask(tmp, ~mask), brw_imm_d(0)); - - return emit_stride(bld, src_reg(tmp), n, has_simd4x2 ? 
1 : 4, 1); - } - } - } -} - -namespace brw { - namespace surface_access { - namespace { - using namespace array_utils; - - /** - * Generate a send opcode for a surface message and return the - * result. - */ - src_reg - emit_send(const vec4_builder &bld, enum opcode op, - const src_reg &header, - const src_reg &addr, unsigned addr_sz, - const src_reg &src, unsigned src_sz, - const src_reg &surface, - unsigned arg, unsigned ret_sz, - brw_predicate pred = BRW_PREDICATE_NONE) - { - /* Calculate the total number of components of the payload. */ - const unsigned header_sz = (header.file == BAD_FILE ? 0 : 1); - const unsigned sz = header_sz + addr_sz + src_sz; - - /* Construct the payload. */ - const dst_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, sz); - unsigned n = 0; - - if (header_sz) - bld.exec_all().MOV(offset(payload, 8, n++), - retype(header, BRW_REGISTER_TYPE_UD)); - - for (unsigned i = 0; i < addr_sz; i++) - bld.MOV(offset(payload, 8, n++), - offset(retype(addr, BRW_REGISTER_TYPE_UD), 8, i)); - - for (unsigned i = 0; i < src_sz; i++) - bld.MOV(offset(payload, 8, n++), - offset(retype(src, BRW_REGISTER_TYPE_UD), 8, i)); - - /* Reduce the dynamically uniform surface index to a single - * scalar. - */ - const src_reg usurface = bld.emit_uniformize(surface); - - /* Emit the message send instruction. */ - const dst_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD, ret_sz); - vec4_instruction *inst = - bld.emit(op, dst, src_reg(payload), usurface, brw_imm_ud(arg)); - inst->mlen = sz; - inst->size_written = ret_sz * REG_SIZE; - inst->header_size = header_sz; - inst->predicate = pred; - - return src_reg(dst); - } - } - - /** - * Emit an untyped surface read opcode. \p dims determines the number - * of components of the address and \p size the number of components of - * the returned value. 
- */ - src_reg - emit_untyped_read(const vec4_builder &bld, - const src_reg &surface, const src_reg &addr, - unsigned dims, unsigned size, - brw_predicate pred) - { - return emit_send(bld, VEC4_OPCODE_UNTYPED_SURFACE_READ, src_reg(), - emit_insert(bld, addr, dims, true), 1, - src_reg(), 0, - surface, size, 1, pred); - } - - /** - * Emit an untyped surface write opcode. \p dims determines the number - * of components of the address and \p size the number of components of - * the argument. - */ - void - emit_untyped_write(const vec4_builder &bld, const src_reg &surface, - const src_reg &addr, const src_reg &src, - unsigned dims, unsigned size, - brw_predicate pred) - { - const bool has_simd4x2 = bld.shader->devinfo->verx10 == 75; - emit_send(bld, VEC4_OPCODE_UNTYPED_SURFACE_WRITE, src_reg(), - emit_insert(bld, addr, dims, has_simd4x2), - has_simd4x2 ? 1 : dims, - emit_insert(bld, src, size, has_simd4x2), - has_simd4x2 ? 1 : size, - surface, size, 0, pred); - } - - /** - * Emit an untyped surface atomic opcode. \p dims determines the number - * of components of the address and \p rsize the number of components of - * the returned value (either zero or one). - */ - src_reg - emit_untyped_atomic(const vec4_builder &bld, - const src_reg &surface, const src_reg &addr, - const src_reg &src0, const src_reg &src1, - unsigned dims, unsigned rsize, unsigned op, - brw_predicate pred) - { - const bool has_simd4x2 = bld.shader->devinfo->verx10 == 75; - - /* Zip the components of both sources, they are represented as the X - * and Y components of the same vector. 
- */ - const unsigned size = (src0.file != BAD_FILE) + (src1.file != BAD_FILE); - const dst_reg srcs = bld.vgrf(BRW_REGISTER_TYPE_UD); - - if (size >= 1) { - bld.MOV(writemask(srcs, WRITEMASK_X), - swizzle(src0, BRW_SWIZZLE_XXXX)); - } - - if (size >= 2) { - bld.MOV(writemask(srcs, WRITEMASK_Y), - swizzle(src1, BRW_SWIZZLE_XXXX)); - } - - return emit_send(bld, VEC4_OPCODE_UNTYPED_ATOMIC, src_reg(), - emit_insert(bld, addr, dims, has_simd4x2), - has_simd4x2 ? 1 : dims, - emit_insert(bld, src_reg(srcs), size, has_simd4x2), - has_simd4x2 && size ? 1 : size, - surface, op, rsize, pred); - } - } -} diff --git a/src/intel/compiler/brw_vec4_surface_builder.h b/src/intel/compiler/brw_vec4_surface_builder.h deleted file mode 100644 index 2821685a361..00000000000 --- a/src/intel/compiler/brw_vec4_surface_builder.h +++ /dev/null @@ -1,53 +0,0 @@ -/* -*- c++ -*- */ -/* - * Copyright © 2013-2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef BRW_VEC4_SURFACE_BUILDER_H -#define BRW_VEC4_SURFACE_BUILDER_H - -#include "brw_vec4_builder.h" - -namespace brw { - namespace surface_access { - src_reg - emit_untyped_read(const vec4_builder &bld, - const src_reg &surface, const src_reg &addr, - unsigned dims, unsigned size, - brw_predicate pred = BRW_PREDICATE_NONE); - - void - emit_untyped_write(const vec4_builder &bld, const src_reg &surface, - const src_reg &addr, const src_reg &src, - unsigned dims, unsigned size, - brw_predicate pred = BRW_PREDICATE_NONE); - - src_reg - emit_untyped_atomic(const vec4_builder &bld, - const src_reg &surface, const src_reg &addr, - const src_reg &src0, const src_reg &src1, - unsigned dims, unsigned rsize, unsigned op, - brw_predicate pred = BRW_PREDICATE_NONE); - } -} - -#endif diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp deleted file mode 100644 index d3dceb38922..00000000000 --- a/src/intel/compiler/brw_vec4_tcs.cpp +++ /dev/null @@ -1,320 +0,0 @@ -/* - * Copyright © 2013 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. 
- * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -/** - * \file brw_vec4_tcs.cpp - * - * Tessellaton control shader specific code derived from the vec4_visitor class. - */ - -#include "intel_nir.h" -#include "brw_vec4_tcs.h" - -namespace brw { - -vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler, - const struct brw_compile_params *params, - const struct brw_tcs_prog_key *key, - struct brw_tcs_prog_data *prog_data, - const nir_shader *nir, - bool debug_enabled) - : vec4_visitor(compiler, params, &key->base.tex, &prog_data->base, - nir, false, debug_enabled), - key(key) -{ -} - - -void -vec4_tcs_visitor::setup_payload() -{ - int reg = 0; - - /* The payload always contains important data in r0, which contains - * the URB handles that are passed on to the URB write at the end - * of the thread. - */ - reg++; - - /* r1.0 - r4.7 may contain the input control point URB handles, - * which we use to pull vertex data. - */ - reg += 4; - - /* Push constants may start at r5.0 */ - reg = setup_uniforms(reg); - - this->first_non_payload_grf = reg; -} - - -void -vec4_tcs_visitor::emit_prolog() -{ - invocation_id = src_reg(this, glsl_uint_type()); - emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id)); - - /* HS threads are dispatched with the dispatch mask set to 0xFF. 
- * If there are an odd number of output vertices, then the final - * HS instance dispatched will only have its bottom half doing real - * work, and so we need to disable the upper half: - */ - if (nir->info.tess.tcs_vertices_out % 2) { - emit(CMP(dst_null_d(), invocation_id, - brw_imm_ud(nir->info.tess.tcs_vertices_out), - BRW_CONDITIONAL_L)); - - /* Matching ENDIF is in emit_thread_end() */ - emit(IF(BRW_PREDICATE_NORMAL)); - } -} - - -void -vec4_tcs_visitor::emit_thread_end() -{ - vec4_instruction *inst; - current_annotation = "thread end"; - - if (nir->info.tess.tcs_vertices_out % 2) { - emit(BRW_OPCODE_ENDIF); - } - - if (devinfo->ver == 7) { - struct brw_tcs_prog_data *tcs_prog_data = - (struct brw_tcs_prog_data *) prog_data; - - current_annotation = "release input vertices"; - - /* Synchronize all threads, so we know that no one is still - * using the input URB handles. - */ - if (tcs_prog_data->instances > 1) { - dst_reg header = dst_reg(this, glsl_uvec4_type()); - emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header); - emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header)); - } - - /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles. - * We want to compare the bottom half of invocation_id with 0, but - * use that truth value for the top half as well. Unfortunately, - * we don't have stride in the vec4 world, nor UV immediates in - * align16, so we need an opcode to get invocation_id<0,4,0>. - */ - set_condmod(BRW_CONDITIONAL_Z, - emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), - invocation_id)); - emit(IF(BRW_PREDICATE_NORMAL)); - for (unsigned i = 0; i < key->input_vertices; i += 2) { - /* If we have an odd number of input vertices, the last will be - * unpaired. We don't want to use an interleaved URB write in - * that case. 
- */ - const bool is_unpaired = i == key->input_vertices - 1; - - dst_reg header(this, glsl_uvec4_type()); - emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i), - brw_imm_ud(is_unpaired)); - } - emit(BRW_OPCODE_ENDIF); - } - - inst = emit(TCS_OPCODE_THREAD_END); - inst->base_mrf = 14; - inst->mlen = 2; -} - - -void -vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst, - const src_reg &vertex_index, - unsigned base_offset, - unsigned first_component, - const src_reg &indirect_offset) -{ - vec4_instruction *inst; - dst_reg temp(this, glsl_ivec4_type()); - temp.type = dst.type; - - /* Set up the message header to reference the proper parts of the URB */ - dst_reg header = dst_reg(this, glsl_uvec4_type()); - inst = emit(VEC4_TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index, - indirect_offset); - inst->force_writemask_all = true; - - /* Read into a temporary, ignoring writemasking. */ - inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header)); - inst->offset = base_offset; - inst->mlen = 1; - inst->base_mrf = -1; - - /* Copy the temporary to the destination to deal with writemasking. - * - * Also attempt to deal with gl_PointSize being in the .w component. 
- */ - if (inst->offset == 0 && indirect_offset.file == BAD_FILE) { - emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW))); - } else { - src_reg src = src_reg(temp); - src.swizzle = BRW_SWZ_COMP_INPUT(first_component); - emit(MOV(dst, src)); - } -} - -void -vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst, - unsigned base_offset, - unsigned first_component, - const src_reg &indirect_offset) -{ - vec4_instruction *inst; - - /* Set up the message header to reference the proper parts of the URB */ - dst_reg header = dst_reg(this, glsl_uvec4_type()); - inst = emit(VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header, - brw_imm_ud(dst.writemask << first_component), indirect_offset); - inst->force_writemask_all = true; - - vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header)); - read->offset = base_offset; - read->mlen = 1; - read->base_mrf = -1; - - if (first_component) { - /* Read into a temporary and copy with a swizzle and writemask. */ - read->dst = retype(dst_reg(this, glsl_ivec4_type()), dst.type); - emit(MOV(dst, swizzle(src_reg(read->dst), - BRW_SWZ_COMP_INPUT(first_component)))); - } -} - -void -vec4_tcs_visitor::emit_urb_write(const src_reg &value, - unsigned writemask, - unsigned base_offset, - const src_reg &indirect_offset) -{ - if (writemask == 0) - return; - - src_reg message(this, glsl_uvec4_type(), 2); - vec4_instruction *inst; - - inst = emit(VEC4_TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(message), - brw_imm_ud(writemask), indirect_offset); - inst->force_writemask_all = true; - inst = emit(MOV(byte_offset(dst_reg(retype(message, value.type)), REG_SIZE), - value)); - inst->force_writemask_all = true; - - inst = emit(VEC4_TCS_OPCODE_URB_WRITE, dst_null_f(), message); - inst->offset = base_offset; - inst->mlen = 2; - inst->base_mrf = -1; -} - -void -vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) -{ - switch (instr->intrinsic) { - case nir_intrinsic_load_invocation_id: - emit(MOV(get_nir_def(instr->def, 
BRW_REGISTER_TYPE_UD), - invocation_id)); - break; - case nir_intrinsic_load_primitive_id: - emit(TCS_OPCODE_GET_PRIMITIVE_ID, - get_nir_def(instr->def, BRW_REGISTER_TYPE_UD)); - break; - case nir_intrinsic_load_patch_vertices_in: - emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_D), - brw_imm_d(key->input_vertices))); - break; - case nir_intrinsic_load_per_vertex_input: { - assert(instr->def.bit_size == 32); - src_reg indirect_offset = get_indirect_offset(instr); - unsigned imm_offset = nir_intrinsic_base(instr); - - src_reg vertex_index = retype(get_nir_src_imm(instr->src[0]), - BRW_REGISTER_TYPE_UD); - - unsigned first_component = nir_intrinsic_component(instr); - dst_reg dst = get_nir_def(instr->def, BRW_REGISTER_TYPE_D); - dst.writemask = brw_writemask_for_size(instr->num_components); - emit_input_urb_read(dst, vertex_index, imm_offset, - first_component, indirect_offset); - break; - } - case nir_intrinsic_load_input: - unreachable("nir_lower_io should use load_per_vertex_input intrinsics"); - break; - case nir_intrinsic_load_output: - case nir_intrinsic_load_per_vertex_output: { - src_reg indirect_offset = get_indirect_offset(instr); - unsigned imm_offset = nir_intrinsic_base(instr); - - dst_reg dst = get_nir_def(instr->def, BRW_REGISTER_TYPE_D); - dst.writemask = brw_writemask_for_size(instr->num_components); - - emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr), - indirect_offset); - break; - } - case nir_intrinsic_store_output: - case nir_intrinsic_store_per_vertex_output: { - assert(nir_src_bit_size(instr->src[0]) == 32); - src_reg value = get_nir_src(instr->src[0]); - unsigned mask = nir_intrinsic_write_mask(instr); - unsigned swiz = BRW_SWIZZLE_XYZW; - - src_reg indirect_offset = get_indirect_offset(instr); - unsigned imm_offset = nir_intrinsic_base(instr); - - unsigned first_component = nir_intrinsic_component(instr); - if (first_component) { - assert(swiz == BRW_SWIZZLE_XYZW); - swiz = BRW_SWZ_COMP_OUTPUT(first_component); - mask 
= mask << first_component; - } - - emit_urb_write(swizzle(value, swiz), mask, - imm_offset, indirect_offset); - break; - } - - case nir_intrinsic_barrier: - if (nir_intrinsic_memory_scope(instr) != SCOPE_NONE) - vec4_visitor::nir_emit_intrinsic(instr); - if (nir_intrinsic_execution_scope(instr) == SCOPE_WORKGROUP) { - dst_reg header = dst_reg(this, glsl_uvec4_type()); - emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header); - emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header)); - } - break; - - default: - vec4_visitor::nir_emit_intrinsic(instr); - } -} - -} /* namespace brw */ - diff --git a/src/intel/compiler/brw_vec4_tcs.h b/src/intel/compiler/brw_vec4_tcs.h deleted file mode 100644 index e5de6c4945b..00000000000 --- a/src/intel/compiler/brw_vec4_tcs.h +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright © 2013 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. 
- */ - -/** - * \file brw_vec4_tcs.h - * - * The vec4-mode tessellation control shader compiler backend. - */ - -#ifndef BRW_VEC4_TCS_H -#define BRW_VEC4_TCS_H - -#include "brw_compiler.h" -#include "brw_eu.h" -#include "brw_vec4.h" - -#ifdef __cplusplus -namespace brw { - -class vec4_tcs_visitor : public vec4_visitor -{ -public: - vec4_tcs_visitor(const struct brw_compiler *compiler, - const struct brw_compile_params *params, - const struct brw_tcs_prog_key *key, - struct brw_tcs_prog_data *prog_data, - const nir_shader *nir, - bool debug_enabled); - -protected: - virtual void setup_payload(); - virtual void emit_prolog(); - virtual void emit_thread_end(); - - virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr); - - void emit_input_urb_read(const dst_reg &dst, - const src_reg &vertex_index, - unsigned base_offset, - unsigned first_component, - const src_reg &indirect_offset); - void emit_output_urb_read(const dst_reg &dst, - unsigned base_offset, - unsigned first_component, - const src_reg &indirect_offset); - - void emit_urb_write(const src_reg &value, unsigned writemask, - unsigned base_offset, const src_reg &indirect_offset); - - /* we do not use the normal end-of-shader URB write mechanism -- but every - * vec4 stage must provide implementations of these: - */ - virtual void emit_urb_write_header(int /* mrf */) {} - virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */) { return NULL; } - - const struct brw_tcs_prog_key *key; - src_reg invocation_id; -}; - -} /* namespace brw */ -#endif /* __cplusplus */ - -#endif /* BRW_VEC4_TCS_H */ diff --git a/src/intel/compiler/brw_vec4_tes.cpp b/src/intel/compiler/brw_vec4_tes.cpp deleted file mode 100644 index 7af5220be75..00000000000 --- a/src/intel/compiler/brw_vec4_tes.cpp +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright © 2013 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the 
"Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -/** - * \file brw_vec4_tes.cpp - * - * Tessellaton evaluation shader specific code derived from the vec4_visitor class. - */ - -#include "brw_vec4_tes.h" -#include "brw_cfg.h" -#include "dev/intel_debug.h" - -namespace brw { - -vec4_tes_visitor::vec4_tes_visitor(const struct brw_compiler *compiler, - const struct brw_compile_params *params, - const struct brw_tes_prog_key *key, - struct brw_tes_prog_data *prog_data, - const nir_shader *shader, - bool debug_enabled) - : vec4_visitor(compiler, params, &key->base.tex, &prog_data->base, - shader, false, debug_enabled) -{ -} - -void -vec4_tes_visitor::setup_payload() -{ - int reg = 0; - - /* The payload always contains important data in r0 and r1, which contains - * the URB handles that are passed on to the URB write at the end - * of the thread. 
- */ - reg += 2; - - reg = setup_uniforms(reg); - - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - for (int i = 0; i < 3; i++) { - if (inst->src[i].file != ATTR) - continue; - - unsigned slot = inst->src[i].nr + inst->src[i].offset / 16; - struct brw_reg grf = brw_vec4_grf(reg + slot / 2, 4 * (slot % 2)); - grf = stride(grf, 0, 4, 1); - grf.swizzle = inst->src[i].swizzle; - grf.type = inst->src[i].type; - grf.abs = inst->src[i].abs; - grf.negate = inst->src[i].negate; - inst->src[i] = grf; - } - } - - reg += 8 * prog_data->urb_read_length; - - this->first_non_payload_grf = reg; -} - - -void -vec4_tes_visitor::emit_prolog() -{ - input_read_header = src_reg(this, glsl_uvec4_type()); - emit(TES_OPCODE_CREATE_INPUT_READ_HEADER, dst_reg(input_read_header)); - - this->current_annotation = NULL; -} - - -void -vec4_tes_visitor::emit_urb_write_header(int mrf) -{ - /* No need to do anything for DS; an implied write to this MRF will be - * performed by VEC4_VS_OPCODE_URB_WRITE. - */ - (void) mrf; -} - - -vec4_instruction * -vec4_tes_visitor::emit_urb_write_opcode(bool complete) -{ - vec4_instruction *inst = emit(VEC4_VS_OPCODE_URB_WRITE); - inst->urb_write_flags = complete ? - BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS; - - return inst; -} - -void -vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) -{ - const struct brw_tes_prog_data *tes_prog_data = - (const struct brw_tes_prog_data *) prog_data; - - switch (instr->intrinsic) { - case nir_intrinsic_load_tess_coord: - /* gl_TessCoord is part of the payload in g1 channels 0-2 and 4-6. 
*/ - emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F), - src_reg(brw_vec8_grf(1, 0)))); - break; - case nir_intrinsic_load_tess_level_outer: - if (tes_prog_data->domain == INTEL_TESS_DOMAIN_ISOLINE) { - emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F), - swizzle(src_reg(ATTR, 1, glsl_vec4_type()), - BRW_SWIZZLE_ZWZW))); - } else { - emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F), - swizzle(src_reg(ATTR, 1, glsl_vec4_type()), - BRW_SWIZZLE_WZYX))); - } - break; - case nir_intrinsic_load_tess_level_inner: - if (tes_prog_data->domain == INTEL_TESS_DOMAIN_QUAD) { - emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F), - swizzle(src_reg(ATTR, 0, glsl_vec4_type()), - BRW_SWIZZLE_WZYX))); - } else { - emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_F), - src_reg(ATTR, 1, glsl_float_type()))); - } - break; - case nir_intrinsic_load_primitive_id: - emit(TES_OPCODE_GET_PRIMITIVE_ID, - get_nir_def(instr->def, BRW_REGISTER_TYPE_UD)); - break; - - case nir_intrinsic_load_input: - case nir_intrinsic_load_per_vertex_input: { - assert(instr->def.bit_size == 32); - src_reg indirect_offset = get_indirect_offset(instr); - unsigned imm_offset = instr->const_index[0]; - src_reg header = input_read_header; - unsigned first_component = nir_intrinsic_component(instr); - - if (indirect_offset.file != BAD_FILE) { - src_reg clamped_indirect_offset = src_reg(this, glsl_uvec4_type()); - - /* Page 190 of "Volume 7: 3D Media GPGPU Engine (Haswell)" says the - * valid range of the offset is [0, 0FFFFFFFh]. - */ - emit_minmax(BRW_CONDITIONAL_L, - dst_reg(clamped_indirect_offset), - retype(indirect_offset, BRW_REGISTER_TYPE_UD), - brw_imm_ud(0x0fffffffu)); - - header = src_reg(this, glsl_uvec4_type()); - emit(TES_OPCODE_ADD_INDIRECT_URB_OFFSET, dst_reg(header), - input_read_header, clamped_indirect_offset); - } else { - /* Arbitrarily only push up to 24 vec4 slots worth of data, - * which is 12 registers (since each holds 2 vec4 slots). 
- */ - const unsigned max_push_slots = 24; - if (imm_offset < max_push_slots) { - src_reg src = src_reg(ATTR, imm_offset, glsl_ivec4_type()); - src.swizzle = BRW_SWZ_COMP_INPUT(first_component); - - emit(MOV(get_nir_def(instr->def, BRW_REGISTER_TYPE_D), src)); - - prog_data->urb_read_length = - MAX2(prog_data->urb_read_length, - DIV_ROUND_UP(imm_offset + 1, 2)); - break; - } - } - - dst_reg temp(this, glsl_ivec4_type()); - vec4_instruction *read = - emit(VEC4_OPCODE_URB_READ, temp, src_reg(header)); - read->offset = imm_offset; - read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET; - - src_reg src = src_reg(temp); - src.swizzle = BRW_SWZ_COMP_INPUT(first_component); - - /* Copy to target. We might end up with some funky writemasks landing - * in here, but we really don't want them in the above pseudo-ops. - */ - dst_reg dst = get_nir_def(instr->def, BRW_REGISTER_TYPE_D); - dst.writemask = brw_writemask_for_size(instr->num_components); - emit(MOV(dst, src)); - break; - } - default: - vec4_visitor::nir_emit_intrinsic(instr); - } -} - - -void -vec4_tes_visitor::emit_thread_end() -{ - /* For DS, we always end the thread by emitting a single vertex. - * emit_urb_write_opcode() will take care of setting the eot flag on the - * SEND instruction. 
- */ - emit_vertex(); -} - -} /* namespace brw */ diff --git a/src/intel/compiler/brw_vec4_tes.h b/src/intel/compiler/brw_vec4_tes.h deleted file mode 100644 index 23a11956681..00000000000 --- a/src/intel/compiler/brw_vec4_tes.h +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright © 2013 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -/** - * \file brw_vec4_tes.h - * - * The vec4 mode tessellation evaluation shader compiler backend. 
- */ - -#ifndef BRW_VEC4_TES_H -#define BRW_VEC4_TES_H - -#include "brw_vec4.h" - -#ifdef __cplusplus -namespace brw { - -class vec4_tes_visitor : public vec4_visitor -{ -public: - vec4_tes_visitor(const struct brw_compiler *compiler, - const struct brw_compile_params *params, - const struct brw_tes_prog_key *key, - struct brw_tes_prog_data *prog_data, - const nir_shader *nir, - bool debug_enabled); - -protected: - virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr); - - virtual void setup_payload(); - virtual void emit_prolog(); - virtual void emit_thread_end(); - - virtual void emit_urb_write_header(int mrf); - virtual vec4_instruction *emit_urb_write_opcode(bool complete); - -private: - src_reg input_read_header; -}; - -} /* namespace brw */ -#endif /* __cplusplus */ - -#endif /* BRW_VEC4_TES_H */ diff --git a/src/intel/compiler/brw_vec4_visitor.cpp b/src/intel/compiler/brw_vec4_visitor.cpp deleted file mode 100644 index 236c7bae3ba..00000000000 --- a/src/intel/compiler/brw_vec4_visitor.cpp +++ /dev/null @@ -1,1319 +0,0 @@ -/* - * Copyright © 2011 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include "brw_nir.h" -#include "brw_vec4.h" -#include "brw_cfg.h" -#include "brw_eu.h" -#include "util/u_math.h" - -namespace brw { - -vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst, - const src_reg &src0, const src_reg &src1, - const src_reg &src2) -{ - this->opcode = opcode; - this->dst = dst; - this->src[0] = src0; - this->src[1] = src1; - this->src[2] = src2; - this->saturate = false; - this->force_writemask_all = false; - this->no_dd_clear = false; - this->no_dd_check = false; - this->writes_accumulator = false; - this->conditional_mod = BRW_CONDITIONAL_NONE; - this->predicate = BRW_PREDICATE_NONE; - this->predicate_inverse = false; - this->target = 0; - this->shadow_compare = false; - this->eot = false; - this->ir = NULL; - this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS; - this->header_size = 0; - this->flag_subreg = 0; - this->mlen = 0; - this->base_mrf = 0; - this->offset = 0; - this->exec_size = 8; - this->group = 0; - this->size_written = (dst.file == BAD_FILE ? 
- 0 : this->exec_size * type_sz(dst.type)); - this->annotation = NULL; -} - -vec4_instruction * -vec4_visitor::emit(vec4_instruction *inst) -{ - inst->ir = this->base_ir; - inst->annotation = this->current_annotation; - - this->instructions.push_tail(inst); - - return inst; -} - -vec4_instruction * -vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst, - vec4_instruction *new_inst) -{ - new_inst->ir = inst->ir; - new_inst->annotation = inst->annotation; - - inst->insert_before(block, new_inst); - - return inst; -} - -vec4_instruction * -vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, - const src_reg &src1, const src_reg &src2) -{ - return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2)); -} - - -vec4_instruction * -vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0, - const src_reg &src1) -{ - return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1)); -} - -vec4_instruction * -vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) -{ - return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0)); -} - -vec4_instruction * -vec4_visitor::emit(enum opcode opcode, const dst_reg &dst) -{ - return emit(new(mem_ctx) vec4_instruction(opcode, dst)); -} - -vec4_instruction * -vec4_visitor::emit(enum opcode opcode) -{ - return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg())); -} - -#define ALU1(op) \ - vec4_instruction * \ - vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \ - { \ - return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \ - } - -#define ALU2(op) \ - vec4_instruction * \ - vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ - const src_reg &src1) \ - { \ - return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \ - src0, src1); \ - } - -#define ALU2_ACC(op) \ - vec4_instruction * \ - vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ - const src_reg &src1) \ - { \ - vec4_instruction *inst = 
new(mem_ctx) vec4_instruction( \ - BRW_OPCODE_##op, dst, src0, src1); \ - inst->writes_accumulator = true; \ - return inst; \ - } - -#define ALU3(op) \ - vec4_instruction * \ - vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \ - const src_reg &src1, const src_reg &src2) \ - { \ - assert(devinfo->ver >= 6); \ - return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \ - src0, src1, src2); \ - } - -ALU1(NOT) -ALU1(MOV) -ALU1(FRC) -ALU1(RNDD) -ALU1(RNDE) -ALU1(RNDZ) -ALU1(F32TO16) -ALU1(F16TO32) -ALU2(ADD) -ALU2(MUL) -ALU2_ACC(MACH) -ALU2(AND) -ALU2(OR) -ALU2(XOR) -ALU2(DP3) -ALU2(DP4) -ALU2(DPH) -ALU2(SHL) -ALU2(SHR) -ALU2(ASR) -ALU3(LRP) -ALU1(BFREV) -ALU3(BFE) -ALU2(BFI1) -ALU3(BFI2) -ALU1(FBH) -ALU1(FBL) -ALU1(CBIT) -ALU1(LZD) -ALU3(MAD) -ALU2_ACC(ADDC) -ALU2_ACC(SUBB) -ALU2(MAC) -ALU1(DIM) - -/** Gfx4 predicated IF. */ -vec4_instruction * -vec4_visitor::IF(enum brw_predicate predicate) -{ - vec4_instruction *inst; - - inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF); - inst->predicate = predicate; - - return inst; -} - -/** Gfx6 IF with embedded comparison. */ -vec4_instruction * -vec4_visitor::IF(src_reg src0, src_reg src1, - enum brw_conditional_mod condition) -{ - assert(devinfo->ver == 6); - - vec4_instruction *inst; - - resolve_ud_negate(&src0); - resolve_ud_negate(&src1); - - inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(), - src0, src1); - inst->conditional_mod = condition; - - return inst; -} - -/** - * CMP: Sets the low bit of the destination channels with the result - * of the comparison, while the upper bits are undefined, and updates - * the flag register with the packed 16 bits of the result. 
- */ -vec4_instruction * -vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, - enum brw_conditional_mod condition) -{ - vec4_instruction *inst; - - /* Take the instruction: - * - * CMP null src0 src1 - * - * Original gfx4 does type conversion to the destination type before - * comparison, producing garbage results for floating point comparisons. - * - * The destination type doesn't matter on newer generations, so we set the - * type to match src0 so we can compact the instruction. - */ - dst.type = src0.type; - - resolve_ud_negate(&src0); - resolve_ud_negate(&src1); - - inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1); - inst->conditional_mod = condition; - - return inst; -} - -vec4_instruction * -vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index) -{ - vec4_instruction *inst; - - inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_READ, - dst, index); - inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver) + 1; - inst->mlen = 2; - - return inst; -} - -vec4_instruction * -vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src, - const src_reg &index) -{ - vec4_instruction *inst; - - inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GFX4_SCRATCH_WRITE, - dst, src, index); - inst->base_mrf = FIRST_SPILL_MRF(devinfo->ver); - inst->mlen = 3; - - return inst; -} - -src_reg -vec4_visitor::fix_3src_operand(const src_reg &src) -{ - /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be - * able to use vertical stride of zero to replicate the vec4 uniform, like - * - * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7] - * - * But you can't, since vertical stride is always four in three-source - * instructions. Instead, insert a MOV instruction to do the replication so - * that the three-source instruction can consume it. - */ - - /* The MOV is only needed if the source is a uniform or immediate. 
*/ - if (src.file != UNIFORM && src.file != IMM) - return src; - - if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle)) - return src; - - dst_reg expanded = dst_reg(this, glsl_vec4_type()); - expanded.type = src.type; - emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src); - return src_reg(expanded); -} - -src_reg -vec4_visitor::fix_math_operand(const src_reg &src) -{ - if (devinfo->ver < 6 || src.file == BAD_FILE) - return src; - - /* The gfx6 math instruction ignores the source modifiers -- - * swizzle, abs, negate, and at least some parts of the register - * region description. - * - * Rather than trying to enumerate all these cases, *always* expand the - * operand to a temp GRF for gfx6. - * - * For gfx7, keep the operand as-is, except if immediate, which gfx7 still - * can't use. - */ - - if (devinfo->ver == 7 && src.file != IMM) - return src; - - dst_reg expanded = dst_reg(this, glsl_vec4_type()); - expanded.type = src.type; - emit(MOV(expanded, src)); - return src_reg(expanded); -} - -vec4_instruction * -vec4_visitor::emit_math(enum opcode opcode, - const dst_reg &dst, - const src_reg &src0, const src_reg &src1) -{ - vec4_instruction *math = - emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1)); - - if (devinfo->ver == 6 && dst.writemask != WRITEMASK_XYZW) { - /* MATH on Gfx6 must be align1, so we can't do writemasks. */ - math->dst = dst_reg(this, glsl_vec4_type()); - math->dst.type = dst.type; - math = emit(MOV(dst, src_reg(math->dst))); - } else if (devinfo->ver < 6) { - math->base_mrf = 1; - math->mlen = src1.file == BAD_FILE ? 
1 : 2; - } - - return math; -} - -void -vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0) -{ - if (devinfo->ver < 7) { - unreachable("ir_unop_pack_half_2x16 should be lowered"); - } - - assert(dst.type == BRW_REGISTER_TYPE_UD); - assert(src0.type == BRW_REGISTER_TYPE_F); - - /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16: - * - * Because this instruction does not have a 16-bit floating-point type, - * the destination data type must be Word (W). - * - * The destination must be DWord-aligned and specify a horizontal stride - * (HorzStride) of 2. The 16-bit result is stored in the lower word of - * each destination channel and the upper word is not modified. - * - * The above restriction implies that the f32to16 instruction must use - * align1 mode, because only in align1 mode is it possible to specify - * horizontal stride. We choose here to defy the hardware docs and emit - * align16 instructions. - * - * (I [chadv] did attempt to emit align1 instructions for VS f32to16 - * instructions. I was partially successful in that the code passed all - * tests. However, the code was dubiously correct and fragile, and the - * tests were not harsh enough to probe that frailty. Not trusting the - * code, I chose instead to remain in align16 mode in defiance of the hw - * docs). - * - * I've [chadv] experimentally confirmed that, on gfx7 hardware and the - * simulator, emitting a f32to16 in align16 mode with UD as destination - * data type is safe. The behavior differs from that specified in the PRM - * in that the upper word of each destination channel is cleared to 0. - */ - - dst_reg tmp_dst(this, glsl_uvec2_type()); - src_reg tmp_src(tmp_dst); - -#if 0 - /* Verify the undocumented behavior on which the following instructions - * rely. If f32to16 fails to clear the upper word of the X and Y channels, - * then the result of the bit-or instruction below will be incorrect. 
- * - * You should inspect the disasm output in order to verify that the MOV is - * not optimized away. - */ - emit(MOV(tmp_dst, brw_imm_ud(0x12345678u))); -#endif - - /* Give tmp the form below, where "." means untouched. - * - * w z y x w z y x - * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll| - * - * That the upper word of each write-channel be 0 is required for the - * following bit-shift and bit-or instructions to work. Note that this - * relies on the undocumented hardware behavior mentioned above. - */ - tmp_dst.writemask = WRITEMASK_XY; - emit(F32TO16(tmp_dst, src0)); - - /* Give the write-channels of dst the form: - * 0xhhhh0000 - */ - tmp_src.swizzle = BRW_SWIZZLE_YYYY; - emit(SHL(dst, tmp_src, brw_imm_ud(16u))); - - /* Finally, give the write-channels of dst the form of packHalf2x16's - * output: - * 0xhhhhllll - */ - tmp_src.swizzle = BRW_SWIZZLE_XXXX; - emit(OR(dst, src_reg(dst), tmp_src)); -} - -void -vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0) -{ - if (devinfo->ver < 7) { - unreachable("ir_unop_unpack_half_2x16 should be lowered"); - } - - assert(dst.type == BRW_REGISTER_TYPE_F); - assert(src0.type == BRW_REGISTER_TYPE_UD); - - /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32: - * - * Because this instruction does not have a 16-bit floating-point type, - * the source data type must be Word (W). The destination type must be - * F (Float). - * - * To use W as the source data type, we must adjust horizontal strides, - * which is only possible in align1 mode. All my [chadv] attempts at - * emitting align1 instructions for unpackHalf2x16 failed to pass the - * Piglit tests, so I gave up. - * - * I've verified that, on gfx7 hardware and the simulator, it is safe to - * emit f16to32 in align16 mode with UD as source data type. 
- */ - - dst_reg tmp_dst(this, glsl_uvec2_type()); - src_reg tmp_src(tmp_dst); - - tmp_dst.writemask = WRITEMASK_X; - emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu))); - - tmp_dst.writemask = WRITEMASK_Y; - emit(SHR(tmp_dst, src0, brw_imm_ud(16u))); - - dst.writemask = WRITEMASK_XY; - emit(F16TO32(dst, tmp_src)); -} - -void -vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0) -{ - /* Instead of splitting the 32-bit integer, shifting, and ORing it back - * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate - * is not suitable to generate the shift values, but we can use the packed - * vector float and a type-converting MOV. - */ - dst_reg shift(this, glsl_uvec4_type()); - emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78))); - - dst_reg shifted(this, glsl_uvec4_type()); - src0.swizzle = BRW_SWIZZLE_XXXX; - emit(SHR(shifted, src0, src_reg(shift))); - - shifted.type = BRW_REGISTER_TYPE_UB; - dst_reg f(this, glsl_vec4_type()); - emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted)); - - emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f))); -} - -void -vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0) -{ - /* Instead of splitting the 32-bit integer, shifting, and ORing it back - * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate - * is not suitable to generate the shift values, but we can use the packed - * vector float and a type-converting MOV. 
- */ - dst_reg shift(this, glsl_uvec4_type()); - emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78))); - - dst_reg shifted(this, glsl_uvec4_type()); - src0.swizzle = BRW_SWIZZLE_XXXX; - emit(SHR(shifted, src0, src_reg(shift))); - - shifted.type = BRW_REGISTER_TYPE_B; - dst_reg f(this, glsl_vec4_type()); - emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted)); - - dst_reg scaled(this, glsl_vec4_type()); - emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f))); - - dst_reg max(this, glsl_vec4_type()); - emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f)); - emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f)); -} - -void -vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0) -{ - dst_reg saturated(this, glsl_vec4_type()); - vec4_instruction *inst = emit(MOV(saturated, src0)); - inst->saturate = true; - - dst_reg scaled(this, glsl_vec4_type()); - emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f))); - - dst_reg rounded(this, glsl_vec4_type()); - emit(RNDE(rounded, src_reg(scaled))); - - dst_reg u(this, glsl_uvec4_type()); - emit(MOV(u, src_reg(rounded))); - - src_reg bytes(u); - emit(VEC4_OPCODE_PACK_BYTES, dst, bytes); -} - -void -vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0) -{ - dst_reg max(this, glsl_vec4_type()); - emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f)); - - dst_reg min(this, glsl_vec4_type()); - emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f)); - - dst_reg scaled(this, glsl_vec4_type()); - emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f))); - - dst_reg rounded(this, glsl_vec4_type()); - emit(RNDE(rounded, src_reg(scaled))); - - dst_reg i(this, glsl_ivec4_type()); - emit(MOV(i, src_reg(rounded))); - - src_reg bytes(i); - emit(VEC4_OPCODE_PACK_BYTES, dst, bytes); -} - -src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type) -{ - init(); - - this->file = VGRF; - this->nr = v->alloc.allocate(type_size_vec4(type, false)); - - if 
(glsl_type_is_array(type) || glsl_type_is_struct(type)) { - this->swizzle = BRW_SWIZZLE_NOOP; - } else { - this->swizzle = brw_swizzle_for_size(type->vector_elements); - } - - this->type = brw_type_for_base_type(type); -} - -src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size) -{ - assert(size > 0); - - init(); - - this->file = VGRF; - this->nr = v->alloc.allocate(type_size_vec4(type, false) * size); - - this->swizzle = BRW_SWIZZLE_NOOP; - - this->type = brw_type_for_base_type(type); -} - -dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type) -{ - init(); - - this->file = VGRF; - this->nr = v->alloc.allocate(type_size_vec4(type, false)); - - if (glsl_type_is_array(type) || glsl_type_is_struct(type)) { - this->writemask = WRITEMASK_XYZW; - } else { - this->writemask = (1 << type->vector_elements) - 1; - } - - this->type = brw_type_for_base_type(type); -} - -vec4_instruction * -vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst, - src_reg src0, src_reg src1) -{ - vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1); - inst->conditional_mod = conditionalmod; - return inst; -} - -/** - * Emits the instructions needed to perform a pull constant load. before_block - * and before_inst can be NULL in which case the instruction will be appended - * to the end of the instruction list. 
- */ -void -vec4_visitor::emit_pull_constant_load_reg(dst_reg dst, - src_reg surf_index, - src_reg offset_reg, - bblock_t *before_block, - vec4_instruction *before_inst) -{ - assert((before_inst == NULL && before_block == NULL) || - (before_inst && before_block)); - - vec4_instruction *pull; - - if (devinfo->ver >= 7) { - dst_reg grf_offset = dst_reg(this, glsl_uint_type()); - - grf_offset.type = offset_reg.type; - - pull = MOV(grf_offset, offset_reg); - - if (before_inst) - emit_before(before_block, before_inst, pull); - else - emit(pull); - - pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GFX7, - dst, - surf_index, - src_reg(grf_offset)); - pull->mlen = 1; - } else { - pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD, - dst, - surf_index, - offset_reg); - pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->ver) + 1; - pull->mlen = 1; - } - - if (before_inst) - emit_before(before_block, before_inst, pull); - else - emit(pull); -} - -src_reg -vec4_visitor::emit_uniformize(const src_reg &src) -{ - const src_reg chan_index(this, glsl_uint_type()); - const dst_reg dst = retype(dst_reg(this, glsl_uint_type()), - src.type); - - emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index)) - ->force_writemask_all = true; - emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index) - ->force_writemask_all = true; - - return src_reg(dst); -} - -void -vec4_visitor::gs_emit_vertex(int /* stream_id */) -{ - unreachable("not reached"); -} - -void -vec4_visitor::gs_end_primitive() -{ - unreachable("not reached"); -} - -void -vec4_visitor::emit_ndc_computation() -{ - if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE) - return; - - /* Get the position */ - src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]); - - /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */ - dst_reg ndc = dst_reg(this, glsl_vec4_type()); - output_reg[BRW_VARYING_SLOT_NDC][0] = ndc; - output_num_components[BRW_VARYING_SLOT_NDC][0] = 4; - - current_annotation = "NDC"; - dst_reg 
ndc_w = ndc; - ndc_w.writemask = WRITEMASK_W; - src_reg pos_w = pos; - pos_w.swizzle = BRW_SWIZZLE4(BRW_SWIZZLE_W, BRW_SWIZZLE_W, BRW_SWIZZLE_W, BRW_SWIZZLE_W); - emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w); - - dst_reg ndc_xyz = ndc; - ndc_xyz.writemask = WRITEMASK_XYZ; - - emit(MUL(ndc_xyz, pos, src_reg(ndc_w))); -} - -void -vec4_visitor::emit_psiz_and_flags(dst_reg reg) -{ - if (devinfo->ver < 6 && - ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) || - output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE || - devinfo->has_negative_rhw_bug)) { - dst_reg header1 = dst_reg(this, glsl_uvec4_type()); - dst_reg header1_w = header1; - header1_w.writemask = WRITEMASK_W; - - emit(MOV(header1, brw_imm_ud(0u))); - - if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) { - src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]); - - current_annotation = "Point size"; - emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11)))); - emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8))); - } - - if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) { - current_annotation = "Clipping flags"; - dst_reg flags0 = dst_reg(this, glsl_uint_type()); - - emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L)); - emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0)); - emit(OR(header1_w, src_reg(header1_w), src_reg(flags0))); - } - - if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) { - dst_reg flags1 = dst_reg(this, glsl_uint_type()); - emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L)); - emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0)); - emit(SHL(flags1, src_reg(flags1), brw_imm_d(4))); - emit(OR(header1_w, src_reg(header1_w), src_reg(flags1))); - } - - /* i965 clipping workaround: - * 1) Test for -ve rhw - * 2) If set, - * set ndc = (0,0,0,0) - * set ucp[6] = 1 - * - * Later, clipping will detect ucp[6] and ensure the 
primitive is - * clipped against all fixed planes. - */ - if (devinfo->has_negative_rhw_bug && - output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) { - src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]); - ndc_w.swizzle = BRW_SWIZZLE_WWWW; - emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L)); - vec4_instruction *inst; - inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6))); - inst->predicate = BRW_PREDICATE_NORMAL; - output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F; - inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f))); - inst->predicate = BRW_PREDICATE_NORMAL; - } - - emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1))); - } else if (devinfo->ver < 6) { - emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u))); - } else { - emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0))); - if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) { - dst_reg reg_w = reg; - reg_w.writemask = WRITEMASK_W; - src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]); - reg_as_src.type = reg_w.type; - reg_as_src.swizzle = brw_swizzle_for_size(1); - emit(MOV(reg_w, reg_as_src)); - } - if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) { - dst_reg reg_y = reg; - reg_y.writemask = WRITEMASK_Y; - reg_y.type = BRW_REGISTER_TYPE_D; - output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type; - emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0]))); - } - if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) { - dst_reg reg_z = reg; - reg_z.writemask = WRITEMASK_Z; - reg_z.type = BRW_REGISTER_TYPE_D; - output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type; - emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0]))); - } - } -} - -vec4_instruction * -vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component) -{ - assert(varying < VARYING_SLOT_MAX); - - unsigned num_comps = output_num_components[varying][component]; - if (num_comps == 0) - return NULL; - - 
assert(output_reg[varying][component].type == reg.type); - current_annotation = output_reg_annotation[varying]; - if (output_reg[varying][component].file != BAD_FILE) { - src_reg src = src_reg(output_reg[varying][component]); - src.swizzle = BRW_SWZ_COMP_OUTPUT(component); - reg.writemask = - brw_writemask_for_component_packing(num_comps, component); - return emit(MOV(reg, src)); - } - return NULL; -} - -void -vec4_visitor::emit_urb_slot(dst_reg reg, int varying) -{ - reg.type = BRW_REGISTER_TYPE_F; - output_reg[varying][0].type = reg.type; - - switch (varying) { - case VARYING_SLOT_PSIZ: - { - /* PSIZ is always in slot 0, and is coupled with other flags. */ - current_annotation = "indices, point width, clip flags"; - emit_psiz_and_flags(reg); - break; - } - case BRW_VARYING_SLOT_NDC: - current_annotation = "NDC"; - if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) - emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]))); - break; - case VARYING_SLOT_POS: - current_annotation = "gl_Position"; - if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE) - emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0]))); - break; - case BRW_VARYING_SLOT_PAD: - /* No need to write to this slot */ - break; - default: - for (int i = 0; i < 4; i++) { - emit_generic_urb_slot(reg, varying, i); - } - break; - } -} - -static unsigned -align_interleaved_urb_mlen(const struct intel_device_info *devinfo, - unsigned mlen) -{ - if (devinfo->ver >= 6) { - /* URB data written (does not include the message header reg) must - * be a multiple of 256 bits, or 2 VS registers. See vol5c.5, - * section 5.4.3.2.2: URB_INTERLEAVED. - * - * URB entries are allocated on a multiple of 1024 bits, so an - * extra 128 bits written here to make the end align to 256 is - * no problem. - */ - if ((mlen % 2) != 1) - mlen++; - } - - return mlen; -} - - -/** - * Generates the VUE payload plus the necessary URB write instructions to - * output it. - * - * The VUE layout is documented in Volume 2a. 
- */ -void -vec4_visitor::emit_vertex() -{ - /* MRF 0 is reserved for the debugger, so start with message header - * in MRF 1. - */ - int base_mrf = 1; - int mrf = base_mrf; - /* In the process of generating our URB write message contents, we - * may need to unspill a register or load from an array. Those - * reads would use MRFs 14-15. - */ - int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver); - - /* The following assertion verifies that max_usable_mrf causes an - * even-numbered amount of URB write data, which will meet gfx6's - * requirements for length alignment. - */ - assert ((max_usable_mrf - base_mrf) % 2 == 0); - - /* First mrf is the g0-based message header containing URB handles and - * such. - */ - emit_urb_write_header(mrf++); - - if (devinfo->ver < 6) { - emit_ndc_computation(); - } - - /* We may need to split this up into several URB writes, so do them in a - * loop. - */ - int slot = 0; - bool complete = false; - do { - /* URB offset is in URB row increments, and each of our MRFs is half of - * one of those, since we're doing interleaved writes. - */ - int offset = slot / 2; - - mrf = base_mrf + 1; - for (; slot < prog_data->vue_map.num_slots; ++slot) { - emit_urb_slot(dst_reg(MRF, mrf++), - prog_data->vue_map.slot_to_varying[slot]); - - /* If this was max_usable_mrf, we can't fit anything more into this - * URB WRITE. Same thing if we reached the maximum length available. 
- */ - if (mrf > max_usable_mrf || - align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) { - slot++; - break; - } - } - - complete = slot >= prog_data->vue_map.num_slots; - current_annotation = "URB write"; - vec4_instruction *inst = emit_urb_write_opcode(complete); - inst->base_mrf = base_mrf; - inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf); - inst->offset += offset; - } while(!complete); -} - - -src_reg -vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst, - src_reg *reladdr, int reg_offset) -{ - /* Because we store the values to scratch interleaved like our - * vertex data, we need to scale the vec4 index by 2. - */ - int message_header_scale = 2; - - /* Pre-gfx6, the message header uses byte offsets instead of vec4 - * (16-byte) offset units. - */ - if (devinfo->ver < 6) - message_header_scale *= 16; - - if (reladdr) { - /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have - * to multiply the reladdr by 2. Notice that the reg_offset part - * is in units of 16 bytes and is used to select the low/high 16-byte - * chunk of a full dvec4, so we don't want to multiply that part. - */ - src_reg index = src_reg(this, glsl_int_type()); - if (type_sz(inst->dst.type) < 8) { - emit_before(block, inst, ADD(dst_reg(index), *reladdr, - brw_imm_d(reg_offset))); - emit_before(block, inst, MUL(dst_reg(index), index, - brw_imm_d(message_header_scale))); - } else { - emit_before(block, inst, MUL(dst_reg(index), *reladdr, - brw_imm_d(message_header_scale * 2))); - emit_before(block, inst, ADD(dst_reg(index), index, - brw_imm_d(reg_offset * message_header_scale))); - } - return index; - } else { - return brw_imm_d(reg_offset * message_header_scale); - } -} - -/** - * Emits an instruction before @inst to load the value named by @orig_src - * from scratch space at @base_offset to @temp. - * - * @base_offset is measured in 32-byte units (the size of a register). 
- */ -void -vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst, - dst_reg temp, src_reg orig_src, - int base_offset) -{ - assert(orig_src.offset % REG_SIZE == 0); - int reg_offset = base_offset + orig_src.offset / REG_SIZE; - src_reg index = get_scratch_offset(block, inst, orig_src.reladdr, - reg_offset); - - if (type_sz(orig_src.type) < 8) { - emit_before(block, inst, SCRATCH_READ(temp, index)); - } else { - dst_reg shuffled = dst_reg(this, glsl_dvec4_type()); - dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F); - emit_before(block, inst, SCRATCH_READ(shuffled_float, index)); - index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1); - vec4_instruction *last_read = - SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index); - emit_before(block, inst, last_read); - shuffle_64bit_data(temp, src_reg(shuffled), false, true, block, last_read); - } -} - -/** - * Emits an instruction after @inst to store the value to be written - * to @orig_dst to scratch space at @base_offset, from @temp. - * - * @base_offset is measured in 32-byte units (the size of a register). - */ -void -vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst, - int base_offset) -{ - assert(inst->dst.offset % REG_SIZE == 0); - int reg_offset = base_offset + inst->dst.offset / REG_SIZE; - src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr, - reg_offset); - - /* Create a temporary register to store *inst's result in. - * - * We have to be careful in MOVing from our temporary result register in - * the scratch write. If we swizzle from channels of the temporary that - * weren't initialized, it will confuse live interval analysis, which will - * make spilling fail to make progress. - */ - bool is_64bit = type_sz(inst->dst.type) == 8; - const glsl_type *alloc_type = - is_64bit ? 
glsl_dvec4_type() : glsl_vec4_type(); - const src_reg temp = swizzle(retype(src_reg(this, alloc_type), - inst->dst.type), - brw_swizzle_for_mask(inst->dst.writemask)); - - if (!is_64bit) { - dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), - inst->dst.writemask)); - vec4_instruction *write = SCRATCH_WRITE(dst, temp, index); - if (inst->opcode != BRW_OPCODE_SEL) - write->predicate = inst->predicate; - write->ir = inst->ir; - write->annotation = inst->annotation; - inst->insert_after(block, write); - } else { - dst_reg shuffled = dst_reg(this, alloc_type); - vec4_instruction *last = - shuffle_64bit_data(shuffled, temp, true, true, block, inst); - src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F)); - - uint8_t mask = 0; - if (inst->dst.writemask & WRITEMASK_X) - mask |= WRITEMASK_XY; - if (inst->dst.writemask & WRITEMASK_Y) - mask |= WRITEMASK_ZW; - if (mask) { - dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask)); - - vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index); - if (inst->opcode != BRW_OPCODE_SEL) - write->predicate = inst->predicate; - write->ir = inst->ir; - write->annotation = inst->annotation; - last->insert_after(block, write); - } - - mask = 0; - if (inst->dst.writemask & WRITEMASK_Z) - mask |= WRITEMASK_XY; - if (inst->dst.writemask & WRITEMASK_W) - mask |= WRITEMASK_ZW; - if (mask) { - dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask)); - - src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr, - reg_offset + 1); - vec4_instruction *write = - SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index); - if (inst->opcode != BRW_OPCODE_SEL) - write->predicate = inst->predicate; - write->ir = inst->ir; - write->annotation = inst->annotation; - last->insert_after(block, write); - } - } - - inst->dst.file = temp.file; - inst->dst.nr = temp.nr; - inst->dst.offset %= REG_SIZE; - inst->dst.reladdr = NULL; -} - -/** - * Checks if \p src and/or \p src.reladdr require a 
scratch read, and if so, - * adds the scratch read(s) before \p inst. The function also checks for - * recursive reladdr scratch accesses, issuing the corresponding scratch - * loads and rewriting reladdr references accordingly. - * - * \return \p src if it did not require a scratch load, otherwise, the - * register holding the result of the scratch load that the caller should - * use to rewrite src. - */ -src_reg -vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block, - vec4_instruction *inst, src_reg src) -{ - /* Resolve recursive reladdr scratch access by calling ourselves - * with src.reladdr - */ - if (src.reladdr) - *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst, - *src.reladdr); - - /* Now handle scratch access on src */ - if (src.file == VGRF && scratch_loc[src.nr] != -1) { - dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ? - glsl_dvec4_type() : glsl_vec4_type()); - emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]); - src.nr = temp.nr; - src.offset %= REG_SIZE; - src.reladdr = NULL; - } - - return src; -} - -/** - * We can't generally support array access in GRF space, because a - * single instruction's destination can only span 2 contiguous - * registers. So, we send all GRF arrays that get variable index - * access to scratch space. - */ -void -vec4_visitor::move_grf_array_access_to_scratch() -{ - int scratch_loc[this->alloc.count]; - memset(scratch_loc, -1, sizeof(scratch_loc)); - - /* First, calculate the set of virtual GRFs that need to be punted - * to scratch due to having any array access on them, and where in - * scratch. 
- */ - foreach_block_and_inst(block, vec4_instruction, inst, cfg) { - if (inst->dst.file == VGRF && inst->dst.reladdr) { - if (scratch_loc[inst->dst.nr] == -1) { - scratch_loc[inst->dst.nr] = last_scratch; - last_scratch += this->alloc.sizes[inst->dst.nr]; - } - - for (src_reg *iter = inst->dst.reladdr; - iter->reladdr; - iter = iter->reladdr) { - if (iter->file == VGRF && scratch_loc[iter->nr] == -1) { - scratch_loc[iter->nr] = last_scratch; - last_scratch += this->alloc.sizes[iter->nr]; - } - } - } - - for (int i = 0 ; i < 3; i++) { - for (src_reg *iter = &inst->src[i]; - iter->reladdr; - iter = iter->reladdr) { - if (iter->file == VGRF && scratch_loc[iter->nr] == -1) { - scratch_loc[iter->nr] = last_scratch; - last_scratch += this->alloc.sizes[iter->nr]; - } - } - } - } - - /* Now, for anything that will be accessed through scratch, rewrite - * it to load/store. Note that this is a _safe list walk, because - * we may generate a new scratch_write instruction after the one - * we're processing. - */ - foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { - /* Set up the annotation tracking for new generated instructions. */ - base_ir = inst->ir; - current_annotation = inst->annotation; - - /* First handle scratch access on the dst. Notice we have to handle - * the case where the dst's reladdr also points to scratch space. - */ - if (inst->dst.reladdr) - *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst, - *inst->dst.reladdr); - - /* Now that we have handled any (possibly recursive) reladdr scratch - * accesses for dst we can safely do the scratch write for dst itself - */ - if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1) - emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]); - - /* Now handle scratch access on any src. 
In this case, since inst->src[i] - * already is a src_reg, we can just call emit_resolve_reladdr with - * inst->src[i] and it will take care of handling scratch loads for - * both src and src.reladdr (recursively). - */ - for (int i = 0 ; i < 3; i++) { - inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst, - inst->src[i]); - } - } -} - -void -vec4_visitor::resolve_ud_negate(src_reg *reg) -{ - if (reg->type != BRW_REGISTER_TYPE_UD || - !reg->negate) - return; - - src_reg temp = src_reg(this, glsl_uvec4_type()); - emit(BRW_OPCODE_MOV, dst_reg(temp), *reg); - *reg = temp; -} - -static brw_rnd_mode -brw_rnd_mode_from_execution_mode(unsigned execution_mode) -{ - if (nir_has_any_rounding_mode_rtne(execution_mode)) - return BRW_RND_MODE_RTNE; - if (nir_has_any_rounding_mode_rtz(execution_mode)) - return BRW_RND_MODE_RTZ; - return BRW_RND_MODE_UNSPECIFIED; -} - -void -vec4_visitor::emit_shader_float_controls_execution_mode() -{ - unsigned execution_mode = this->nir->info.float_controls_execution_mode; - if (nir_has_any_rounding_mode_enabled(execution_mode)) { - brw_rnd_mode rnd = brw_rnd_mode_from_execution_mode(execution_mode); - const vec4_builder bld = vec4_builder(this).at_end(); - bld.exec_all().emit(SHADER_OPCODE_RND_MODE, dst_null_ud(), brw_imm_d(rnd)); - } -} - -vec4_visitor::vec4_visitor(const struct brw_compiler *compiler, - const struct brw_compile_params *params, - const struct brw_sampler_prog_key_data *key_tex, - struct brw_vue_prog_data *prog_data, - const nir_shader *shader, - bool no_spills, - bool debug_enabled) - : backend_shader(compiler, params, shader, &prog_data->base, debug_enabled), - key_tex(key_tex), - prog_data(prog_data), - fail_msg(NULL), - first_non_payload_grf(0), - ubo_push_start(), - push_length(0), - live_analysis(this), performance_analysis(this), - no_spills(no_spills), - last_scratch(0) -{ - this->failed = false; - - this->base_ir = NULL; - this->current_annotation = NULL; - memset(this->output_reg_annotation, 0, 
sizeof(this->output_reg_annotation)); - - memset(this->output_num_components, 0, sizeof(this->output_num_components)); - - this->max_grf = devinfo->ver >= 7 ? GFX7_MRF_HACK_START : BRW_MAX_GRF; - - this->uniforms = 0; - - this->nir_ssa_values = NULL; -} - - -void -vec4_visitor::fail(const char *format, ...) -{ - va_list va; - char *msg; - - if (failed) - return; - - failed = true; - - va_start(va, format); - msg = ralloc_vasprintf(mem_ctx, format, va); - va_end(va); - msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", - _mesa_shader_stage_to_abbrev(stage), msg); - - this->fail_msg = msg; - - if (unlikely(debug_enabled)) { - fprintf(stderr, "%s", msg); - } -} - -} /* namespace brw */ diff --git a/src/intel/compiler/brw_vec4_vs.h b/src/intel/compiler/brw_vec4_vs.h deleted file mode 100644 index 0929df5ff3d..00000000000 --- a/src/intel/compiler/brw_vec4_vs.h +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright © 2006 - 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#ifndef BRW_VEC4_VS_VISITOR_H -#define BRW_VEC4_VS_VISITOR_H - -#include "brw_vec4.h" - -namespace brw { - -class vec4_vs_visitor : public vec4_visitor -{ -public: - vec4_vs_visitor(const struct brw_compiler *compiler, - const struct brw_compile_params *params, - const struct brw_vs_prog_key *key, - struct brw_vs_prog_data *vs_prog_data, - const nir_shader *shader, - bool debug_enabled); - -protected: - virtual void setup_payload(); - virtual void emit_prolog(); - virtual void emit_thread_end(); - virtual void emit_urb_write_header(int mrf); - virtual void emit_urb_slot(dst_reg reg, int varying); - virtual vec4_instruction *emit_urb_write_opcode(bool complete); - -private: - int setup_attributes(int payload_reg); - - const struct brw_vs_prog_key *const key; - struct brw_vs_prog_data * const vs_prog_data; -}; - -} /* namespace brw */ - -#endif /* BRW_VEC4_VS_VISITOR_H */ diff --git a/src/intel/compiler/brw_vec4_vs_visitor.cpp b/src/intel/compiler/brw_vec4_vs_visitor.cpp deleted file mode 100644 index c30a3434451..00000000000 --- a/src/intel/compiler/brw_vec4_vs_visitor.cpp +++ /dev/null @@ -1,108 +0,0 @@ -/* - * Copyright © 2013 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * 
paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - - -#include "brw_vec4_vs.h" -#include "dev/intel_debug.h" - -namespace brw { - -void -vec4_vs_visitor::emit_prolog() -{ -} - - -void -vec4_vs_visitor::emit_urb_write_header(int mrf) -{ - /* No need to do anything for VS; an implied write to this MRF will be - * performed by VEC4_VS_OPCODE_URB_WRITE. - */ - (void) mrf; -} - - -vec4_instruction * -vec4_vs_visitor::emit_urb_write_opcode(bool complete) -{ - vec4_instruction *inst = emit(VEC4_VS_OPCODE_URB_WRITE); - inst->urb_write_flags = complete ? - BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS; - - return inst; -} - - -void -vec4_vs_visitor::emit_urb_slot(dst_reg reg, int varying) -{ - reg.type = BRW_REGISTER_TYPE_F; - output_reg[varying][0].type = reg.type; - - switch (varying) { - case VARYING_SLOT_COL0: - case VARYING_SLOT_COL1: - case VARYING_SLOT_BFC0: - case VARYING_SLOT_BFC1: { - /* These built-in varyings are only supported in compatibility mode, - * and we only support GS in core profile. So, this must be a vertex - * shader. - */ - vec4_instruction *inst = emit_generic_urb_slot(reg, varying, 0); - if (inst && key->clamp_vertex_color) - inst->saturate = true; - break; - } - default: - return vec4_visitor::emit_urb_slot(reg, varying); - } -} - - -void -vec4_vs_visitor::emit_thread_end() -{ - /* For VS, we always end the thread by emitting a single vertex. 
- * emit_urb_write_opcode() will take care of setting the eot flag on the - * SEND instruction. - */ - emit_vertex(); -} - - -vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler, - const struct brw_compile_params *params, - const struct brw_vs_prog_key *key, - struct brw_vs_prog_data *vs_prog_data, - const nir_shader *shader, - bool debug_enabled) - : vec4_visitor(compiler, params, &key->base.tex, &vs_prog_data->base, - shader, false /* no_spills */, debug_enabled), - key(key), - vs_prog_data(vs_prog_data) -{ -} - - -} /* namespace brw */ diff --git a/src/intel/compiler/gfx6_gs_visitor.cpp b/src/intel/compiler/gfx6_gs_visitor.cpp deleted file mode 100644 index 5465094ed36..00000000000 --- a/src/intel/compiler/gfx6_gs_visitor.cpp +++ /dev/null @@ -1,702 +0,0 @@ -/* - * Copyright © 2014 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- * - * This code is based on original work by Ilia Mirkin. - */ - -/** - * \file gfx6_gs_visitor.cpp - * - * Gfx6 geometry shader implementation - */ - -#include "gfx6_gs_visitor.h" -#include "brw_eu.h" -#include "brw_prim.h" - -namespace brw { - -void -gfx6_gs_visitor::emit_prolog() -{ - vec4_gs_visitor::emit_prolog(); - - /* Gfx6 geometry shaders require to allocate an initial VUE handle via - * FF_SYNC message, however the documentation remarks that only one thread - * can write to the URB simultaneously and the FF_SYNC message provides the - * synchronization mechanism for this, so using this message effectively - * stalls the thread until it is its turn to write to the URB. Because of - * this, the best way to implement geometry shader algorithms in gfx6 is to - * execute the algorithm before the FF_SYNC message to maximize parallelism. - * - * To achieve this we buffer the geometry shader outputs for each emitted - * vertex in vertex_output during operation. Then, when we have processed - * the last vertex (that is, at thread end time), we send the FF_SYNC - * message to allocate the initial VUE handle and write all buffered vertex - * data to the URB in one go. - * - * For each emitted vertex, vertex_output will hold vue_map.num_slots - * data items plus one additional item to hold required flags - * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message) - * which come right after the data items for that vertex. Vertex data and - * flags for the next vertex come right after the data items and flags for - * the previous vertex. - */ - this->current_annotation = "gfx6 prolog"; - this->vertex_output = src_reg(this, - glsl_uint_type(), - (prog_data->vue_map.num_slots + 1) * - nir->info.gs.vertices_out); - this->vertex_output_offset = src_reg(this, glsl_uint_type()); - emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u))); - - /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES), - * so initialize it once to R0. 
- */ - vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1), - retype(brw_vec8_grf(0, 0), - BRW_REGISTER_TYPE_UD))); - inst->force_writemask_all = true; - - /* This will be used as a temporary to store writeback data of FF_SYNC - * and URB_WRITE messages. - */ - this->temp = src_reg(this, glsl_uint_type()); - - /* This will be used to know when we are processing the first vertex of - * a primitive. We will set this to URB_WRITE_PRIM_START only when we know - * that we are processing the first vertex in the primitive and to zero - * otherwise. This way we can use its value directly in the URB write - * headers. - */ - this->first_vertex = src_reg(this, glsl_uint_type()); - emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START))); - - /* The FF_SYNC message requires to know the number of primitives generated, - * so keep a counter for this. - */ - this->prim_count = src_reg(this, glsl_uint_type()); - emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u))); - - if (gs_prog_data->num_transform_feedback_bindings) { - /* Create a virtual register to hold destination indices in SOL */ - this->destination_indices = src_reg(this, glsl_uvec4_type()); - /* Create a virtual register to hold number of written primitives */ - this->sol_prim_written = src_reg(this, glsl_uint_type()); - /* Create a virtual register to hold Streamed Vertex Buffer Indices */ - this->svbi = src_reg(this, glsl_uvec4_type()); - /* Create a virtual register to hold max values of SVBI */ - this->max_svbi = src_reg(this, glsl_uvec4_type()); - emit(MOV(dst_reg(this->max_svbi), - src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD)))); - } - - /* PrimitveID is delivered in r0.1 of the thread payload. If the program - * needs it we have to move it to a separate register where we can map - * the attribute. 
- * - * Notice that we cannot use a virtual register for this, because we need to - * map all input attributes to hardware registers in setup_payload(), - * which happens before virtual registers are mapped to hardware registers. - * We could work around that issue if we were able to compute the first - * non-payload register here and move the PrimitiveID information to that - * register, but we can't because at this point we don't know the final - * number uniforms that will be included in the payload. - * - * So, what we do is to place PrimitiveID information in r1, which is always - * delivered as part of the payload, but its only populated with data - * relevant for transform feedback when we set GFX6_GS_SVBI_PAYLOAD_ENABLE - * in the 3DSTATE_GS state packet. That information can be obtained by other - * means though, so we can safely use r1 for this purpose. - */ - if (gs_prog_data->include_primitive_id) { - this->primitive_id = - src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)); - emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id)); - } -} - -void -gfx6_gs_visitor::gs_emit_vertex(int stream_id) -{ - this->current_annotation = "gfx6 emit vertex"; - - /* Buffer all output slots for this vertex in vertex_output */ - for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) { - int varying = prog_data->vue_map.slot_to_varying[slot]; - if (varying != VARYING_SLOT_PSIZ) { - dst_reg dst(this->vertex_output); - dst.reladdr = ralloc(mem_ctx, src_reg); - memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); - emit_urb_slot(dst, varying); - } else { - /* The PSIZ slot can pack multiple varyings in different channels - * and emit_urb_slot() will produce a MOV instruction for each of - * them. 
Since we are writing to an array, that will translate to - * possibly multiple MOV instructions with an array destination and - * each will generate a scratch write with the same offset into - * scratch space (thus, each one overwriting the previous). This is - * not what we want. What we will do instead is emit PSIZ to a - * a regular temporary register, then move that register into the - * array. This way we only have one instruction with an array - * destination and we only produce a single scratch write. - */ - dst_reg tmp = dst_reg(src_reg(this, glsl_uvec4_type())); - emit_urb_slot(tmp, varying); - dst_reg dst(this->vertex_output); - dst.reladdr = ralloc(mem_ctx, src_reg); - memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); - vec4_instruction *inst = emit(MOV(dst, src_reg(tmp))); - inst->force_writemask_all = true; - } - - emit(ADD(dst_reg(this->vertex_output_offset), - this->vertex_output_offset, brw_imm_ud(1u))); - } - - /* Now buffer flags for this vertex */ - dst_reg dst(this->vertex_output); - dst.reladdr = ralloc(mem_ctx, src_reg); - memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg)); - if (nir->info.gs.output_primitive == MESA_PRIM_POINTS) { - /* If we are outputting points, then every vertex has PrimStart and - * PrimEnd set. - */ - emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) | - URB_WRITE_PRIM_START | URB_WRITE_PRIM_END))); - emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u))); - } else { - /* Otherwise, we can only set the PrimStart flag, which we have stored - * in the first_vertex register. We will have to wait until we execute - * EndPrimitive() or we end the thread to set the PrimEnd flag on a - * vertex. 
- */ - emit(OR(dst, this->first_vertex, - brw_imm_ud(gs_prog_data->output_topology << - URB_WRITE_PRIM_TYPE_SHIFT))); - emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u))); - } - emit(ADD(dst_reg(this->vertex_output_offset), - this->vertex_output_offset, brw_imm_ud(1u))); -} - -void -gfx6_gs_visitor::gs_end_primitive() -{ - this->current_annotation = "gfx6 end primitive"; - /* Calling EndPrimitive() is optional for point output. In this case we set - * the PrimEnd flag when we process EmitVertex(). - */ - if (nir->info.gs.output_primitive == MESA_PRIM_POINTS) - return; - - /* Otherwise we know that the last vertex we have processed was the last - * vertex in the primitive and we need to set its PrimEnd flag, so do this - * unless we haven't emitted that vertex at all (vertex_count != 0). - * - * Notice that we have already incremented vertex_count when we processed - * the last emit_vertex, so we need to take that into account in the - * comparison below (hence the num_output_vertices + 1 in the comparison - * below). - */ - unsigned num_output_vertices = nir->info.gs.vertices_out; - emit(CMP(dst_null_ud(), this->vertex_count, - brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L)); - vec4_instruction *inst = emit(CMP(dst_null_ud(), - this->vertex_count, brw_imm_ud(0u), - BRW_CONDITIONAL_NEQ)); - inst->predicate = BRW_PREDICATE_NORMAL; - emit(IF(BRW_PREDICATE_NORMAL)); - { - /* vertex_output_offset is already pointing at the first entry of the - * next vertex. So subtract 1 to modify the flags for the previous - * vertex. 
- */ - src_reg offset(this, glsl_uint_type()); - emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1))); - - src_reg dst(this->vertex_output); - dst.reladdr = ralloc(mem_ctx, src_reg); - memcpy(dst.reladdr, &offset, sizeof(src_reg)); - - emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END))); - emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u))); - - /* Set the first vertex flag to indicate that the next vertex will start - * a primitive. - */ - emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START))); - } - emit(BRW_OPCODE_ENDIF); -} - -void -gfx6_gs_visitor::emit_urb_write_header(int mrf) -{ - this->current_annotation = "gfx6 urb header"; - /* Compute offset of the flags for the current vertex in vertex_output and - * write them in dw2 of the message header. - * - * Notice that by the time that emit_thread_end() calls here - * vertex_output_offset should point to the first data item of the current - * vertex in vertex_output, thus we only need to add the number of output - * slots per vertex to that offset to obtain the flags data offset. - */ - src_reg flags_offset(this, glsl_uint_type()); - emit(ADD(dst_reg(flags_offset), - this->vertex_output_offset, - brw_imm_d(prog_data->vue_map.num_slots))); - - src_reg flags_data(this->vertex_output); - flags_data.reladdr = ralloc(mem_ctx, src_reg); - memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg)); - - emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data); -} - -static unsigned -align_interleaved_urb_mlen(unsigned mlen) -{ - /* URB data written (does not include the message header reg) must - * be a multiple of 256 bits, or 2 VS registers. See vol5c.5, - * section 5.4.3.2.2: URB_INTERLEAVED. 
- */ - if ((mlen % 2) != 1) - mlen++; - return mlen; -} - -void -gfx6_gs_visitor::emit_snb_gs_urb_write_opcode(bool complete, int base_mrf, - int last_mrf, int urb_offset) -{ - vec4_instruction *inst = NULL; - - if (!complete) { - /* If the vertex is not complete we don't have to do anything special */ - inst = emit(VEC4_GS_OPCODE_URB_WRITE); - inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS; - } else { - /* Otherwise we always request to allocate a new VUE handle. If this is - * the last write before the EOT message and the new handle never gets - * used it will be dereferenced when we send the EOT message. This is - * necessary to avoid different setups for the EOT message (one for the - * case when there is no output and another for the case when there is) - * which would require to end the program with an IF/ELSE/ENDIF block, - * something we do not want. - */ - inst = emit(VEC4_GS_OPCODE_URB_WRITE_ALLOCATE); - inst->urb_write_flags = BRW_URB_WRITE_COMPLETE; - inst->dst = dst_reg(MRF, base_mrf); - inst->src[0] = this->temp; - } - - inst->base_mrf = base_mrf; - inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf); - inst->offset = urb_offset; -} - -void -gfx6_gs_visitor::emit_thread_end() -{ - /* Make sure the current primitive is ended: we know it is not ended when - * first_vertex is not zero. This is only relevant for outputs other than - * points because in the point case we set PrimEnd on all vertices. - */ - if (nir->info.gs.output_primitive != MESA_PRIM_POINTS) { - emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z)); - emit(IF(BRW_PREDICATE_NORMAL)); - gs_end_primitive(); - emit(BRW_OPCODE_ENDIF); - } - - /* Here we have to: - * 1) Emit an FF_SYNC message to obtain an initial VUE handle. - * 2) Loop over all buffered vertex data and write it to corresponding - * URB entries. - * 3) Allocate new VUE handles for all vertices other than the first. - * 4) Send a final EOT message. 
- */ - - /* MRF 0 is reserved for the debugger, so start with message header - * in MRF 1. - */ - int base_mrf = 1; - - /* In the process of generating our URB write message contents, we - * may need to unspill a register or load from an array. Those - * reads would use MRFs 21..23 - */ - int max_usable_mrf = FIRST_SPILL_MRF(devinfo->ver); - - /* Issue the FF_SYNC message and obtain the initial VUE handle. */ - this->current_annotation = "gfx6 thread end: ff_sync"; - - vec4_instruction *inst = NULL; - if (gs_prog_data->num_transform_feedback_bindings) { - src_reg sol_temp(this, glsl_uvec4_type()); - emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES, - dst_reg(this->svbi), - this->vertex_count, - this->prim_count, - sol_temp); - inst = emit(GS_OPCODE_FF_SYNC, - dst_reg(this->temp), this->prim_count, this->svbi); - } else { - inst = emit(GS_OPCODE_FF_SYNC, - dst_reg(this->temp), this->prim_count, brw_imm_ud(0u)); - } - inst->base_mrf = base_mrf; - - emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G)); - emit(IF(BRW_PREDICATE_NORMAL)); - { - /* Loop over all buffered vertices and emit URB write messages */ - this->current_annotation = "gfx6 thread end: urb writes init"; - src_reg vertex(this, glsl_uint_type()); - emit(MOV(dst_reg(vertex), brw_imm_ud(0u))); - emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u))); - - this->current_annotation = "gfx6 thread end: urb writes"; - emit(BRW_OPCODE_DO); - { - emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE)); - inst = emit(BRW_OPCODE_BREAK); - inst->predicate = BRW_PREDICATE_NORMAL; - - /* First we prepare the message header */ - emit_urb_write_header(base_mrf); - - /* Then add vertex data to the message in interleaved fashion */ - int slot = 0; - bool complete = false; - do { - int mrf = base_mrf + 1; - - /* URB offset is in URB row increments, and each of our MRFs is half - * of one of those, since we're doing interleaved writes. 
- */ - int urb_offset = slot / 2; - - for (; slot < prog_data->vue_map.num_slots; ++slot) { - int varying = prog_data->vue_map.slot_to_varying[slot]; - current_annotation = output_reg_annotation[varying]; - - /* Compute offset of this slot for the current vertex - * in vertex_output - */ - src_reg data(this->vertex_output); - data.reladdr = ralloc(mem_ctx, src_reg); - memcpy(data.reladdr, &this->vertex_output_offset, - sizeof(src_reg)); - - /* Copy this slot to the appropriate message register */ - dst_reg reg = dst_reg(MRF, mrf); - reg.type = output_reg[varying][0].type; - data.type = reg.type; - inst = emit(MOV(reg, data)); - inst->force_writemask_all = true; - - mrf++; - emit(ADD(dst_reg(this->vertex_output_offset), - this->vertex_output_offset, brw_imm_ud(1u))); - - /* If this was max_usable_mrf, we can't fit anything more into - * this URB WRITE. Same if we reached the max. message length. - */ - if (mrf > max_usable_mrf || - align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) { - slot++; - break; - } - } - - complete = slot >= prog_data->vue_map.num_slots; - emit_snb_gs_urb_write_opcode(complete, base_mrf, mrf, urb_offset); - } while (!complete); - - /* Skip over the flags data item so that vertex_output_offset points - * to the first data item of the next vertex, so that we can start - * writing the next vertex. - */ - emit(ADD(dst_reg(this->vertex_output_offset), - this->vertex_output_offset, brw_imm_ud(1u))); - - emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u))); - } - emit(BRW_OPCODE_WHILE); - - if (gs_prog_data->num_transform_feedback_bindings) - xfb_write(); - } - emit(BRW_OPCODE_ENDIF); - - /* Finally, emit EOT message. - * - * In gfx6 we need to end the thread differently depending on whether we have - * emitted at least one vertex or not. In case we did, the EOT message must - * always include the COMPLETE flag or else the GPU hangs. If we have not - * produced any output we can't use the COMPLETE flag. 
- * - * However, this would lead us to end the program with an ENDIF opcode, - * which we want to avoid, so what we do is that we always request a new - * VUE handle every time, even if GS produces no output. - * With this we make sure that whether we have emitted at least one vertex - * or none at all, we have to finish the thread without writing to the URB, - * which works for both cases by setting the COMPLETE and UNUSED flags in - * the EOT message. - */ - this->current_annotation = "gfx6 thread end: EOT"; - - if (gs_prog_data->num_transform_feedback_bindings) { - /* When emitting EOT, set SONumPrimsWritten Increment Value. */ - src_reg data(this, glsl_uint_type()); - emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu))); - emit(SHL(dst_reg(data), data, brw_imm_ud(16u))); - emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data); - } - - inst = emit(GS_OPCODE_THREAD_END); - inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED; - inst->base_mrf = base_mrf; - inst->mlen = 1; -} - -void -gfx6_gs_visitor::setup_payload() -{ - int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES]; - - /* Attributes are going to be interleaved, so one register contains two - * attribute slots. - */ - int attributes_per_reg = 2; - - /* If a geometry shader tries to read from an input that wasn't written by - * the vertex shader, that produces undefined results, but it shouldn't - * crash anything. So initialize attribute_map to zeros--that ensures that - * these undefined results are read from r0. - */ - memset(attribute_map, 0, sizeof(attribute_map)); - - int reg = 0; - - /* The payload always contains important data in r0. */ - reg++; - - /* r1 is always part of the payload and it holds information relevant - * for transform feedback when we set the GFX6_GS_SVBI_PAYLOAD_ENABLE bit in - * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID - * information (and move the original value to a virtual register if - * necessary). 
- */ - if (gs_prog_data->include_primitive_id) - attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg; - reg++; - - reg = setup_uniforms(reg); - - reg = setup_varying_inputs(reg, attributes_per_reg); - - this->first_non_payload_grf = reg; -} - -void -gfx6_gs_visitor::xfb_write() -{ - unsigned num_verts; - - switch (gs_prog_data->output_topology) { - case _3DPRIM_POINTLIST: - num_verts = 1; - break; - case _3DPRIM_LINELIST: - case _3DPRIM_LINESTRIP: - case _3DPRIM_LINELOOP: - num_verts = 2; - break; - case _3DPRIM_TRILIST: - case _3DPRIM_TRIFAN: - case _3DPRIM_TRISTRIP: - case _3DPRIM_RECTLIST: - num_verts = 3; - break; - case _3DPRIM_QUADLIST: - case _3DPRIM_QUADSTRIP: - case _3DPRIM_POLYGON: - num_verts = 3; - break; - default: - unreachable("Unexpected primitive type in Gfx6 SOL program."); - } - - this->current_annotation = "gfx6 thread end: svb writes init"; - - emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u))); - emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u))); - - /* Check that at least one primitive can be written - * - * Note: since we use the binding table to keep track of buffer offsets - * and stride, the GS doesn't need to keep track of a separate pointer - * into each buffer; it uses a single pointer which increments by 1 for - * each vertex. So we use SVBI0 for this pointer, regardless of whether - * transform feedback is in interleaved or separate attribs mode. - */ - src_reg sol_temp(this, glsl_uvec4_type()); - emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts))); - - /* Compare SVBI calculated number with the maximum value, which is - * in R1.4 (previously saved in this->max_svbi) for gfx6. 
- */ - emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE)); - emit(IF(BRW_PREDICATE_NORMAL)); - { - vec4_instruction *inst = emit(MOV(dst_reg(destination_indices), - brw_imm_vf4(brw_float_to_vf(0.0), - brw_float_to_vf(1.0), - brw_float_to_vf(2.0), - brw_float_to_vf(0.0)))); - inst->force_writemask_all = true; - - emit(ADD(dst_reg(this->destination_indices), - this->destination_indices, - this->svbi)); - } - emit(BRW_OPCODE_ENDIF); - - /* Write transform feedback data for all processed vertices. */ - for (int i = 0; i < (int)nir->info.gs.vertices_out; i++) { - emit(MOV(dst_reg(sol_temp), brw_imm_d(i))); - emit(CMP(dst_null_d(), sol_temp, this->vertex_count, - BRW_CONDITIONAL_L)); - emit(IF(BRW_PREDICATE_NORMAL)); - { - xfb_program(i, num_verts); - } - emit(BRW_OPCODE_ENDIF); - } -} - -void -gfx6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts) -{ - unsigned binding; - unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings; - src_reg sol_temp(this, glsl_uvec4_type()); - - /* Check for buffer overflow: we need room to write the complete primitive - * (all vertices). Otherwise, avoid writing any vertices for it - */ - emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u))); - emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts))); - emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi)); - emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE)); - emit(IF(BRW_PREDICATE_NORMAL)); - { - /* Avoid overwriting MRF 1 as it is used as URB write message header */ - dst_reg mrf_reg(MRF, 2); - - this->current_annotation = "gfx6: emit SOL vertex data"; - /* For each vertex, generate code to output each varying using the - * appropriate binding table entry. 
- */ - for (binding = 0; binding < num_bindings; ++binding) { - unsigned char varying = - gs_prog_data->transform_feedback_bindings[binding]; - - /* Set up the correct destination index for this vertex */ - vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX, - mrf_reg, - this->destination_indices); - inst->sol_vertex = vertex % num_verts; - - /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1: - * - * "Prior to End of Thread with a URB_WRITE, the kernel must - * ensure that all writes are complete by sending the final - * write as a committed write." - */ - bool final_write = binding == (unsigned) num_bindings - 1 && - inst->sol_vertex == num_verts - 1; - - /* Compute offset of this varying for the current vertex - * in vertex_output - */ - this->current_annotation = output_reg_annotation[varying]; - src_reg data(this->vertex_output); - data.reladdr = ralloc(mem_ctx, src_reg); - int offset = get_vertex_output_offset_for_varying(vertex, varying); - emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset))); - memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg)); - data.type = output_reg[varying][0].type; - data.swizzle = gs_prog_data->transform_feedback_swizzles[binding]; - - /* Write data */ - inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp); - inst->sol_binding = binding; - inst->sol_final_write = final_write; - - if (final_write) { - /* This is the last vertex of the primitive, then increment - * SO num primitive counter and destination indices. - */ - emit(ADD(dst_reg(this->destination_indices), - this->destination_indices, - brw_imm_ud(num_verts))); - emit(ADD(dst_reg(this->sol_prim_written), - this->sol_prim_written, brw_imm_ud(1u))); - } - - } - this->current_annotation = NULL; - } - emit(BRW_OPCODE_ENDIF); -} - -int -gfx6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying) -{ - /* Find the output slot assigned to this varying. 
- * - * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot - * as VARYING_SLOT_PSIZ. - */ - if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT) - varying = VARYING_SLOT_PSIZ; - int slot = prog_data->vue_map.varying_to_slot[varying]; - - if (slot < 0) { - /* This varying does not exist in the VUE so we are not writing to it - * and its value is undefined. We still want to return a valid offset - * into vertex_output though, to prevent any out-of-bound accesses into - * the vertex_output array. Since the value for this varying is undefined - * we don't really care for the value we assign to it, so any offset - * within the limits of vertex_output will do. - */ - slot = 0; - } - - return vertex * (prog_data->vue_map.num_slots + 1) + slot; -} - -} /* namespace brw */ diff --git a/src/intel/compiler/gfx6_gs_visitor.h b/src/intel/compiler/gfx6_gs_visitor.h deleted file mode 100644 index 61832a0cb6b..00000000000 --- a/src/intel/compiler/gfx6_gs_visitor.h +++ /dev/null @@ -1,84 +0,0 @@ -/* - * Copyright © 2014 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - */ - -#ifndef GFX6_GS_VISITOR_H -#define GFX6_GS_VISITOR_H - -#include "brw_vec4.h" -#include "brw_vec4_gs_visitor.h" - -#ifdef __cplusplus - -namespace brw { - -class gfx6_gs_visitor : public vec4_gs_visitor -{ -public: - gfx6_gs_visitor(const struct brw_compiler *comp, - const struct brw_compile_params *params, - struct brw_gs_compile *c, - struct brw_gs_prog_data *prog_data, - const nir_shader *shader, - bool no_spills, - bool debug_enabled) : - vec4_gs_visitor(comp, params, c, prog_data, shader, no_spills, debug_enabled) - { - } - -protected: - virtual void emit_prolog(); - virtual void emit_thread_end(); - virtual void gs_emit_vertex(int stream_id); - virtual void gs_end_primitive(); - virtual void emit_urb_write_header(int mrf); - virtual void setup_payload(); - -private: - void xfb_write(); - void xfb_program(unsigned vertex, unsigned num_verts); - int get_vertex_output_offset_for_varying(int vertex, int varying); - void emit_snb_gs_urb_write_opcode(bool complete, - int base_mrf, - int last_mrf, - int urb_offset); - - src_reg vertex_output; - src_reg vertex_output_offset; - src_reg temp; - src_reg first_vertex; - src_reg prim_count; - src_reg primitive_id; - - /* Transform Feedback members */ - src_reg sol_prim_written; - src_reg svbi; - src_reg max_svbi; - src_reg destination_indices; -}; - -} /* namespace brw */ - -#endif /* __cplusplus */ - -#endif /* GFX6_GS_VISITOR_H */ diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index 700da611dda..d61e98405ee 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -105,7 +105,6 @@ libintel_compiler_brw_files = files( 'brw_ir_fs.h', 'brw_ir_performance.h', 'brw_ir_performance.cpp', - 
'brw_ir_vec4.h', 'brw_isa_info.h', 'brw_lower_logical_sends.cpp', 'brw_mesh.cpp', @@ -137,33 +136,7 @@ libintel_compiler_brw_files = files( 'brw_shader.cpp', 'brw_shader.h', 'brw_simd_selection.cpp', - 'brw_vec4_builder.h', - 'brw_vec4_cmod_propagation.cpp', - 'brw_vec4_copy_propagation.cpp', - 'brw_vec4.cpp', - 'brw_vec4_cse.cpp', - 'brw_vec4_dead_code_eliminate.cpp', - 'brw_vec4_generator.cpp', - 'brw_vec4_gs_visitor.cpp', - 'brw_vec4_gs_visitor.h', - 'brw_vec4.h', - 'brw_vec4_live_variables.cpp', - 'brw_vec4_live_variables.h', - 'brw_vec4_nir.cpp', - 'brw_vec4_gs_nir.cpp', - 'brw_vec4_reg_allocate.cpp', - 'brw_vec4_surface_builder.cpp', - 'brw_vec4_surface_builder.h', - 'brw_vec4_tcs.cpp', - 'brw_vec4_tcs.h', - 'brw_vec4_tes.cpp', - 'brw_vec4_tes.h', - 'brw_vec4_visitor.cpp', - 'brw_vec4_vs_visitor.cpp', - 'brw_vec4_vs.h', 'brw_vue_map.c', - 'gfx6_gs_visitor.cpp', - 'gfx6_gs_visitor.h', ) brw_device_sha1_gen_src = custom_target('brw_device_sha1_gen.c', @@ -236,10 +209,6 @@ if with_tests 'test_fs_saturate_propagation.cpp', 'test_fs_scoreboard.cpp', 'test_simd_selection.cpp', - 'test_vec4_cmod_propagation.cpp', - 'test_vec4_copy_propagation.cpp', - 'test_vec4_dead_code_eliminate.cpp', - 'test_vec4_register_coalesce.cpp', 'test_vf_float_conversions.cpp', ), ir_expression_operation_h, diff --git a/src/intel/compiler/test_vec4_cmod_propagation.cpp b/src/intel/compiler/test_vec4_cmod_propagation.cpp deleted file mode 100644 index 73de39d10fe..00000000000 --- a/src/intel/compiler/test_vec4_cmod_propagation.cpp +++ /dev/null @@ -1,1056 +0,0 @@ -/* - * Copyright © 2015 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * 
Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - * - * Based on test_fs_cmod_propagation.cpp - */ - -#include <gtest/gtest.h> -#include "brw_vec4.h" -#include "brw_vec4_builder.h" -#include "brw_cfg.h" - -using namespace brw; - -class cmod_propagation_vec4_test : public ::testing::Test { - virtual void SetUp(); - virtual void TearDown(); - -public: - struct brw_compiler *compiler; - struct brw_compile_params params; - struct intel_device_info *devinfo; - void *ctx; - struct gl_shader_program *shader_prog; - struct brw_vue_prog_data *prog_data; - vec4_visitor *v; -}; - -class cmod_propagation_vec4_visitor : public vec4_visitor -{ -public: - cmod_propagation_vec4_visitor(struct brw_compiler *compiler, - struct brw_compile_params *params, - nir_shader *shader, - struct brw_vue_prog_data *prog_data) - : vec4_visitor(compiler, params, NULL, prog_data, shader, - false, false) - { - prog_data->dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT; - } - -protected: - /* Dummy implementation for pure virtual methods */ - virtual dst_reg *make_reg_for_system_value(int /* location */) - { - unreachable("Not reached"); - } - - virtual void setup_payload() - { - unreachable("Not reached"); - } - - virtual void emit_prolog() - { - unreachable("Not reached"); - } - - virtual void emit_program_code() - { - 
unreachable("Not reached"); - } - - virtual void emit_thread_end() - { - unreachable("Not reached"); - } - - virtual void emit_urb_write_header(int /* mrf */) - { - unreachable("Not reached"); - } - - virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */) - { - unreachable("Not reached"); - } -}; - - -void cmod_propagation_vec4_test::SetUp() -{ - ctx = ralloc_context(NULL); - compiler = rzalloc(ctx, struct brw_compiler); - devinfo = rzalloc(ctx, struct intel_device_info); - compiler->devinfo = devinfo; - - params = {}; - params.mem_ctx = ctx; - - prog_data = ralloc(ctx, struct brw_vue_prog_data); - nir_shader *shader = - nir_shader_create(ctx, MESA_SHADER_VERTEX, NULL, NULL); - - v = new cmod_propagation_vec4_visitor(compiler, &params, shader, prog_data); - - devinfo->ver = 7; - devinfo->verx10 = devinfo->ver * 10; -} - -void cmod_propagation_vec4_test::TearDown() -{ - delete v; - v = NULL; - - ralloc_free(ctx); - ctx = NULL; -} - -static vec4_instruction * -instruction(bblock_t *block, int num) -{ - vec4_instruction *inst = (vec4_instruction *)block->start(); - for (int i = 0; i < num; i++) { - inst = (vec4_instruction *)inst->next; - } - return inst; -} - -static bool -cmod_propagation(vec4_visitor *v) -{ - const bool print = getenv("TEST_DEBUG"); - - if (print) { - fprintf(stderr, "= Before =\n"); - v->dump_instructions(); - } - - bool ret = v->opt_cmod_propagation(); - - if (print) { - fprintf(stderr, "\n= After =\n"); - v->dump_instructions(); - } - - return ret; -} - -TEST_F(cmod_propagation_vec4_test, basic) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_float_type()); - src_reg src0 = src_reg(v, glsl_float_type()); - src_reg src1 = src_reg(v, glsl_float_type()); - src_reg zero(brw_imm_f(0.0f)); - dst_reg dest_null = bld.null_reg_f(); - dest_null.writemask = WRITEMASK_X; - - bld.ADD(dest, src0, src1); - bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_GE); - - /* = Before = - * - * 0: add dest.x 
src0.xxxx src1.xxxx - * 1: cmp.ge.f0 null.x dest.xxxx 0.0f - * - * = After = - * 0: add.ge.f0 dest.x src0.xxxx src1.xxxx - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_TRUE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(0, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, basic_different_dst_writemask) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_float_type()); - src_reg src0 = src_reg(v, glsl_float_type()); - src_reg src1 = src_reg(v, glsl_float_type()); - src_reg zero(brw_imm_f(0.0f)); - dst_reg dest_null = bld.null_reg_f(); - - bld.ADD(dest, src0, src1); - bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_GE); - - /* = Before = - * - * 0: add dest.x src0 src1 - * 1: cmp.ge.f0 null.xyzw dest 0.0f - * - * = After = - * (no changes) - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(1, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, andz_one) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_int_type()); - src_reg src0 = src_reg(v, glsl_float_type()); - src_reg zero(brw_imm_f(0.0f)); - src_reg one(brw_imm_d(1)); - - bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L); - set_condmod(BRW_CONDITIONAL_Z, - bld.AND(bld.null_reg_d(), src_reg(dest), 
one)); - - /* = Before = - * 0: cmp.l.f0 dest:F src0:F 0F - * 1: and.z.f0 null:D dest:D 1D - * - * = After = - * (no changes) - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(1, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_EQ, instruction(block0, 1)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, non_cmod_instruction) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_uint_type()); - src_reg src0 = src_reg(v, glsl_uint_type()); - src_reg zero(brw_imm_ud(0u)); - bld.FBL(dest, src0); - bld.CMP(bld.null_reg_ud(), src_reg(dest), zero, BRW_CONDITIONAL_GE); - - /* = Before = - * - * 0: fbl dest src0 - * 1: cmp.ge.f0 null dest 0u - * - * = After = - * (no changes) - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(1, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_FBL, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, intervening_flag_write) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_float_type()); - src_reg src0 = src_reg(v, glsl_float_type()); - src_reg src1 = src_reg(v, glsl_float_type()); - src_reg src2 = src_reg(v, glsl_float_type()); - src_reg zero(brw_imm_f(0.0f)); - bld.ADD(dest, src0, src1); - bld.CMP(bld.null_reg_f(), src2, zero, BRW_CONDITIONAL_GE); - bld.CMP(bld.null_reg_f(), 
src_reg(dest), zero, BRW_CONDITIONAL_GE); - - /* = Before = - * - * 0: add dest src0 src1 - * 1: cmp.ge.f0 null src2 0.0f - * 2: cmp.ge.f0 null dest 0.0f - * - * = After = - * (no changes) - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(2, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(2, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, intervening_flag_read) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest0 = dst_reg(v, glsl_float_type()); - dst_reg dest1 = dst_reg(v, glsl_float_type()); - src_reg src0 = src_reg(v, glsl_float_type()); - src_reg src1 = src_reg(v, glsl_float_type()); - src_reg src2 = src_reg(v, glsl_float_type()); - src_reg zero(brw_imm_f(0.0f)); - bld.ADD(dest0, src0, src1); - set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero)); - bld.CMP(bld.null_reg_f(), src_reg(dest0), zero, BRW_CONDITIONAL_GE); - - /* = Before = - * - * 0: add dest0 src0 src1 - * 1: (+f0) sel dest1 src2 0.0f - * 2: cmp.ge.f0 null dest0 0.0f - * - * = After = - * (no changes) - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(2, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(2, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); - 
EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, intervening_dest_write) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_vec4_type()); - src_reg src0 = src_reg(v, glsl_float_type()); - src_reg src1 = src_reg(v, glsl_float_type()); - src_reg src2 = src_reg(v, glsl_vec2_type()); - src_reg zero(brw_imm_f(0.0f)); - bld.ADD(offset(dest, 8, 2), src0, src1); - bld.emit(SHADER_OPCODE_TEX, dest, src2) - ->size_written = 4 * REG_SIZE; - bld.CMP(bld.null_reg_f(), offset(src_reg(dest), 8, 2), zero, BRW_CONDITIONAL_GE); - - /* = Before = - * - * 0: add dest+2 src0 src1 - * 1: tex rlen 4 dest+0 src2 - * 2: cmp.ge.f0 null dest+2 0.0f - * - * = After = - * (no changes) - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(2, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(2, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); - EXPECT_EQ(SHADER_OPCODE_TEX, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, intervening_flag_read_same_value) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest0 = dst_reg(v, glsl_float_type()); - dst_reg dest1 = dst_reg(v, glsl_float_type()); - src_reg src0 = src_reg(v, glsl_float_type()); - src_reg src1 = src_reg(v, glsl_float_type()); - src_reg src2 = src_reg(v, glsl_float_type()); - src_reg zero(brw_imm_f(0.0f)); - dst_reg dest_null = bld.null_reg_f(); - dest_null.writemask = WRITEMASK_X; - - set_condmod(BRW_CONDITIONAL_GE, bld.ADD(dest0, src0, src1)); - 
set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero)); - bld.CMP(dest_null, src_reg(dest0), zero, BRW_CONDITIONAL_GE); - - /* = Before = - * - * 0: add.ge.f0 dest0 src0 src1 - * 1: (+f0) sel dest1 src2 0.0f - * 2: cmp.ge.f0 null.x dest0 0.0f - * - * = After = - * 0: add.ge.f0 dest0 src0 src1 - * 1: (+f0) sel dest1 src2 0.0f - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(2, block0->end_ip); - - EXPECT_TRUE(cmod_propagation(v)); - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(1, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate); -} - -TEST_F(cmod_propagation_vec4_test, negate) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_float_type()); - src_reg src0 = src_reg(v, glsl_float_type()); - src_reg src1 = src_reg(v, glsl_float_type()); - src_reg zero(brw_imm_f(0.0f)); - bld.ADD(dest, src0, src1); - src_reg tmp_src = src_reg(dest); - tmp_src.negate = true; - dst_reg dest_null = bld.null_reg_f(); - dest_null.writemask = WRITEMASK_X; - bld.CMP(dest_null, tmp_src, zero, BRW_CONDITIONAL_GE); - - /* = Before = - * - * 0: add dest src0 src1 - * 1: cmp.ge.f0 null.x -dest 0.0f - * - * = After = - * 0: add.le.f0 dest src0 src1 - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_TRUE(cmod_propagation(v)); - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(0, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 0)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, movnz) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, 
glsl_float_type()); - src_reg src0 = src_reg(v, glsl_float_type()); - src_reg src1 = src_reg(v, glsl_float_type()); - dst_reg dest_null = bld.null_reg_f(); - dest_null.writemask = WRITEMASK_X; - - bld.CMP(dest, src0, src1, BRW_CONDITIONAL_L); - set_condmod(BRW_CONDITIONAL_NZ, - bld.MOV(dest_null, src_reg(dest))); - - /* = Before = - * - * 0: cmp.l.f0 dest:F src0:F src1:F - * 1: mov.nz.f0 null.x dest:F - * - * = After = - * 0: cmp.l.f0 dest src0:F src1:F - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_TRUE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(0, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, different_types_cmod_with_zero) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_int_type()); - src_reg src0 = src_reg(v, glsl_int_type()); - src_reg src1 = src_reg(v, glsl_int_type()); - src_reg zero(brw_imm_f(0.0f)); - bld.ADD(dest, src0, src1); - bld.CMP(bld.null_reg_f(), retype(src_reg(dest), BRW_REGISTER_TYPE_F), zero, - BRW_CONDITIONAL_GE); - - /* = Before = - * - * 0: add dest:D src0:D src1:D - * 1: cmp.ge.f0 null:F dest:F 0.0f - * - * = After = - * (no changes) - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(1, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, andnz_non_one) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_int_type()); - src_reg 
src0 = src_reg(v, glsl_float_type()); - src_reg zero(brw_imm_f(0.0f)); - src_reg nonone(brw_imm_d(38)); - - bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L); - set_condmod(BRW_CONDITIONAL_NZ, - bld.AND(bld.null_reg_d(), src_reg(dest), nonone)); - - /* = Before = - * 0: cmp.l.f0 dest:F src0:F 0F - * 1: and.nz.f0 null:D dest:D 38D - * - * = After = - * (no changes) - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(1, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod); -} - -/* Note that basic is using glsl_type:float types, while this one is using - * glsl_type::vec4 */ -TEST_F(cmod_propagation_vec4_test, basic_vec4) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_vec4_type()); - src_reg src0 = src_reg(v, glsl_vec4_type()); - src_reg src1 = src_reg(v, glsl_vec4_type()); - src_reg zero(brw_imm_f(0.0f)); - - bld.MUL(dest, src0, src1); - bld.CMP(bld.null_reg_f(), src_reg(dest), zero, BRW_CONDITIONAL_NZ); - - /* = Before = - * 0: mul dest.xyzw src0.xyzw src1.xyzw - * 1: cmp.nz.f0.0 null.xyzw dest.xyzw 0.0f - * - * = After = - * 0: mul.nz.f0.0 dest.xyzw src0.xyzw src1.xyzw - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_TRUE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(0, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, 
basic_vec4_different_dst_writemask) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_vec4_type()); - dest.writemask = WRITEMASK_X; - src_reg src0 = src_reg(v, glsl_vec4_type()); - src_reg src1 = src_reg(v, glsl_vec4_type()); - src_reg zero(brw_imm_f(0.0f)); - dst_reg dest_null = bld.null_reg_f(); - - bld.MUL(dest, src0, src1); - bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_NZ); - - /* = Before = - * 0: mul dest.x src0 src1 - * 1: cmp.nz.f0.0 null dest 0.0f - * - * = After = - * (no changes) - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(1, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, mad_one_component_vec4) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_vec4_type()); - dest.writemask = WRITEMASK_X; - src_reg src0 = src_reg(v, glsl_vec4_type()); - src_reg src1 = src_reg(v, glsl_vec4_type()); - src_reg src2 = src_reg(v, glsl_vec4_type()); - src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX; - src2.negate = true; - src_reg zero(brw_imm_f(0.0f)); - src_reg tmp(dest); - tmp.swizzle = BRW_SWIZZLE_XXXX; - dst_reg dest_null = bld.null_reg_f(); - dest_null.writemask = WRITEMASK_X; - - bld.MAD(dest, src0, src1, src2); - bld.CMP(dest_null, tmp, zero, BRW_CONDITIONAL_L); - - /* = Before = - * - * 0: mad dest.x:F src0.xxxx:F src10.xxxx:F -src2.xxxx:F - * 1: cmp.l.f0.0 null.x:F dest.xxxx:F 0.0f - * - * = After = - * 0: mad.l.f0 dest.x:F src0.xxxx:F src10.xxxx:F -src2.xxxx:F - */ - - v->calculate_cfg(); - bblock_t *block0 = 
v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_TRUE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(0, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_MAD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, mad_more_one_component_vec4) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_vec4_type()); - dest.writemask = WRITEMASK_XW; - src_reg src0 = src_reg(v, glsl_vec4_type()); - src_reg src1 = src_reg(v, glsl_vec4_type()); - src_reg src2 = src_reg(v, glsl_vec4_type()); - src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX; - src2.negate = true; - src_reg zero(brw_imm_f(0.0f)); - src_reg tmp(dest); - tmp.swizzle = BRW_SWIZZLE_XXXX; - dst_reg dest_null = bld.null_reg_f(); - - bld.MAD(dest, src0, src1, src2); - bld.CMP(dest_null, tmp, zero, BRW_CONDITIONAL_L); - - /* = Before = - * - * 0: mad dest.xw:F src0.xxxx:F src10.xxxx:F -src2.xxxx:F - * 1: cmp.l.f0.0 null:F dest.xxxx:F zeroF - * - * = After = - * (No changes) - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(1, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_MAD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 1)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, cmp_mov_vec4) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_ivec4_type()); - dest.writemask = WRITEMASK_X; - src_reg src0 = src_reg(v, glsl_ivec4_type()); - src0.swizzle = BRW_SWIZZLE_XXXX; - src0.file = UNIFORM; - src_reg nonone = retype(brw_imm_d(16), 
BRW_REGISTER_TYPE_D); - src_reg mov_src = src_reg(dest); - mov_src.swizzle = BRW_SWIZZLE_XXXX; - dst_reg dest_null = bld.null_reg_d(); - dest_null.writemask = WRITEMASK_X; - - bld.CMP(dest, src0, nonone, BRW_CONDITIONAL_GE); - set_condmod(BRW_CONDITIONAL_NZ, - bld.MOV(dest_null, mov_src)); - - /* = Before = - * - * 0: cmp.ge.f0 dest.x:D u.xxxx:D 16D - * 1: mov.nz.f0 null.x:D dest.xxxx:D - * - * = After = - * 0: cmp.ge.f0 dest.x:D u.xxxx:D 16D - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_TRUE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(0, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, mul_cmp_different_channels_vec4) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_vec4_type()); - src_reg src0 = src_reg(v, glsl_vec4_type()); - src_reg src1 = src_reg(v, glsl_vec4_type()); - src_reg zero(brw_imm_f(0.0f)); - src_reg cmp_src = src_reg(dest); - cmp_src.swizzle = BRW_SWIZZLE4(0,1,3,2); - - bld.MUL(dest, src0, src1); - bld.CMP(bld.null_reg_f(), cmp_src, zero, BRW_CONDITIONAL_NZ); - - /* = Before = - * 0: mul dest src0 src1 - * 1: cmp.nz.f0.0 null dest.xywz 0.0f - * - * = After = - * (No changes) - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(1, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, 
add_cmp_same_dst_writemask) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_vec4_type()); - src_reg src0 = src_reg(v, glsl_vec4_type()); - src_reg src1 = src_reg(v, glsl_vec4_type()); - dst_reg dest_null = bld.null_reg_f(); - - bld.ADD(dest, src0, src1); - vec4_instruction *inst = bld.CMP(dest_null, src0, src1, BRW_CONDITIONAL_GE); - inst->src[1].negate = true; - - /* = Before = - * - * 0: add dest.xyzw src0 src1 - * 1: cmp.ge.f0 null.xyzw src0 -src1 - * - * = After = - * 0: add.ge.f0 dest.xyzw src0 src1 - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_TRUE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(0, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, add_cmp_different_dst_writemask) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_float_type()); - src_reg src0 = src_reg(v, glsl_vec4_type()); - src_reg src1 = src_reg(v, glsl_vec4_type()); - dst_reg dest_null = bld.null_reg_f(); - - bld.ADD(dest, src0, src1); - vec4_instruction *inst = bld.CMP(dest_null, src0, src1, BRW_CONDITIONAL_GE); - inst->src[1].negate = true; - - /* = Before = - * - * 0: add dest.x src0 src1 - * 1: cmp.ge.f0 null.xyzw src0 -src1 - * - * = After = - * (no changes) - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(1, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 
1)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, prop_across_sel_gfx7) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest1 = dst_reg(v, glsl_float_type()); - dst_reg dest2 = dst_reg(v, glsl_float_type()); - src_reg src0 = src_reg(v, glsl_float_type()); - src_reg src1 = src_reg(v, glsl_float_type()); - src_reg src2 = src_reg(v, glsl_float_type()); - src_reg src3 = src_reg(v, glsl_float_type()); - src_reg zero(brw_imm_f(0.0f)); - dst_reg dest_null = bld.null_reg_f(); - dest_null.writemask = WRITEMASK_X; - - bld.ADD(dest1, src0, src1); - bld.SEL(dest2, src2, src3) - ->conditional_mod = BRW_CONDITIONAL_GE; - bld.CMP(dest_null, src_reg(dest1), zero, BRW_CONDITIONAL_GE); - - /* = Before = - * - * 0: add dest1.x src0.xxxx src1.xxxx - * 1: sel.ge.f0 dest2.x src2.xxxx src3.xxxx - * 2: cmp.ge.f0 null.x dest.xxxx 0.0f - * - * = After = - * 0: add.ge.f0 dest.x src0.xxxx src1.xxxx - * 1: sel.ge.f0 dest2.x src2.xxxx src3.xxxx - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(2, block0->end_ip); - - EXPECT_TRUE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(1, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, prop_across_sel_gfx5) -{ - devinfo->ver = 5; - devinfo->verx10 = devinfo->ver * 10; - - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest1 = dst_reg(v, glsl_float_type()); - dst_reg dest2 = dst_reg(v, glsl_float_type()); - src_reg src0 = src_reg(v, glsl_float_type()); - src_reg src1 = src_reg(v, glsl_float_type()); - src_reg src2 = src_reg(v, glsl_float_type()); - src_reg src3 = src_reg(v, glsl_float_type()); - src_reg zero(brw_imm_f(0.0f)); - dst_reg dest_null = 
bld.null_reg_f(); - dest_null.writemask = WRITEMASK_X; - - bld.ADD(dest1, src0, src1); - bld.SEL(dest2, src2, src3) - ->conditional_mod = BRW_CONDITIONAL_GE; - bld.CMP(dest_null, src_reg(dest1), zero, BRW_CONDITIONAL_GE); - - /* = Before = - * - * 0: add dest1.x src0.xxxx src1.xxxx - * 1: sel.ge.f0 dest2.x src2.xxxx src3.xxxx - * 2: cmp.ge.f0 null.x dest.xxxx 0.0f - * - * = After = - * (no changes) - * - * On Gfx4 and Gfx5, sel.l (for min) and sel.ge (for max) are implemented - * using a separate cmpn and sel instruction. This lowering occurs in - * fs_vistor::lower_minmax which is called a long time after the first - * calls to cmod_propagation. - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(2, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(2, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod); -} - -TEST_F(cmod_propagation_vec4_test, prop_into_sel_gfx5) -{ - devinfo->ver = 5; - devinfo->verx10 = devinfo->ver * 10; - - const vec4_builder bld = vec4_builder(v).at_end(); - dst_reg dest = dst_reg(v, glsl_float_type()); - src_reg src0 = src_reg(v, glsl_float_type()); - src_reg src1 = src_reg(v, glsl_float_type()); - src_reg zero(brw_imm_f(0.0f)); - dst_reg dest_null = bld.null_reg_f(); - dest_null.writemask = WRITEMASK_X; - - bld.SEL(dest, src0, src1) - ->conditional_mod = BRW_CONDITIONAL_GE; - bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_GE); - - /* = Before = - * - * 0: sel.ge.f0 dest.x src2.xxxx src3.xxxx - * 1: cmp.ge.f0 null.x dest.xxxx 0.0f - * - * = After = - * 
(no changes) - * - * Do not copy propagate into a sel.cond instruction. While it does modify - * the flags, the flags are not based on the result compared with zero (as - * with most other instructions). The result is based on the sources - * compared with each other (like cmp.cond). - */ - - v->calculate_cfg(); - bblock_t *block0 = v->cfg->blocks[0]; - - EXPECT_EQ(0, block0->start_ip); - EXPECT_EQ(1, block0->end_ip); - - EXPECT_FALSE(cmod_propagation(v)); - - ASSERT_EQ(0, block0->start_ip); - ASSERT_EQ(1, block0->end_ip); - EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 0)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod); - EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode); - EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod); -} diff --git a/src/intel/compiler/test_vec4_copy_propagation.cpp b/src/intel/compiler/test_vec4_copy_propagation.cpp deleted file mode 100644 index 7690458b928..00000000000 --- a/src/intel/compiler/test_vec4_copy_propagation.cpp +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright © 2014 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. - */ - -#include -#include "brw_vec4.h" - -using namespace brw; - -class copy_propagation_vec4_test : public ::testing::Test { - virtual void SetUp(); - virtual void TearDown(); - -public: - struct brw_compiler *compiler; - struct brw_compile_params params; - struct intel_device_info *devinfo; - void *ctx; - struct gl_shader_program *shader_prog; - struct brw_vue_prog_data *prog_data; - vec4_visitor *v; -}; - -class copy_propagation_vec4_visitor : public vec4_visitor -{ -public: - copy_propagation_vec4_visitor(struct brw_compiler *compiler, - struct brw_compile_params *params, - nir_shader *shader, - struct brw_vue_prog_data *prog_data) - : vec4_visitor(compiler, params, NULL, prog_data, shader, - false /* no_spills */, false) - { - prog_data->dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT; - } - -protected: - virtual dst_reg *make_reg_for_system_value(int /* location */) - { - unreachable("Not reached"); - } - - virtual void setup_payload() - { - unreachable("Not reached"); - } - - virtual void emit_prolog() - { - unreachable("Not reached"); - } - - virtual void emit_thread_end() - { - unreachable("Not reached"); - } - - virtual void emit_urb_write_header(int /* mrf */) - { - unreachable("Not reached"); - } - - virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */) - { - unreachable("Not reached"); - } -}; - - -void copy_propagation_vec4_test::SetUp() -{ - ctx = ralloc_context(NULL); - compiler = rzalloc(ctx, struct brw_compiler); - devinfo = rzalloc(ctx, struct intel_device_info); - compiler->devinfo = devinfo; - - params = {}; - params.mem_ctx = ctx; - - prog_data = ralloc(ctx, struct brw_vue_prog_data); - nir_shader *shader = - nir_shader_create(ctx, MESA_SHADER_VERTEX, 
NULL, NULL); - - v = new copy_propagation_vec4_visitor(compiler, ¶ms, shader, prog_data); - - devinfo->ver = 4; - devinfo->verx10 = devinfo->ver * 10; -} - -void copy_propagation_vec4_test::TearDown() -{ - delete v; - v = NULL; - - ralloc_free(ctx); - ctx = NULL; -} - - -static void -copy_propagation(vec4_visitor *v) -{ - const bool print = getenv("TEST_DEBUG"); - - if (print) { - fprintf(stderr, "instructions before:\n"); - v->dump_instructions(); - } - - v->calculate_cfg(); - v->opt_copy_propagation(); - - if (print) { - fprintf(stderr, "instructions after:\n"); - v->dump_instructions(); - } -} - -TEST_F(copy_propagation_vec4_test, test_swizzle_swizzle) -{ - dst_reg a = dst_reg(v, glsl_vec4_type()); - dst_reg b = dst_reg(v, glsl_vec4_type()); - dst_reg c = dst_reg(v, glsl_vec4_type()); - - v->emit(v->ADD(a, src_reg(a), src_reg(a))); - - v->emit(v->MOV(b, swizzle(src_reg(a), BRW_SWIZZLE4(BRW_SWIZZLE_Y, - BRW_SWIZZLE_Z, - BRW_SWIZZLE_W, - BRW_SWIZZLE_X)))); - - vec4_instruction *test_mov = - v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(BRW_SWIZZLE_Y, - BRW_SWIZZLE_Z, - BRW_SWIZZLE_W, - BRW_SWIZZLE_X))); - v->emit(test_mov); - - copy_propagation(v); - - EXPECT_EQ(test_mov->src[0].nr, a.nr); - EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(BRW_SWIZZLE_Z, - BRW_SWIZZLE_W, - BRW_SWIZZLE_X, - BRW_SWIZZLE_Y)); -} - -TEST_F(copy_propagation_vec4_test, test_swizzle_writemask) -{ - dst_reg a = dst_reg(v, glsl_vec4_type()); - dst_reg b = dst_reg(v, glsl_vec4_type()); - dst_reg c = dst_reg(v, glsl_vec4_type()); - - v->emit(v->MOV(b, swizzle(src_reg(a), BRW_SWIZZLE4(BRW_SWIZZLE_X, - BRW_SWIZZLE_Y, - BRW_SWIZZLE_X, - BRW_SWIZZLE_Z)))); - - v->emit(v->MOV(writemask(a, WRITEMASK_XYZ), brw_imm_f(1.0f))); - - vec4_instruction *test_mov = - v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(BRW_SWIZZLE_W, - BRW_SWIZZLE_W, - BRW_SWIZZLE_W, - BRW_SWIZZLE_W))); - v->emit(test_mov); - - copy_propagation(v); - - /* should not copy propagate */ - EXPECT_EQ(test_mov->src[0].nr, b.nr); - 
EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(BRW_SWIZZLE_W, - BRW_SWIZZLE_W, - BRW_SWIZZLE_W, - BRW_SWIZZLE_W)); -} diff --git a/src/intel/compiler/test_vec4_dead_code_eliminate.cpp b/src/intel/compiler/test_vec4_dead_code_eliminate.cpp deleted file mode 100644 index c3a07c1735b..00000000000 --- a/src/intel/compiler/test_vec4_dead_code_eliminate.cpp +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright © 2018 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include -#include "brw_vec4.h" - -using namespace brw; - -class dead_code_eliminate_vec4_test : public ::testing::Test { - virtual void SetUp(); - virtual void TearDown(); - -public: - struct brw_compiler *compiler; - struct brw_compile_params params; - struct intel_device_info *devinfo; - void *ctx; - struct gl_shader_program *shader_prog; - struct brw_vue_prog_data *prog_data; - vec4_visitor *v; -}; - -class dead_code_eliminate_vec4_visitor : public vec4_visitor -{ -public: - dead_code_eliminate_vec4_visitor(struct brw_compiler *compiler, - struct brw_compile_params *params, - nir_shader *shader, - struct brw_vue_prog_data *prog_data) - : vec4_visitor(compiler, params, NULL, prog_data, shader, - false /* no_spills */, false) - { - prog_data->dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT; - } - -protected: - virtual dst_reg *make_reg_for_system_value(int /* location */) - { - unreachable("Not reached"); - } - - virtual void setup_payload() - { - unreachable("Not reached"); - } - - virtual void emit_prolog() - { - unreachable("Not reached"); - } - - virtual void emit_thread_end() - { - unreachable("Not reached"); - } - - virtual void emit_urb_write_header(int /* mrf */) - { - unreachable("Not reached"); - } - - virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */) - { - unreachable("Not reached"); - } -}; - - -void dead_code_eliminate_vec4_test::SetUp() -{ - ctx = ralloc_context(NULL); - compiler = rzalloc(ctx, struct brw_compiler); - devinfo = rzalloc(ctx, struct intel_device_info); - compiler->devinfo = devinfo; - - params = {}; - params.mem_ctx = ctx; - - prog_data = ralloc(ctx, struct brw_vue_prog_data); - nir_shader *shader = - nir_shader_create(ctx, MESA_SHADER_VERTEX, NULL, NULL); - - v = new dead_code_eliminate_vec4_visitor(compiler, ¶ms, shader, prog_data); - - devinfo->ver = 4; - devinfo->verx10 = devinfo->ver * 10; -} - -void dead_code_eliminate_vec4_test::TearDown() -{ - delete v; - v = NULL; - - ralloc_free(ctx); - ctx = 
NULL; -} - -static void -dead_code_eliminate(vec4_visitor *v) -{ - const bool print = getenv("TEST_DEBUG"); - - if (print) { - fprintf(stderr, "instructions before:\n"); - v->dump_instructions(); - } - - v->calculate_cfg(); - v->dead_code_eliminate(); - - if (print) { - fprintf(stderr, "instructions after:\n"); - v->dump_instructions(); - } -} - -TEST_F(dead_code_eliminate_vec4_test, some_dead_channels_all_flags_used) -{ - const vec4_builder bld = vec4_builder(v).at_end(); - src_reg r1 = src_reg(v, glsl_vec4_type()); - src_reg r2 = src_reg(v, glsl_vec4_type()); - src_reg r3 = src_reg(v, glsl_vec4_type()); - src_reg r4 = src_reg(v, glsl_vec4_type()); - src_reg r5 = src_reg(v, glsl_vec4_type()); - src_reg r6 = src_reg(v, glsl_vec4_type()); - - /* Sequence like the following should not be modified by DCE. - * - * cmp.l.f0(8) g4<1>F g2<4,4,1>.wF g1<4,4,1>.xF - * mov(8) g5<1>.xF g4<4,4,1>.xF - * (+f0.x) sel(8) g6<1>UD g3<4>UD g6<4>UD - */ - vec4_instruction *test_cmp = - bld.CMP(dst_reg(r4), r2, r1, BRW_CONDITIONAL_L); - - test_cmp->src[0].swizzle = BRW_SWIZZLE_WWWW; - test_cmp->src[1].swizzle = BRW_SWIZZLE_XXXX; - - vec4_instruction *test_mov = - bld.MOV(dst_reg(r5), r4); - - test_mov->dst.writemask = WRITEMASK_X; - test_mov->src[0].swizzle = BRW_SWIZZLE_XXXX; - - vec4_instruction *test_sel = - bld.SEL(dst_reg(r6), r3, r6); - - set_predicate(BRW_PREDICATE_NORMAL, test_sel); - - /* The scratch write is here just to make r5 and r6 be live so that the - * whole program doesn't get eliminated by DCE. 
- */ - v->emit(v->SCRATCH_WRITE(dst_reg(r4), r6, r5)); - - dead_code_eliminate(v); - - EXPECT_EQ(test_cmp->dst.writemask, WRITEMASK_XYZW); -} diff --git a/src/intel/compiler/test_vec4_register_coalesce.cpp b/src/intel/compiler/test_vec4_register_coalesce.cpp deleted file mode 100644 index 13d01c450d4..00000000000 --- a/src/intel/compiler/test_vec4_register_coalesce.cpp +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Copyright © 2012 Intel Corporation - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS - * IN THE SOFTWARE. 
- */ - -#include -#include "brw_vec4.h" - -using namespace brw; - -#define register_coalesce(v) _register_coalesce(v, __func__) - -class register_coalesce_vec4_test : public ::testing::Test { - virtual void SetUp(); - virtual void TearDown(); - -public: - struct brw_compiler *compiler; - struct brw_compile_params params; - struct intel_device_info *devinfo; - void *ctx; - struct gl_shader_program *shader_prog; - struct brw_vue_prog_data *prog_data; - vec4_visitor *v; -}; - - -class register_coalesce_vec4_visitor : public vec4_visitor -{ -public: - register_coalesce_vec4_visitor(struct brw_compiler *compiler, - struct brw_compile_params *params, - nir_shader *shader, - struct brw_vue_prog_data *prog_data) - : vec4_visitor(compiler, params, NULL, prog_data, shader, - false /* no_spills */, false) - { - prog_data->dispatch_mode = INTEL_DISPATCH_MODE_4X2_DUAL_OBJECT; - } - -protected: - virtual dst_reg *make_reg_for_system_value(int /* location */) - { - unreachable("Not reached"); - } - - virtual void setup_payload() - { - unreachable("Not reached"); - } - - virtual void emit_prolog() - { - unreachable("Not reached"); - } - - virtual void emit_thread_end() - { - unreachable("Not reached"); - } - - virtual void emit_urb_write_header(int /* mrf */) - { - unreachable("Not reached"); - } - - virtual vec4_instruction *emit_urb_write_opcode(bool /* complete */) - { - unreachable("Not reached"); - } -}; - - -void register_coalesce_vec4_test::SetUp() -{ - ctx = ralloc_context(NULL); - compiler = rzalloc(ctx, struct brw_compiler); - devinfo = rzalloc(ctx, struct intel_device_info); - compiler->devinfo = devinfo; - - prog_data = ralloc(ctx, struct brw_vue_prog_data); - - params = {}; - params.mem_ctx = ctx; - - nir_shader *shader = - nir_shader_create(ctx, MESA_SHADER_VERTEX, NULL, NULL); - - v = new register_coalesce_vec4_visitor(compiler, ¶ms, shader, prog_data); - - devinfo->ver = 4; - devinfo->verx10 = devinfo->ver * 10; -} - -void register_coalesce_vec4_test::TearDown() -{ 
- delete v; - v = NULL; - - ralloc_free(ctx); - ctx = NULL; -} - -static void -_register_coalesce(vec4_visitor *v, const char *func) -{ - const bool print = getenv("TEST_DEBUG"); - - if (print) { - printf("%s: instructions before:\n", func); - v->dump_instructions(); - } - - v->calculate_cfg(); - v->opt_register_coalesce(); - - if (print) { - printf("%s: instructions after:\n", func); - v->dump_instructions(); - } -} - -TEST_F(register_coalesce_vec4_test, test_compute_to_mrf) -{ - src_reg something = src_reg(v, glsl_float_type()); - dst_reg temp = dst_reg(v, glsl_float_type()); - dst_reg init; - - dst_reg m0 = dst_reg(MRF, 0); - m0.writemask = WRITEMASK_X; - m0.type = BRW_REGISTER_TYPE_F; - - vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f))); - v->emit(v->MOV(m0, src_reg(temp))); - - register_coalesce(v); - - EXPECT_EQ(mul->dst.file, MRF); -} - - -TEST_F(register_coalesce_vec4_test, test_multiple_use) -{ - src_reg something = src_reg(v, glsl_float_type()); - dst_reg temp = dst_reg(v, glsl_vec4_type()); - dst_reg init; - - dst_reg m0 = dst_reg(MRF, 0); - m0.writemask = WRITEMASK_X; - m0.type = BRW_REGISTER_TYPE_F; - - dst_reg m1 = dst_reg(MRF, 1); - m1.writemask = WRITEMASK_XYZW; - m1.type = BRW_REGISTER_TYPE_F; - - src_reg src = src_reg(temp); - vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f))); - src.swizzle = BRW_SWIZZLE_XXXX; - v->emit(v->MOV(m0, src)); - src.swizzle = BRW_SWIZZLE_XYZW; - v->emit(v->MOV(m1, src)); - - register_coalesce(v); - - EXPECT_NE(mul->dst.file, MRF); -} - -TEST_F(register_coalesce_vec4_test, test_dp4_mrf) -{ - src_reg some_src_1 = src_reg(v, glsl_vec4_type()); - src_reg some_src_2 = src_reg(v, glsl_vec4_type()); - dst_reg init; - - dst_reg m0 = dst_reg(MRF, 0); - m0.writemask = WRITEMASK_Y; - m0.type = BRW_REGISTER_TYPE_F; - - dst_reg temp = dst_reg(v, glsl_float_type()); - - vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2)); - v->emit(v->MOV(m0, src_reg(temp))); - - 
register_coalesce(v); - - EXPECT_EQ(dp4->dst.file, MRF); - EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y); -} - -TEST_F(register_coalesce_vec4_test, test_dp4_grf) -{ - src_reg some_src_1 = src_reg(v, glsl_vec4_type()); - src_reg some_src_2 = src_reg(v, glsl_vec4_type()); - dst_reg init; - - dst_reg to = dst_reg(v, glsl_vec4_type()); - dst_reg temp = dst_reg(v, glsl_float_type()); - - vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2)); - to.writemask = WRITEMASK_Y; - v->emit(v->MOV(to, src_reg(temp))); - - /* if we don't do something with the result, the automatic dead code - * elimination will remove all our instructions. - */ - src_reg src = src_reg(to); - src.negate = true; - v->emit(v->MOV(dst_reg(MRF, 0), src)); - - register_coalesce(v); - - EXPECT_EQ(dp4->dst.nr, to.nr); - EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y); -} - -TEST_F(register_coalesce_vec4_test, test_channel_mul_grf) -{ - src_reg some_src_1 = src_reg(v, glsl_vec4_type()); - src_reg some_src_2 = src_reg(v, glsl_vec4_type()); - dst_reg init; - - dst_reg to = dst_reg(v, glsl_vec4_type()); - dst_reg temp = dst_reg(v, glsl_float_type()); - - vec4_instruction *mul = v->emit(v->MUL(temp, some_src_1, some_src_2)); - to.writemask = WRITEMASK_Y; - v->emit(v->MOV(to, src_reg(temp))); - - /* if we don't do something with the result, the automatic dead code - * elimination will remove all our instructions. - */ - src_reg src = src_reg(to); - src.negate = true; - v->emit(v->MOV(dst_reg(MRF, 0), src)); - - register_coalesce(v); - - EXPECT_EQ(mul->dst.nr, to.nr); -}