mesa/src/intel/compiler/brw_vec4.h


/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
#ifndef BRW_VEC4_H
#define BRW_VEC4_H
#include "brw_shader.h"
#ifdef __cplusplus
#include "brw_ir_vec4.h"
#include "brw_vec4_builder.h"
#endif
#include "compiler/glsl/ir.h"
#include "compiler/nir/nir.h"
#ifdef __cplusplus
extern "C" {
#endif
const unsigned *
brw_vec4_generate_assembly(const struct brw_compiler *compiler,
                           void *log_data,
                           void *mem_ctx,
                           const nir_shader *nir,
                           struct brw_vue_prog_data *prog_data,
                           const struct cfg_t *cfg);
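
/*
 * Illustrative sketch, not part of the original header: the final binary is
 * assumed to be produced by running a concrete vec4_visitor subclass first
 * and then handing its control-flow graph to this generator, roughly:
 *
 *    if (v.run()) {
 *       const unsigned *assembly =
 *          brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
 *                                     nir, prog_data, v.cfg);
 *    }
 *
 * The visitor instance `v` and its `cfg` member are assumptions based on the
 * declarations in this file, not a verbatim copy of any caller.
 */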
#ifdef __cplusplus
} /* extern "C" */
namespace brw {
class vec4_live_variables;
/**
 * The vertex shader front-end.
 *
 * Translates either GLSL IR or Mesa IR (for ARB_vertex_program and
 * fixed-function) into VS IR.
 */
class vec4_visitor : public backend_shader
{
public:
   vec4_visitor(const struct brw_compiler *compiler,
                void *log_data,
                const struct brw_sampler_prog_key_data *key,
                struct brw_vue_prog_data *prog_data,
                const nir_shader *shader,
                void *mem_ctx,
                bool no_spills,
                int shader_time_index);
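
   /*
    * Illustrative note, not part of the original header: a concrete subclass
    * (the VS, GS or tessellation visitor) is assumed to be driven roughly as
    * follows -- run() translates the NIR shader into vec4 IR and applies the
    * optimization and lowering passes declared below until none of them
    * reports further progress; on failure, `failed` is set and `fail_msg`
    * holds the reason:
    *
    *    vec4_visitor *v = ...;            // hypothetical concrete subclass
    *    if (!v->run())
    *       fprintf(stderr, "%s\n", v->fail_msg);
    */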
   dst_reg dst_null_f()
   {
      return dst_reg(brw_null_reg());
   }

   dst_reg dst_null_df()
   {
      return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
   }

   dst_reg dst_null_d()
   {
      return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   }

   dst_reg dst_null_ud()
   {
      return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
   }
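
   /*
    * Illustrative sketch, not part of the original header: the typed null
    * destinations above are assumed to be used when only an instruction's
    * side effects matter, e.g. a comparison that should only update the
    * flag register:
    *
    *    emit(CMP(dst_null_d(), src0, src1, BRW_CONDITIONAL_NZ));
    */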
   const struct brw_sampler_prog_key_data * const key_tex;
   struct brw_vue_prog_data * const prog_data;
   char *fail_msg;
   bool failed;

   /**
    * GLSL IR currently being processed, which is associated with our
    * driver IR instructions for debugging purposes.
    */
   const void *base_ir;
   const char *current_annotation;

   int first_non_payload_grf;
   unsigned int max_grf;
   int *virtual_grf_start;
   int *virtual_grf_end;
   brw::vec4_live_variables *live_intervals;
   dst_reg userplane[MAX_CLIP_PLANES];

   bool need_all_constants_in_pull_buffer;
   /* Regs for vertex results.  Generated at ir_variable visiting time
    * for the ir->locations used.
    */
   dst_reg output_reg[VARYING_SLOT_TESS_MAX][4];
   unsigned output_num_components[VARYING_SLOT_TESS_MAX][4];
   const char *output_reg_annotation[VARYING_SLOT_TESS_MAX];
   int uniforms;

   src_reg shader_start_time;

   bool run();
   void fail(const char *msg, ...);

   int setup_uniforms(int payload_reg);

   bool reg_allocate_trivial();
   bool reg_allocate();
   void evaluate_spill_costs(float *spill_costs, bool *no_spill);
   int choose_spill_reg(struct ra_graph *g);
   void spill_reg(int spill_reg);
   void move_grf_array_access_to_scratch();
   void move_uniform_array_access_to_pull_constants();
   void move_push_constants_to_pull_constants();
   void split_uniform_registers();
   void pack_uniform_registers();
   void calculate_live_intervals();
   void invalidate_live_intervals();
   void split_virtual_grfs();
   bool opt_vector_float();
   bool opt_reduce_swizzle();
   bool dead_code_eliminate();
   int var_range_start(unsigned v, unsigned n) const;
   int var_range_end(unsigned v, unsigned n) const;
   bool virtual_grf_interferes(int a, int b);
   bool opt_cmod_propagation();
   bool opt_copy_propagation(bool do_constant_prop = true);
   bool opt_cse_local(bblock_t *block);
   bool opt_cse();
   bool opt_algebraic();
   bool opt_register_coalesce();
   bool eliminate_find_live_channel();
   bool is_dep_ctrl_unsafe(const vec4_instruction *inst);
   void opt_set_dependency_control();
   void opt_schedule_instructions();
   void convert_to_hw_regs();
   void fixup_3src_null_dest();

   bool is_supported_64bit_region(vec4_instruction *inst, unsigned arg);
   /**
    * SIMD lowering pass.
    *
    * Align16 instructions normally write a single register, but
    * double-precision instructions typically write two and hit hardware
    * bugs and restrictions on gen7 (e.g. an instruction that writes two
    * registers must also read two), so such instructions are split here to
    * write one register at a time.  On generations without align16 fp64
    * support the pass is a no-op.
    */
   bool lower_simd_width();
   /**
    * Scalarization pass for double-precision instructions.
    *
    * The hardware only supports 32-bit swizzles, so only the XY channels of
    * a DF operand can be addressed directly, and many swizzle/writemask
    * combinations cannot be represented at all in align16.  To keep this
    * manageable, DF instructions are fully scalarized after the main
    * optimization loop on Ivy Bridge and Haswell.
    */
   bool scalarize_df();
   bool lower_64bit_mad_to_mul_add();
   void apply_logical_swizzle(struct brw_reg *hw_reg,
                              vec4_instruction *inst, int arg);
   vec4_instruction *emit(vec4_instruction *inst);

   vec4_instruction *emit(enum opcode opcode);
   vec4_instruction *emit(enum opcode opcode, const dst_reg &dst);
   vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
                          const src_reg &src0);
   vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
                          const src_reg &src0, const src_reg &src1);
   vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
                          const src_reg &src0, const src_reg &src1,
                          const src_reg &src2);

   vec4_instruction *emit_before(bblock_t *block,
                                 vec4_instruction *inst,
                                 vec4_instruction *new_inst);

#define EMIT1(op) vec4_instruction *op(const dst_reg &, const src_reg &);
#define EMIT2(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &);
#define EMIT3(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &, const src_reg &);
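
   /*
    * For illustration (not part of the original header): each EMITn(op) use
    * below declares a convenience emitter for one opcode, e.g. EMIT2(ADD)
    * expands to
    *
    *    vec4_instruction *ADD(const dst_reg &, const src_reg &, const src_reg &);
    */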
   EMIT1(MOV)
   EMIT1(NOT)
   EMIT1(RNDD)
   EMIT1(RNDE)
   EMIT1(RNDZ)
   EMIT1(FRC)
   EMIT1(F32TO16)
   EMIT1(F16TO32)
   EMIT2(ADD)
   EMIT2(MUL)
   EMIT2(MACH)
   EMIT2(MAC)
   EMIT2(AND)
   EMIT2(OR)
   EMIT2(XOR)
   EMIT2(DP3)
   EMIT2(DP4)
   EMIT2(DPH)
   EMIT2(SHL)
   EMIT2(SHR)
   EMIT2(ASR)
   vec4_instruction *CMP(dst_reg dst, src_reg src0, src_reg src1,
                         enum brw_conditional_mod condition);
   vec4_instruction *IF(src_reg src0, src_reg src1,
                        enum brw_conditional_mod condition);
   vec4_instruction *IF(enum brw_predicate predicate);
   EMIT1(SCRATCH_READ)
   EMIT2(SCRATCH_WRITE)
   EMIT3(LRP)
   EMIT1(BFREV)
   EMIT3(BFE)
   EMIT2(BFI1)
   EMIT3(BFI2)
   EMIT1(FBH)
   EMIT1(FBL)
   EMIT1(CBIT)
   EMIT3(MAD)
   EMIT2(ADDC)
   EMIT2(SUBB)
   EMIT1(DIM)
#undef EMIT1
#undef EMIT2
#undef EMIT3
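
   /*
    * Illustrative sketch, not part of the original header: the helpers
    * declared above only construct a vec4_instruction; it still has to be
    * appended to the instruction stream with emit(), e.g.
    *
    *    vec4_instruction *inst = emit(ADD(dst, src0, src1));
    *    inst->saturate = true;   // assumes the IR instruction's saturate flag
    */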
   int implied_mrf_writes(vec4_instruction *inst);

   vec4_instruction *emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                                 src_reg src0, src_reg src1);

   vec4_instruction *emit_lrp(const dst_reg &dst, const src_reg &x,
                              const src_reg &y, const src_reg &a);
   /**
    * Copy any live channel from \p src to the first channel of the
    * result.
    */
   src_reg emit_uniformize(const src_reg &src);
   src_reg fix_3src_operand(const src_reg &src);
   src_reg resolve_source_modifiers(const src_reg &src);

   vec4_instruction *emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
                               const src_reg &src1 = src_reg());

   src_reg fix_math_operand(const src_reg &src);

   void emit_pack_half_2x16(dst_reg dst, src_reg src0);
   void emit_unpack_half_2x16(dst_reg dst, src_reg src0);
   void emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0);
   void emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0);
   void emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0);
   void emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0);

   void emit_texture(ir_texture_opcode op,
                     dst_reg dest,
                     const glsl_type *dest_type,
                     src_reg coordinate,
                     int coord_components,
                     src_reg shadow_comparator,
                     src_reg lod, src_reg lod2,
                     src_reg sample_index,
                     uint32_t constant_offset,
                     src_reg offset_value,
                     src_reg mcs,
                     uint32_t surface, src_reg surface_reg,
                     src_reg sampler_reg);

   src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
                          src_reg surface);
   void emit_gen6_gather_wa(uint8_t wa, dst_reg dst);

   void emit_ndc_computation();
   void emit_psiz_and_flags(dst_reg reg);
   vec4_instruction *emit_generic_urb_slot(dst_reg reg, int varying, int comp);
   virtual void emit_urb_slot(dst_reg reg, int varying);

   void emit_shader_time_begin();
   void emit_shader_time_end();
   void emit_shader_time_write(int shader_time_subindex, src_reg value);
   src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst,
                              src_reg *reladdr, int reg_offset);
   void emit_scratch_read(bblock_t *block, vec4_instruction *inst,
                          dst_reg dst,
                          src_reg orig_src,
                          int base_offset);
   void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
                           int base_offset);
   void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
                                dst_reg dst,
                                src_reg orig_src,
                                int base_offset,
                                src_reg indirect);
   void emit_pull_constant_load_reg(dst_reg dst,
                                    src_reg surf_index,
                                    src_reg offset,
                                    bblock_t *before_block,
                                    vec4_instruction *before_inst);

   src_reg emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
                                vec4_instruction *inst, src_reg src);

   void resolve_ud_negate(src_reg *reg);

   bool lower_minmax();

   src_reg get_timestamp();

   void dump_instruction(backend_instruction *inst);
   void dump_instruction(backend_instruction *inst, FILE *file);

   bool is_high_sampler(src_reg sampler);

   bool optimize_predicate(nir_alu_instr *instr, enum brw_predicate *predicate);

   void emit_conversion_from_double(dst_reg dst, src_reg src, bool saturate);
   void emit_conversion_to_double(dst_reg dst, src_reg src, bool saturate);

   vec4_instruction *shuffle_64bit_data(dst_reg dst, src_reg src,
                                        bool for_write,
                                        bblock_t *block = NULL,
                                        vec4_instruction *ref = NULL);
   virtual void emit_nir_code();
   virtual void nir_setup_uniforms();
   virtual void nir_emit_impl(nir_function_impl *impl);
   virtual void nir_emit_cf_list(exec_list *list);
   virtual void nir_emit_if(nir_if *if_stmt);
   virtual void nir_emit_loop(nir_loop *loop);
   virtual void nir_emit_block(nir_block *block);
   virtual void nir_emit_instr(nir_instr *instr);
   virtual void nir_emit_load_const(nir_load_const_instr *instr);
   src_reg get_nir_ssbo_intrinsic_index(nir_intrinsic_instr *instr);
   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
   virtual void nir_emit_alu(nir_alu_instr *instr);
   virtual void nir_emit_jump(nir_jump_instr *instr);
   virtual void nir_emit_texture(nir_tex_instr *instr);
   virtual void nir_emit_undef(nir_ssa_undef_instr *instr);
   virtual void nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr);

   dst_reg get_nir_dest(const nir_dest &dest, enum brw_reg_type type);
   dst_reg get_nir_dest(const nir_dest &dest, nir_alu_type type);
   dst_reg get_nir_dest(const nir_dest &dest);
   src_reg get_nir_src(const nir_src &src, enum brw_reg_type type,
                       unsigned num_components = 4);
   src_reg get_nir_src(const nir_src &src, nir_alu_type type,
                       unsigned num_components = 4);
   src_reg get_nir_src(const nir_src &src,
                       unsigned num_components = 4);
   src_reg get_nir_src_imm(const nir_src &src);
   src_reg get_indirect_offset(nir_intrinsic_instr *instr);
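
   /*
    * Illustrative sketch, not part of the original header: inside the
    * nir_emit_* handlers, NIR destinations and sources are assumed to be
    * mapped onto vec4 IR registers along these lines (the nir_alu_instr
    * field names reflect the NIR data structures of this era):
    *
    *    dst_reg dst = get_nir_dest(instr->dest.dest, BRW_REGISTER_TYPE_F);
    *    src_reg src0 = get_nir_src(instr->src[0].src, BRW_REGISTER_TYPE_F, 4);
    *    emit(MOV(dst, src0));
    */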
   dst_reg *nir_locals;
   dst_reg *nir_ssa_values;

protected:
   void emit_vertex();
   void setup_payload_interference(struct ra_graph *g, int first_payload_node,
                                   int reg_node_count);
   virtual void setup_payload() = 0;
   virtual void emit_prolog() = 0;
   virtual void emit_thread_end() = 0;
   virtual void emit_urb_write_header(int mrf) = 0;
   virtual vec4_instruction *emit_urb_write_opcode(bool complete) = 0;
   virtual void gs_emit_vertex(int stream_id);
   virtual void gs_end_primitive();

private:
   /**
    * If true, then register allocation should fail instead of spilling.
    */
   const bool no_spills;

   int shader_time_index;

   unsigned last_scratch; /**< measured in 32-byte (register size) units */
};
} /* namespace brw */
#endif /* __cplusplus */
#endif /* BRW_VEC4_H */