diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index adc273844a0..6fe2c04d985 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -2940,6 +2940,18 @@ fs_visitor::allocate_registers(bool allow_spilling)
 
    debug_optimizer(nir, "post_ra_alloc_scheduling", 96, 2);
 
+   /* Lowering VGRF to FIXED_GRF is currently done as a separate pass instead
+    * of part of assign_regs since both bank conflicts optimization and post
+    * RA scheduling take advantage of distinguishing references to registers
+    * that were allocated from references that were already fixed.
+    *
+    * TODO: Change the passes above, then move this lowering to be part of
+    * assign_regs.
+    */
+   brw_fs_lower_vgrfs_to_fixed_grfs(*this);
+
+   debug_optimizer(nir, "lowered_vgrfs_to_fixed_grfs", 96, 3);
+
    if (last_scratch > 0) {
       ASSERTED unsigned max_scratch_size = 2 * 1024 * 1024;
 
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index 930aaeac85b..36acbc5d080 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -608,6 +608,7 @@ bool brw_fs_lower_sends_overlapping_payload(fs_visitor &s);
 bool brw_fs_lower_simd_width(fs_visitor &s);
 bool brw_fs_lower_sub_sat(fs_visitor &s);
 bool brw_fs_lower_uniform_pull_constant_loads(fs_visitor &s);
+void brw_fs_lower_vgrfs_to_fixed_grfs(fs_visitor &s);
 
 bool brw_fs_opt_algebraic(fs_visitor &s);
 bool brw_fs_opt_bank_conflicts(fs_visitor &s);
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index 9a10ab0d61b..cd32c40d405 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -64,80 +64,13 @@ brw_math_function(enum opcode op)
    }
 }
 
-
-static enum brw_reg_file
-brw_file_from_reg(fs_reg *reg)
-{
-   switch (reg->file) {
-   case ARF:
-      return BRW_ARCHITECTURE_REGISTER_FILE;
-   case FIXED_GRF:
-   case VGRF:
-      return BRW_GENERAL_REGISTER_FILE;
-   case IMM:
-      return BRW_IMMEDIATE_VALUE;
-   case BAD_FILE:
-   case ATTR:
-   case UNIFORM:
-      unreachable("not reached");
-   }
-   return BRW_ARCHITECTURE_REGISTER_FILE;
-}
-
 static struct brw_reg
 brw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst,
-                    fs_reg *reg, bool compressed)
+                    fs_reg *reg)
 {
    struct brw_reg brw_reg;
 
    switch (reg->file) {
-   case VGRF:
-      if (reg->stride == 0) {
-         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
-      } else {
-         /* From the Haswell PRM:
-          *
-          *  "VertStride must be used to cross GRF register boundaries. This
-          *   rule implies that elements within a 'Width' cannot cross GRF
-          *   boundaries."
-          *
-          * The maximum width value that could satisfy this restriction is:
-          */
-         const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
-
-         /* Because the hardware can only split source regions at a whole
-          * multiple of width during decompression (i.e. vertically), clamp
-          * the value obtained above to the physical execution size of a
-          * single decompressed chunk of the instruction:
-          */
-         const unsigned phys_width = compressed ? inst->exec_size / 2 :
-                                     inst->exec_size;
-
-         const unsigned max_hw_width = 16;
-
-         /* XXX - The equation above is strictly speaking not correct on
-          *       hardware that supports unbalanced GRF writes -- On Gfx9+
-          *       each decompressed chunk of the instruction may have a
-          *       different execution size when the number of components
-          *       written to each destination GRF is not the same.
-          */
-         if (reg->stride > 4) {
-            assert(reg != &inst->dst);
-            assert(reg->stride * type_sz(reg->type) <= REG_SIZE);
-            brw_reg = brw_vecn_reg(1, brw_file_from_reg(reg), reg->nr, 0);
-            brw_reg = stride(brw_reg, reg->stride, 1, 0);
-         } else {
-            const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
-            brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
-            brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
-         }
-      }
-
-      brw_reg = retype(brw_reg, reg->type);
-      brw_reg = byte_offset(brw_reg, reg->offset);
-      brw_reg.abs = reg->abs;
-      brw_reg.negate = reg->negate;
-      break;
    case ARF:
    case FIXED_GRF:
    case IMM:
@@ -148,6 +81,7 @@ brw_reg_from_fs_reg(const struct intel_device_info *devinfo, fs_inst *inst,
       /* Probably unused. */
       brw_reg = brw_null_reg();
       break;
+   case VGRF:
    case ATTR:
    case UNIFORM:
       unreachable("not reached");
@@ -913,22 +847,6 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
       if (unlikely(debug_flag))
          disasm_annotate(disasm_info, inst, p->next_insn_offset);
 
-      /* If the instruction writes to more than one register, it needs to be
-       * explicitly marked as compressed on Gen <= 5. On Gen >= 6 the
-       * hardware figures out by itself what the right compression mode is,
-       * but we still need to know whether the instruction is compressed to
-       * set up the source register regions appropriately.
-       *
-       * XXX - This is wrong for instructions that write a single register but
-       *       read more than one which should strictly speaking be treated as
-       *       compressed. For instructions that don't write any registers it
-       *       relies on the destination being a null register of the correct
-       *       type and regioning so the instruction is considered compressed
-       *       or not accordingly.
-       */
-      const bool compressed =
-         inst->dst.component_size(inst->exec_size) > REG_SIZE;
-
       if (devinfo->ver >= 20 && inst->group % 8 != 0) {
          assert(inst->force_writemask_all);
          assert(!inst->predicate && !inst->conditional_mod);
@@ -941,8 +859,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
       }
 
       for (unsigned int i = 0; i < inst->sources; i++) {
-         src[i] = brw_reg_from_fs_reg(devinfo, inst,
-                                      &inst->src[i], compressed);
+         src[i] = brw_reg_from_fs_reg(devinfo, inst, &inst->src[i]);
         /* The accumulator result appears to get used for the
          * conditional modifier generation. When negating a UD
          * value, there is a 33rd bit generated for the sign in the
@@ -953,8 +870,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width,
                 inst->src[i].type != BRW_REGISTER_TYPE_UD ||
                 !inst->src[i].negate);
       }
-      dst = brw_reg_from_fs_reg(devinfo, inst,
-                                &inst->dst, compressed);
+      dst = brw_reg_from_fs_reg(devinfo, inst, &inst->dst);
 
       brw_set_default_access_mode(p, BRW_ALIGN_1);
       brw_set_default_predicate_control(p, inst->predicate);
diff --git a/src/intel/compiler/brw_fs_lower.cpp b/src/intel/compiler/brw_fs_lower.cpp
index c47d42e7741..51cb76165a1 100644
--- a/src/intel/compiler/brw_fs_lower.cpp
+++ b/src/intel/compiler/brw_fs_lower.cpp
@@ -612,3 +612,95 @@ brw_fs_lower_alu_restrictions(fs_visitor &s)
 
    return progress;
 }
+
+static void
+brw_fs_lower_vgrf_to_fixed_grf(const struct intel_device_info *devinfo, fs_inst *inst,
+                               fs_reg *reg, bool compressed)
+{
+   if (reg->file != VGRF)
+      return;
+
+   struct brw_reg new_reg;
+
+   if (reg->stride == 0) {
+      new_reg = brw_vec1_grf(reg->nr, 0);
+   } else if (reg->stride > 4) {
+      assert(reg != &inst->dst);
+      assert(reg->stride * type_sz(reg->type) <= REG_SIZE);
+      new_reg = brw_vecn_grf(1, reg->nr, 0);
+      new_reg = stride(new_reg, reg->stride, 1, 0);
+   } else {
+      /* From the Haswell PRM:
+       *
+       *  "VertStride must be used to cross GRF register boundaries. This
+       *   rule implies that elements within a 'Width' cannot cross GRF
+       *   boundaries."
+       *
+       * The maximum width value that could satisfy this restriction is:
+       */
+      const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
+
+      /* Because the hardware can only split source regions at a whole
+       * multiple of width during decompression (i.e. vertically), clamp
+       * the value obtained above to the physical execution size of a
+       * single decompressed chunk of the instruction:
+       */
+      const bool compressed = inst->dst.component_size(inst->exec_size) > REG_SIZE;
+      const unsigned phys_width = compressed ? inst->exec_size / 2 :
+                                  inst->exec_size;
+
+      /* XXX - The equation above is strictly speaking not correct on
+       *       hardware that supports unbalanced GRF writes -- On Gfx9+
+       *       each decompressed chunk of the instruction may have a
+       *       different execution size when the number of components
+       *       written to each destination GRF is not the same.
+       */
+
+      const unsigned max_hw_width = 16;
+
+      const unsigned width = MIN3(reg_width, phys_width, max_hw_width);
+      new_reg = brw_vecn_grf(width, reg->nr, 0);
+      new_reg = stride(new_reg, width * reg->stride, width, reg->stride);
+   }
+
+   new_reg = retype(new_reg, reg->type);
+   new_reg = byte_offset(new_reg, reg->offset);
+   new_reg.abs = reg->abs;
+   new_reg.negate = reg->negate;
+
+   *reg = new_reg;
+}
+
+void
+brw_fs_lower_vgrfs_to_fixed_grfs(fs_visitor &s)
+{
+   assert(s.grf_used || !"Must be called after register allocation");
+
+   foreach_block_and_inst(block, fs_inst, inst, s.cfg) {
+      /* If the instruction writes to more than one register, it needs to be
+       * explicitly marked as compressed on Gen <= 5. On Gen >= 6 the
+       * hardware figures out by itself what the right compression mode is,
+       * but we still need to know whether the instruction is compressed to
+       * set up the source register regions appropriately.
+       *
+       * XXX - This is wrong for instructions that write a single register but
+       *       read more than one which should strictly speaking be treated as
+       *       compressed. For instructions that don't write any registers it
+       *       relies on the destination being a null register of the correct
+       *       type and regioning so the instruction is considered compressed
+       *       or not accordingly.
+       */
+
+      const bool compressed =
+         inst->dst.component_size(inst->exec_size) > REG_SIZE;
+
+      brw_fs_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->dst, compressed);
+      for (int i = 0; i < inst->sources; i++) {
+         brw_fs_lower_vgrf_to_fixed_grf(s.devinfo, inst, &inst->src[i], compressed);
+      }
+   }
+
+   s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DATA_FLOW |
+                         DEPENDENCY_VARIABLES);
+}
+
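
A standalone sketch of the region-width clamping that brw_fs_lower_vgrf_to_fixed_grf applies to a strided VGRF, for readers who want to check the arithmetic outside the patch. It is illustrative only: pick_region_width is a made-up name, REG_SIZE is hard-coded to the 32-byte GRF size assumed above, and MIN3 stands in for the macro Mesa provides.

#include <stdbool.h>
#include <stdio.h>

#define REG_SIZE 32   /* bytes per GRF assumed for this sketch */
#define MIN3(a, b, c) ((a) < (b) ? ((a) < (c) ? (a) : (c)) \
                                 : ((b) < (c) ? (b) : (c)))

/* Mirrors the three clamps in the lowering: stay within one GRF (the
 * Haswell PRM VertStride rule), within one decompressed chunk of the
 * instruction, and within the hardware's maximum region width of 16. */
static unsigned
pick_region_width(unsigned stride, unsigned type_size,
                  unsigned exec_size, bool compressed)
{
   const unsigned reg_width = REG_SIZE / (stride * type_size);
   const unsigned phys_width = compressed ? exec_size / 2 : exec_size;
   const unsigned max_hw_width = 16;

   return MIN3(reg_width, phys_width, max_hw_width);
}

int main(void)
{
   /* SIMD16 float source with stride 1: the destination spans two GRFs,
    * so the instruction counts as compressed and the width clamps to 8. */
   const unsigned stride = 1;
   const unsigned width = pick_region_width(stride, 4, 16, true);

   /* The lowering then builds the region <width * stride; width, stride>,
    * i.e. <8;8,1> here, exactly what the stride() call in the patch emits. */
   printf("<%u;%u,%u>\n", width * stride, width, stride);
   return 0;
}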