diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp index d3aec055a77..7892a69d981 100644 --- a/src/intel/compiler/brw_fs.cpp +++ b/src/intel/compiler/brw_fs.cpp @@ -2132,77 +2132,6 @@ fs_visitor::get_pull_locs(const fs_reg &src, return true; } -/** - * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD - * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. - */ -bool -brw_fs_lower_constant_loads(fs_visitor &s) -{ - unsigned index, pull_index; - bool progress = false; - - foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { - /* Set up the annotation tracking for new generated instructions. */ - const fs_builder ibld(&s, block, inst); - - for (int i = 0; i < inst->sources; i++) { - if (inst->src[i].file != UNIFORM) - continue; - - /* We'll handle this case later */ - if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) - continue; - - if (!s.get_pull_locs(inst->src[i], &index, &pull_index)) - continue; - - assert(inst->src[i].stride == 0); - - const unsigned block_sz = 64; /* Fetch one cacheline at a time. */ - const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0); - const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); - const unsigned base = pull_index * 4; - - fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; - srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index); - srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); - srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); - - - ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst, - srcs, PULL_UNIFORM_CONSTANT_SRCS); - - /* Rewrite the instruction to use the temporary VGRF. */ - inst->src[i].file = VGRF; - inst->src[i].nr = dst.nr; - inst->src[i].offset = (base & (block_sz - 1)) + - inst->src[i].offset % 4; - - progress = true; - } - - if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && - inst->src[0].file == UNIFORM) { - - if (!s.get_pull_locs(inst->src[0], &index, &pull_index)) - continue; - - s.VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, - brw_imm_ud(index), - fs_reg() /* surface_handle */, - inst->src[1], - pull_index * 4, 4, 1); - inst->remove(block); - - progress = true; - } - } - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} - /** * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE * instructions to FS_OPCODE_REP_FB_WRITE. @@ -2276,114 +2205,6 @@ fs_visitor::emit_repclear_shader() brw_fs_lower_scoreboard(*this); } -bool -brw_fs_lower_load_payload(fs_visitor &s) -{ - bool progress = false; - - foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { - if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) - continue; - - assert(inst->dst.file == MRF || inst->dst.file == VGRF); - assert(inst->saturate == false); - fs_reg dst = inst->dst; - - /* Get rid of COMPR4. We'll add it back in if we need it */ - if (dst.file == MRF) - dst.nr = dst.nr & ~BRW_MRF_COMPR4; - - const fs_builder ibld(&s, block, inst); - const fs_builder ubld = ibld.exec_all(); - - for (uint8_t i = 0; i < inst->header_size;) { - /* Number of header GRFs to initialize at once with a single MOV - * instruction. - */ - const unsigned n = - (i + 1 < inst->header_size && inst->src[i].stride == 1 && - inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ? 
- 2 : 1; - - if (inst->src[i].file != BAD_FILE) - ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD), - retype(inst->src[i], BRW_REGISTER_TYPE_UD)); - - dst = byte_offset(dst, n * REG_SIZE); - i += n; - } - - if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) && - inst->exec_size > 8) { - /* In this case, the payload portion of the LOAD_PAYLOAD isn't - * a straightforward copy. Instead, the result of the - * LOAD_PAYLOAD is treated as interleaved and the first four - * non-header sources are unpacked as: - * - * m + 0: r0 - * m + 1: g0 - * m + 2: b0 - * m + 3: a0 - * m + 4: r1 - * m + 5: g1 - * m + 6: b1 - * m + 7: a1 - * - * This is used for gen <= 5 fb writes. - */ - assert(inst->exec_size == 16); - assert(inst->header_size + 4 <= inst->sources); - for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) { - if (inst->src[i].file != BAD_FILE) { - if (s.devinfo->has_compr4) { - fs_reg compr4_dst = retype(dst, inst->src[i].type); - compr4_dst.nr |= BRW_MRF_COMPR4; - ibld.MOV(compr4_dst, inst->src[i]); - } else { - /* Platform doesn't have COMPR4. We have to fake it */ - fs_reg mov_dst = retype(dst, inst->src[i].type); - ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0)); - mov_dst.nr += 4; - ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1)); - } - } - - dst.nr++; - } - - /* The loop above only ever incremented us through the first set - * of 4 registers. However, thanks to the magic of COMPR4, we - * actually wrote to the first 8 registers, so we need to take - * that into account now. - */ - dst.nr += 4; - - /* The COMPR4 code took care of the first 4 sources. We'll let - * the regular path handle any remaining sources. Yes, we are - * modifying the instruction but we're about to delete it so - * this really doesn't hurt anything. - */ - inst->header_size += 4; - } - - for (uint8_t i = inst->header_size; i < inst->sources; i++) { - dst.type = inst->src[i].type; - if (inst->src[i].file != BAD_FILE) { - ibld.MOV(dst, inst->src[i]); - } - dst = offset(dst, ibld, 1); - } - - inst->remove(block); - progress = true; - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); - - return progress; -} - /** * Factor an unsigned 32-bit integer. * @@ -2879,95 +2700,6 @@ brw_fs_lower_integer_multiplication(fs_visitor &s) return progress; } -bool -brw_fs_lower_sub_sat(fs_visitor &s) -{ - bool progress = false; - - foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { - const fs_builder ibld(&s, block, inst); - - if (inst->opcode == SHADER_OPCODE_USUB_SAT || - inst->opcode == SHADER_OPCODE_ISUB_SAT) { - /* The fundamental problem is the hardware performs source negation - * at the bit width of the source. If the source is 0x80000000D, the - * negation is 0x80000000D. As a result, subtractSaturate(0, - * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There - * are at least three ways to resolve this: - * - * 1. Use the accumulator for the negated source. The accumulator is - * 33 bits, so our source 0x80000000 is sign-extended to - * 0x1800000000. The negation of which is 0x080000000. This - * doesn't help for 64-bit integers (which are already bigger than - * 33 bits). There are also only 8 accumulators, so SIMD16 or - * SIMD32 instructions would have to be split into multiple SIMD8 - * instructions. - * - * 2. Use slightly different math. For any n-bit value x, we know (x - * >> 1) != -(x >> 1). We can use this fact to only do - * subtractions involving (x >> 1). 
subtractSaturate(a, b) == - * subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)). - * - * 3. For unsigned sources, it is sufficient to replace the - * subtractSaturate with (a > b) ? a - b : 0. - * - * It may also be possible to use the SUBB instruction. This - * implicitly writes the accumulator, so it could only be used in the - * same situations as #1 above. It is further limited by only - * allowing UD sources. - */ - if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q && - inst->src[0].type != BRW_REGISTER_TYPE_UQ) { - fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type); - - ibld.MOV(acc, inst->src[1]); - fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]); - add->saturate = true; - add->src[0].negate = true; - } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) { - /* tmp = src1 >> 1; - * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp)); - */ - fs_reg tmp1 = ibld.vgrf(inst->src[0].type); - fs_reg tmp2 = ibld.vgrf(inst->src[0].type); - fs_reg tmp3 = ibld.vgrf(inst->src[0].type); - fs_inst *add; - - ibld.SHR(tmp1, inst->src[1], brw_imm_d(1)); - - add = ibld.ADD(tmp2, inst->src[1], tmp1); - add->src[1].negate = true; - - add = ibld.ADD(tmp3, inst->src[0], tmp1); - add->src[1].negate = true; - add->saturate = true; - - add = ibld.ADD(inst->dst, tmp3, tmp2); - add->src[1].negate = true; - add->saturate = true; - } else { - /* a > b ? a - b : 0 */ - ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], - BRW_CONDITIONAL_G); - - fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]); - add->src[1].negate = !add->src[1].negate; - - ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0)) - ->predicate = BRW_PREDICATE_NORMAL; - } - - inst->remove(block); - progress = true; - } - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - - return progress; -} - /** * Get the mask of SIMD channels enabled during dispatch and not yet disabled * by discard. Due to the layout of the sample mask in the fragment shader @@ -4029,226 +3761,6 @@ brw_fs_lower_simd_width(fs_visitor &s) return progress; } -/** - * Transform barycentric vectors into the interleaved form expected by the PLN - * instruction and returned by the Gfx7+ PI shared function. - * - * For channels 0-15 in SIMD16 mode they are expected to be laid out as - * follows in the register file: - * - * rN+0: X[0-7] - * rN+1: Y[0-7] - * rN+2: X[8-15] - * rN+3: Y[8-15] - * - * There is no need to handle SIMD32 here -- This is expected to be run after - * SIMD lowering, since SIMD lowering relies on vectors having the standard - * component layout. 
- */ -bool -brw_fs_lower_barycentrics(fs_visitor &s) -{ - const intel_device_info *devinfo = s.devinfo; - const bool has_interleaved_layout = devinfo->has_pln || - (devinfo->ver >= 7 && devinfo->ver < 20); - bool progress = false; - - if (s.stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout) - return false; - - foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { - if (inst->exec_size < 16) - continue; - - const fs_builder ibld(&s, block, inst); - const fs_builder ubld = ibld.exec_all().group(8, 0); - - switch (inst->opcode) { - case FS_OPCODE_LINTERP : { - assert(inst->exec_size == 16); - const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2); - fs_reg srcs[4]; - - for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++) - srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2), - 8 * (i / 2)); - - ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs)); - - inst->src[0] = tmp; - progress = true; - break; - } - case FS_OPCODE_INTERPOLATE_AT_SAMPLE: - case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: - case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: { - assert(inst->exec_size == 16); - const fs_reg tmp = ibld.vgrf(inst->dst.type, 2); - - for (unsigned i = 0; i < 2; i++) { - for (unsigned g = 0; g < inst->exec_size / 8; g++) { - fs_inst *mov = ibld.at(block, inst->next).group(8, g) - .MOV(horiz_offset(offset(inst->dst, ibld, i), - 8 * g), - offset(tmp, ubld, 2 * g + i)); - mov->predicate = inst->predicate; - mov->predicate_inverse = inst->predicate_inverse; - mov->flag_subreg = inst->flag_subreg; - } - } - - inst->dst = tmp; - progress = true; - break; - } - default: - break; - } - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - - return progress; -} - -/** - * Lower a derivative instruction as the floating-point difference of two - * swizzles of the source, specified as \p swz0 and \p swz1. - */ -static bool -lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst, - unsigned swz0, unsigned swz1) -{ - const fs_builder ubld = fs_builder(&s, block, inst).exec_all(); - const fs_reg tmp0 = ubld.vgrf(inst->src[0].type); - const fs_reg tmp1 = ubld.vgrf(inst->src[0].type); - - ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0)); - ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1)); - - inst->resize_sources(2); - inst->src[0] = negate(tmp0); - inst->src[1] = tmp1; - inst->opcode = BRW_OPCODE_ADD; - - return true; -} - -/** - * Lower derivative instructions on platforms where codegen cannot implement - * them efficiently (i.e. XeHP). 
- */ -bool -brw_fs_lower_derivatives(fs_visitor &s) -{ - bool progress = false; - - if (s.devinfo->verx10 < 125) - return false; - - foreach_block_and_inst(block, fs_inst, inst, s.cfg) { - if (inst->opcode == FS_OPCODE_DDX_COARSE) - progress |= lower_derivative(s, block, inst, - BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY); - - else if (inst->opcode == FS_OPCODE_DDX_FINE) - progress |= lower_derivative(s, block, inst, - BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW); - - else if (inst->opcode == FS_OPCODE_DDY_COARSE) - progress |= lower_derivative(s, block, inst, - BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ); - - else if (inst->opcode == FS_OPCODE_DDY_FINE) - progress |= lower_derivative(s, block, inst, - BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW); - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - - return progress; -} - -bool -brw_fs_lower_find_live_channel(fs_visitor &s) -{ - bool progress = false; - - if (s.devinfo->ver < 8) - return false; - - bool packed_dispatch = - brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons, - s.stage_prog_data); - bool vmask = - s.stage == MESA_SHADER_FRAGMENT && - brw_wm_prog_data(s.stage_prog_data)->uses_vmask; - - foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { - if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL && - inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL) - continue; - - bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL; - - /* Getting the first active channel index is easy on Gfx8: Just find - * the first bit set in the execution mask. The register exists on - * HSW already but it reads back as all ones when the current - * instruction has execution masking disabled, so it's kind of - * useless there. - */ - fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); - - const fs_builder ibld(&s, block, inst); - if (!inst->is_partial_write()) - ibld.emit_undef_for_dst(inst); - - const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0); - - /* ce0 doesn't consider the thread dispatch mask (DMask or VMask), - * so combine the execution and dispatch masks to obtain the true mask. - * - * If we're looking for the first live channel, and we have packed - * dispatch, we can skip this step, as we know all dispatched channels - * will appear at the front of the mask. - */ - if (!(first && packed_dispatch)) { - fs_reg mask = ubld.vgrf(BRW_REGISTER_TYPE_UD); - ubld.UNDEF(mask); - ubld.emit(SHADER_OPCODE_READ_SR_REG, mask, brw_imm_ud(vmask ? 3 : 2)); - - /* Quarter control has the effect of magically shifting the value of - * ce0 so you'll get the first/last active channel relative to the - * specified quarter control as result. - */ - if (inst->group > 0) - ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8))); - - ubld.AND(mask, exec_mask, mask); - exec_mask = mask; - } - - if (first) { - ubld.FBL(inst->dst, exec_mask); - } else { - fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 1); - ubld.UNDEF(tmp); - ubld.LZD(tmp, exec_mask); - ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31)); - } - - inst->remove(block); - progress = true; - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - - return progress; -} - void fs_visitor::dump_instructions_to_file(FILE *file) const { @@ -4559,78 +4071,6 @@ fs_visitor::debug_optimizer(const nir_shader *nir, free(filename); } -/** - * From the Skylake PRM Vol. 2a docs for sends: - * - * "It is required that the second block of GRFs does not overlap with the - * first block." 
- * - * There are plenty of cases where we may accidentally violate this due to - * having, for instance, both sources be the constant 0. This little pass - * just adds a new vgrf for the second payload and copies it over. - */ -bool -brw_fs_lower_sends_overlapping_payload(fs_visitor &s) -{ - bool progress = false; - - foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { - if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 && - regions_overlap(inst->src[2], inst->mlen * REG_SIZE, - inst->src[3], inst->ex_mlen * REG_SIZE)) { - fs_reg tmp = fs_reg(VGRF, s.alloc.allocate(inst->ex_mlen), - BRW_REGISTER_TYPE_UD); - /* Sadly, we've lost all notion of channels and bit sizes at this - * point. Just WE_all it. - */ - const fs_builder ibld = fs_builder(&s, block, inst).exec_all().group(16, 0); - fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD); - fs_reg copy_dst = tmp; - for (unsigned i = 0; i < inst->ex_mlen; i += 2) { - if (inst->ex_mlen == i + 1) { - /* Only one register left; do SIMD8 */ - ibld.group(8, 0).MOV(copy_dst, copy_src); - } else { - ibld.MOV(copy_dst, copy_src); - } - copy_src = offset(copy_src, ibld, 1); - copy_dst = offset(copy_dst, ibld, 1); - } - inst->src[3] = tmp; - progress = true; - } - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); - - return progress; -} - -/** - * Three source instruction must have a GRF/MRF destination register. - * ARF NULL is not allowed. Fix that up by allocating a temporary GRF. - */ -bool -brw_fs_lower_3src_null_dest(fs_visitor &s) -{ - bool progress = false; - - foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { - if (inst->is_3src(s.compiler) && inst->dst.is_null()) { - inst->dst = fs_reg(VGRF, s.alloc.allocate(s.dispatch_width / 8), - inst->dst.type); - progress = true; - } - } - - if (progress) - s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | - DEPENDENCY_VARIABLES); - - return progress; -} - static bool needs_dummy_fence(const intel_device_info *devinfo, fs_inst *inst) { diff --git a/src/intel/compiler/brw_fs_lower.cpp b/src/intel/compiler/brw_fs_lower.cpp new file mode 100644 index 00000000000..e573fb21b56 --- /dev/null +++ b/src/intel/compiler/brw_fs_lower.cpp @@ -0,0 +1,608 @@ +/* + * Copyright © 2010 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "brw_fs.h" +#include "brw_fs_builder.h" + +using namespace brw; + +/** + * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD + * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. + */ +bool +brw_fs_lower_constant_loads(fs_visitor &s) +{ + unsigned index, pull_index; + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { + /* Set up the annotation tracking for new generated instructions. */ + const fs_builder ibld(&s, block, inst); + + for (int i = 0; i < inst->sources; i++) { + if (inst->src[i].file != UNIFORM) + continue; + + /* We'll handle this case later */ + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) + continue; + + if (!s.get_pull_locs(inst->src[i], &index, &pull_index)) + continue; + + assert(inst->src[i].stride == 0); + + const unsigned block_sz = 64; /* Fetch one cacheline at a time. 
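For example, a hypothetical pull_index of 19 puts base at byte 76: the message emitted below then fetches the aligned 64-byte block at offset 64, and the source is repointed at byte 12 of the temporary, plus any sub-dword offset.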
*/ + const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0); + const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD); + const unsigned base = pull_index * 4; + + fs_reg srcs[PULL_UNIFORM_CONSTANT_SRCS]; + srcs[PULL_UNIFORM_CONSTANT_SRC_SURFACE] = brw_imm_ud(index); + srcs[PULL_UNIFORM_CONSTANT_SRC_OFFSET] = brw_imm_ud(base & ~(block_sz - 1)); + srcs[PULL_UNIFORM_CONSTANT_SRC_SIZE] = brw_imm_ud(block_sz); + + + ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, dst, + srcs, PULL_UNIFORM_CONSTANT_SRCS); + + /* Rewrite the instruction to use the temporary VGRF. */ + inst->src[i].file = VGRF; + inst->src[i].nr = dst.nr; + inst->src[i].offset = (base & (block_sz - 1)) + + inst->src[i].offset % 4; + + progress = true; + } + + if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && + inst->src[0].file == UNIFORM) { + + if (!s.get_pull_locs(inst->src[0], &index, &pull_index)) + continue; + + s.VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst, + brw_imm_ud(index), + fs_reg() /* surface_handle */, + inst->src[1], + pull_index * 4, 4, 1); + inst->remove(block); + + progress = true; + } + } + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +bool +brw_fs_lower_load_payload(fs_visitor &s) +{ + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { + if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD) + continue; + + assert(inst->dst.file == MRF || inst->dst.file == VGRF); + assert(inst->saturate == false); + fs_reg dst = inst->dst; + + /* Get rid of COMPR4. We'll add it back in if we need it */ + if (dst.file == MRF) + dst.nr = dst.nr & ~BRW_MRF_COMPR4; + + const fs_builder ibld(&s, block, inst); + const fs_builder ubld = ibld.exec_all(); + + for (uint8_t i = 0; i < inst->header_size;) { + /* Number of header GRFs to initialize at once with a single MOV + * instruction. + */ + const unsigned n = + (i + 1 < inst->header_size && inst->src[i].stride == 1 && + inst->src[i + 1].equals(byte_offset(inst->src[i], REG_SIZE))) ? + 2 : 1; + + if (inst->src[i].file != BAD_FILE) + ubld.group(8 * n, 0).MOV(retype(dst, BRW_REGISTER_TYPE_UD), + retype(inst->src[i], BRW_REGISTER_TYPE_UD)); + + dst = byte_offset(dst, n * REG_SIZE); + i += n; + } + + if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) && + inst->exec_size > 8) { + /* In this case, the payload portion of the LOAD_PAYLOAD isn't + * a straightforward copy. Instead, the result of the + * LOAD_PAYLOAD is treated as interleaved and the first four + * non-header sources are unpacked as: + * + * m + 0: r0 + * m + 1: g0 + * m + 2: b0 + * m + 3: a0 + * m + 4: r1 + * m + 5: g1 + * m + 6: b1 + * m + 7: a1 + * + * This is used for gen <= 5 fb writes. + */ + assert(inst->exec_size == 16); + assert(inst->header_size + 4 <= inst->sources); + for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) { + if (inst->src[i].file != BAD_FILE) { + if (s.devinfo->has_compr4) { + fs_reg compr4_dst = retype(dst, inst->src[i].type); + compr4_dst.nr |= BRW_MRF_COMPR4; + ibld.MOV(compr4_dst, inst->src[i]); + } else { + /* Platform doesn't have COMPR4. We have to fake it */ + fs_reg mov_dst = retype(dst, inst->src[i].type); + ibld.quarter(0).MOV(mov_dst, quarter(inst->src[i], 0)); + mov_dst.nr += 4; + ibld.quarter(1).MOV(mov_dst, quarter(inst->src[i], 1)); + } + } + + dst.nr++; + } + + /* The loop above only ever incremented us through the first set + * of 4 registers. However, thanks to the magic of COMPR4, we + * actually wrote to the first 8 registers, so we need to take + * that into account now. 
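+ * As an illustration, take a gen5 SIMD16 FB write with header_size == 2 (assumed numbers, purely to make the accounting concrete): the four color sources land in m+2 through m+9, yet the dst.nr++ steps above only advance dst to m+6, so the += 4 below moves it past the second interleaved half, to m+10.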
+ */ + dst.nr += 4; + + /* The COMPR4 code took care of the first 4 sources. We'll let + * the regular path handle any remaining sources. Yes, we are + * modifying the instruction but we're about to delete it so + * this really doesn't hurt anything. + */ + inst->header_size += 4; + } + + for (uint8_t i = inst->header_size; i < inst->sources; i++) { + dst.type = inst->src[i].type; + if (inst->src[i].file != BAD_FILE) { + ibld.MOV(dst, inst->src[i]); + } + dst = offset(dst, ibld, 1); + } + + inst->remove(block); + progress = true; + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +bool +brw_fs_lower_minmax(fs_visitor &s) +{ + assert(s.devinfo->ver < 6); + + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { + const fs_builder ibld(&s, block, inst); + + if (inst->opcode == BRW_OPCODE_SEL && + inst->predicate == BRW_PREDICATE_NONE) { + /* If src1 is an immediate value that is not NaN, then it can't be + * NaN. In that case, emit CMP because it is much better for cmod + * propagation. Likewise if src1 is not float. Gfx4 and Gfx5 don't + * support HF or DF, so it is not necessary to check for those. + */ + if (inst->src[1].type != BRW_REGISTER_TYPE_F || + (inst->src[1].file == IMM && !isnan(inst->src[1].f))) { + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + inst->conditional_mod); + } else { + ibld.CMPN(ibld.null_reg_d(), inst->src[0], inst->src[1], + inst->conditional_mod); + } + inst->predicate = BRW_PREDICATE_NORMAL; + inst->conditional_mod = BRW_CONDITIONAL_NONE; + + progress = true; + } + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS); + + return progress; +} + +bool +brw_fs_lower_sub_sat(fs_visitor &s) +{ + bool progress = false; + + foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { + const fs_builder ibld(&s, block, inst); + + if (inst->opcode == SHADER_OPCODE_USUB_SAT || + inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* The fundamental problem is the hardware performs source negation + * at the bit width of the source. If the source is 0x80000000D, the + * negation is 0x80000000D. As a result, subtractSaturate(0, + * 0x80000000) will produce 0x80000000 instead of 0x7fffffff. There + * are at least three ways to resolve this: + * + * 1. Use the accumulator for the negated source. The accumulator is + * 33 bits, so our source 0x80000000 is sign-extended to + * 0x1800000000. The negation of which is 0x080000000. This + * doesn't help for 64-bit integers (which are already bigger than + * 33 bits). There are also only 8 accumulators, so SIMD16 or + * SIMD32 instructions would have to be split into multiple SIMD8 + * instructions. + * + * 2. Use slightly different math. For any n-bit value x, we know (x + * >> 1) != -(x >> 1). We can use this fact to only do + * subtractions involving (x >> 1). subtractSaturate(a, b) == + * subtractSaturate(subtractSaturate(a, (b >> 1)), b - (b >> 1)). + * + * 3. For unsigned sources, it is sufficient to replace the + * subtractSaturate with (a > b) ? a - b : 0. + * + * It may also be possible to use the SUBB instruction. This + * implicitly writes the accumulator, so it could only be used in the + * same situations as #1 above. It is further limited by only + * allowing UD sources. 
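+ * + * Working the formula in #2 through by hand, assuming the C-style arithmetic shift it implies: for isub_sat(0, 0x80000000), tmp = 0xc0000000, i.e. -2^30, and src1 - tmp = -2^30, so the result is add.sat(add.sat(0, 2^30), 2^30) = 0x7fffffff. Neither negated operand can be 0x80000000, so both negations are exact at the source width.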
+ */ + if (inst->exec_size == 8 && inst->src[0].type != BRW_REGISTER_TYPE_Q && + inst->src[0].type != BRW_REGISTER_TYPE_UQ) { + fs_reg acc(ARF, BRW_ARF_ACCUMULATOR, inst->src[1].type); + + ibld.MOV(acc, inst->src[1]); + fs_inst *add = ibld.ADD(inst->dst, acc, inst->src[0]); + add->saturate = true; + add->src[0].negate = true; + } else if (inst->opcode == SHADER_OPCODE_ISUB_SAT) { + /* tmp = src1 >> 1; + * dst = add.sat(add.sat(src0, -tmp), -(src1 - tmp)); + */ + fs_reg tmp1 = ibld.vgrf(inst->src[0].type); + fs_reg tmp2 = ibld.vgrf(inst->src[0].type); + fs_reg tmp3 = ibld.vgrf(inst->src[0].type); + fs_inst *add; + + ibld.SHR(tmp1, inst->src[1], brw_imm_d(1)); + + add = ibld.ADD(tmp2, inst->src[1], tmp1); + add->src[1].negate = true; + + add = ibld.ADD(tmp3, inst->src[0], tmp1); + add->src[1].negate = true; + add->saturate = true; + + add = ibld.ADD(inst->dst, tmp3, tmp2); + add->src[1].negate = true; + add->saturate = true; + } else { + /* a > b ? a - b : 0 */ + ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1], + BRW_CONDITIONAL_G); + + fs_inst *add = ibld.ADD(inst->dst, inst->src[0], inst->src[1]); + add->src[1].negate = !add->src[1].negate; + + ibld.SEL(inst->dst, inst->dst, brw_imm_ud(0)) + ->predicate = BRW_PREDICATE_NORMAL; + } + + inst->remove(block); + progress = true; + } + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +/** + * Transform barycentric vectors into the interleaved form expected by the PLN + * instruction and returned by the Gfx7+ PI shared function. + * + * For channels 0-15 in SIMD16 mode they are expected to be laid out as + * follows in the register file: + * + * rN+0: X[0-7] + * rN+1: Y[0-7] + * rN+2: X[8-15] + * rN+3: Y[8-15] + * + * There is no need to handle SIMD32 here -- This is expected to be run after + * SIMD lowering, since SIMD lowering relies on vectors having the standard + * component layout. 
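+ * + * For example, the LOAD_PAYLOAD emitted for a SIMD16 LINTERP regroups a barycentric source stored as the standard vector { X[0-15], Y[0-15] }, two GRFs per component, into { X[0-7], Y[0-7], X[8-15], Y[8-15] }, one GRF per payload slot, which is exactly the rN layout above.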
+ */ +bool +brw_fs_lower_barycentrics(fs_visitor &s) +{ + const intel_device_info *devinfo = s.devinfo; + const bool has_interleaved_layout = devinfo->has_pln || + (devinfo->ver >= 7 && devinfo->ver < 20); + bool progress = false; + + if (s.stage != MESA_SHADER_FRAGMENT || !has_interleaved_layout) + return false; + + foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { + if (inst->exec_size < 16) + continue; + + const fs_builder ibld(&s, block, inst); + const fs_builder ubld = ibld.exec_all().group(8, 0); + + switch (inst->opcode) { + case FS_OPCODE_LINTERP : { + assert(inst->exec_size == 16); + const fs_reg tmp = ibld.vgrf(inst->src[0].type, 2); + fs_reg srcs[4]; + + for (unsigned i = 0; i < ARRAY_SIZE(srcs); i++) + srcs[i] = horiz_offset(offset(inst->src[0], ibld, i % 2), + 8 * (i / 2)); + + ubld.LOAD_PAYLOAD(tmp, srcs, ARRAY_SIZE(srcs), ARRAY_SIZE(srcs)); + + inst->src[0] = tmp; + progress = true; + break; + } + case FS_OPCODE_INTERPOLATE_AT_SAMPLE: + case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: + case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: { + assert(inst->exec_size == 16); + const fs_reg tmp = ibld.vgrf(inst->dst.type, 2); + + for (unsigned i = 0; i < 2; i++) { + for (unsigned g = 0; g < inst->exec_size / 8; g++) { + fs_inst *mov = ibld.at(block, inst->next).group(8, g) + .MOV(horiz_offset(offset(inst->dst, ibld, i), + 8 * g), + offset(tmp, ubld, 2 * g + i)); + mov->predicate = inst->predicate; + mov->predicate_inverse = inst->predicate_inverse; + mov->flag_subreg = inst->flag_subreg; + } + } + + inst->dst = tmp; + progress = true; + break; + } + default: + break; + } + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +/** + * Lower a derivative instruction as the floating-point difference of two + * swizzles of the source, specified as \p swz0 and \p swz1. + */ +static bool +lower_derivative(fs_visitor &s, bblock_t *block, fs_inst *inst, + unsigned swz0, unsigned swz1) +{ + const fs_builder ubld = fs_builder(&s, block, inst).exec_all(); + const fs_reg tmp0 = ubld.vgrf(inst->src[0].type); + const fs_reg tmp1 = ubld.vgrf(inst->src[0].type); + + ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp0, inst->src[0], brw_imm_ud(swz0)); + ubld.emit(SHADER_OPCODE_QUAD_SWIZZLE, tmp1, inst->src[0], brw_imm_ud(swz1)); + + inst->resize_sources(2); + inst->src[0] = negate(tmp0); + inst->src[1] = tmp1; + inst->opcode = BRW_OPCODE_ADD; + + return true; +} + +/** + * Lower derivative instructions on platforms where codegen cannot implement + * them efficiently (i.e. XeHP). 
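+ * + * Each lowered opcode becomes ADD(dst, -swz0(src), swz1(src)). Taking FS_OPCODE_DDX_FINE as an example, the quad channels {a, b, c, d} are swizzled to {a, a, c, c} and {b, b, d, d}, so the ADD yields {b-a, b-a, d-c, d-c}: the horizontal difference within each row of the 2x2 quad.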
+ */ +bool +brw_fs_lower_derivatives(fs_visitor &s) +{ + bool progress = false; + + if (s.devinfo->verx10 < 125) + return false; + + foreach_block_and_inst(block, fs_inst, inst, s.cfg) { + if (inst->opcode == FS_OPCODE_DDX_COARSE) + progress |= lower_derivative(s, block, inst, + BRW_SWIZZLE_XXXX, BRW_SWIZZLE_YYYY); + + else if (inst->opcode == FS_OPCODE_DDX_FINE) + progress |= lower_derivative(s, block, inst, + BRW_SWIZZLE_XXZZ, BRW_SWIZZLE_YYWW); + + else if (inst->opcode == FS_OPCODE_DDY_COARSE) + progress |= lower_derivative(s, block, inst, + BRW_SWIZZLE_XXXX, BRW_SWIZZLE_ZZZZ); + + else if (inst->opcode == FS_OPCODE_DDY_FINE) + progress |= lower_derivative(s, block, inst, + BRW_SWIZZLE_XYXY, BRW_SWIZZLE_ZWZW); + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +bool +brw_fs_lower_find_live_channel(fs_visitor &s) +{ + bool progress = false; + + if (s.devinfo->ver < 8) + return false; + + bool packed_dispatch = + brw_stage_has_packed_dispatch(s.devinfo, s.stage, s.max_polygons, + s.stage_prog_data); + bool vmask = + s.stage == MESA_SHADER_FRAGMENT && + brw_wm_prog_data(s.stage_prog_data)->uses_vmask; + + foreach_block_and_inst_safe(block, fs_inst, inst, s.cfg) { + if (inst->opcode != SHADER_OPCODE_FIND_LIVE_CHANNEL && + inst->opcode != SHADER_OPCODE_FIND_LAST_LIVE_CHANNEL) + continue; + + bool first = inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL; + + /* Getting the first active channel index is easy on Gfx8: Just find + * the first bit set in the execution mask. The register exists on + * HSW already but it reads back as all ones when the current + * instruction has execution masking disabled, so it's kind of + * useless there. + */ + fs_reg exec_mask(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)); + + const fs_builder ibld(&s, block, inst); + if (!inst->is_partial_write()) + ibld.emit_undef_for_dst(inst); + + const fs_builder ubld = fs_builder(&s, block, inst).exec_all().group(1, 0); + + /* ce0 doesn't consider the thread dispatch mask (DMask or VMask), + * so combine the execution and dispatch masks to obtain the true mask. + * + * If we're looking for the first live channel, and we have packed + * dispatch, we can skip this step, as we know all dispatched channels + * will appear at the front of the mask. + */ + if (!(first && packed_dispatch)) { + fs_reg mask = ubld.vgrf(BRW_REGISTER_TYPE_UD); + ubld.UNDEF(mask); + ubld.emit(SHADER_OPCODE_READ_SR_REG, mask, brw_imm_ud(vmask ? 3 : 2)); + + /* Quarter control has the effect of magically shifting the value of + * ce0 so you'll get the first/last active channel relative to the + * specified quarter control as result. + */ + if (inst->group > 0) + ubld.SHR(mask, mask, brw_imm_ud(ALIGN(inst->group, 8))); + + ubld.AND(mask, exec_mask, mask); + exec_mask = mask; + } + + if (first) { + ubld.FBL(inst->dst, exec_mask); + } else { + fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 1); + ubld.UNDEF(tmp); + ubld.LZD(tmp, exec_mask); + ubld.ADD(inst->dst, negate(tmp), brw_imm_uw(31)); + } + + inst->remove(block); + progress = true; + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +/** + * From the Skylake PRM Vol. 2a docs for sends: + * + * "It is required that the second block of GRFs does not overlap with the + * first block." + * + * There are plenty of cases where we may accidentally violate this due to + * having, for instance, both sources be the constant 0. 
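(The trigger condition below is regions_overlap() over the two payload extents, mlen and ex_mlen GRFs respectively.)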
This little pass + * just adds a new vgrf for the second payload and copies it over. + */ +bool +brw_fs_lower_sends_overlapping_payload(fs_visitor &s) +{ + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { + if (inst->opcode == SHADER_OPCODE_SEND && inst->ex_mlen > 0 && + regions_overlap(inst->src[2], inst->mlen * REG_SIZE, + inst->src[3], inst->ex_mlen * REG_SIZE)) { + fs_reg tmp = fs_reg(VGRF, s.alloc.allocate(inst->ex_mlen), + BRW_REGISTER_TYPE_UD); + /* Sadly, we've lost all notion of channels and bit sizes at this + * point. Just WE_all it. + */ + const fs_builder ibld = fs_builder(&s, block, inst).exec_all().group(16, 0); + fs_reg copy_src = retype(inst->src[3], BRW_REGISTER_TYPE_UD); + fs_reg copy_dst = tmp; + for (unsigned i = 0; i < inst->ex_mlen; i += 2) { + if (inst->ex_mlen == i + 1) { + /* Only one register left; do SIMD8 */ + ibld.group(8, 0).MOV(copy_dst, copy_src); + } else { + ibld.MOV(copy_dst, copy_src); + } + copy_src = offset(copy_src, ibld, 1); + copy_dst = offset(copy_dst, ibld, 1); + } + inst->src[3] = tmp; + progress = true; + } + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTIONS | DEPENDENCY_VARIABLES); + + return progress; +} + +/** + * Three-source instructions must have a GRF/MRF destination register. + * ARF NULL is not allowed. Fix that up by allocating a temporary GRF. + */ +bool +brw_fs_lower_3src_null_dest(fs_visitor &s) +{ + bool progress = false; + + foreach_block_and_inst_safe (block, fs_inst, inst, s.cfg) { + if (inst->is_3src(s.compiler) && inst->dst.is_null()) { + inst->dst = fs_reg(VGRF, s.alloc.allocate(s.dispatch_width / 8), + inst->dst.type); + progress = true; + } + } + + if (progress) + s.invalidate_analysis(DEPENDENCY_INSTRUCTION_DETAIL | + DEPENDENCY_VARIABLES); + + return progress; +} + diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index ea6382465c1..16cde743976 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -75,6 +75,7 @@ libintel_compiler_brw_files = files( 'brw_fs.h', 'brw_fs_live_variables.cpp', 'brw_fs_live_variables.h', + 'brw_fs_lower.cpp', 'brw_fs_lower_dpas.cpp', 'brw_fs_lower_pack.cpp', 'brw_fs_lower_regioning.cpp',