diff --git a/src/intel/compiler/jay/jay_builder.h b/src/intel/compiler/jay/jay_builder.h index da22cff6c55..5b3f6ee582d 100644 --- a/src/intel/compiler/jay/jay_builder.h +++ b/src/intel/compiler/jay/jay_builder.h @@ -462,6 +462,7 @@ struct jayb_send_params { enum jay_type src_type[2]; unsigned nr_srcs; uint32_t ex_desc_imm; + int split; /**< explicit split point */ bool eot; bool check_tdr; bool uniform; @@ -538,15 +539,9 @@ _jay_SEND(jay_builder *b, const struct jayb_send_params p) I->src[2] = p.nr_srcs > 0 ? p.srcs[0] : jay_null(); I->src[3] = p.nr_srcs > 1 ? p.srcs[1] : jay_null(); } else { - /* Otherwise, we need to pick a point to split at. - * - * Heuristic: don't split render targer writes becuase RA gets confused - * with the EOT requirements. Split everything else in half. - * - * TODO: Come up with a better heuristic. - */ + /* Otherwise, we need to pick a point to split at. */ assert(info->type_0 == info->type_1); - unsigned split = !p.check_tdr ? (p.nr_srcs / 2) : p.nr_srcs; + unsigned split = p.split > 0 ? p.split : p.nr_srcs / 2; I->src[2] = jay_collect_vectors(b, &p.srcs[0], split); I->src[3] = jay_collect_vectors(b, &p.srcs[split], p.nr_srcs - split); } diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c index c88c4b6918b..ab233b769a6 100644 --- a/src/intel/compiler/jay/jay_from_nir.c +++ b/src/intel/compiler/jay/jay_from_nir.c @@ -804,6 +804,7 @@ jay_emit_fb_write(jay_builder *b, nir_intrinsic_instr *intr) jay_def srcs[4 + 16 + 4 + 1 + 16]; unsigned len = 0; + int split = -1; if (!jay_is_null(src0_alpha)) srcs[len++] = jay_as_gpr(b, src0_alpha); @@ -816,7 +817,16 @@ jay_emit_fb_write(jay_builder *b, nir_intrinsic_instr *intr) if (!jay_is_null(depth)) srcs[len++] = jay_as_gpr(b, depth); - assert(jay_is_null(stencil) && "TODO: stencil"); + if (!jay_is_null(stencil)) { + jay_def packed = jay_alloc_def(b, UGPR, jay_ugpr_per_grf(b->shader)); + jay_BYTE_PACK(b, packed, jay_as_gpr(b, stencil)); + + /* Split send before stencil */ + split = len; + + for (unsigned i = 0; i < jay_num_values(packed); i++) + srcs[len++] = jay_extract(packed, i); + } /* Optimize out unconditional discards (probably should do this in NIR) */ if (nir_src_is_const(intr->src[5]) && nir_src_as_bool(intr->src[5])) { @@ -824,10 +834,15 @@ jay_emit_fb_write(jay_builder *b, nir_intrinsic_instr *intr) srcs[i] = jay_UNDEF_u32(b); } + /* Our current send splitting heuristic is bad, override it. */ + if (split == -1) { + split = len; + } + jay_inst *send = jay_SEND(b, .sfid = BRW_SFID_RENDER_CACHE, .check_tdr = true, .msg_desc = desc | (ex_desc << 32), .srcs = srcs, .nr_srcs = len, - .type = JAY_TYPE_U32, .eot = last); + .type = JAY_TYPE_U32, .eot = last, .split = split); /* Handle the disable predicate. It is logically inverted. */ if (!nir_src_is_const(intr->src[5]) || nir_src_as_bool(intr->src[5])) { diff --git a/src/intel/compiler/jay/jay_nir.c b/src/intel/compiler/jay/jay_nir.c index dfaffb04a25..e3d4b588616 100644 --- a/src/intel/compiler/jay/jay_nir.c +++ b/src/intel/compiler/jay/jay_nir.c @@ -151,7 +151,6 @@ collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_) } else if (loc == FRAG_RESULT_DEPTH) { out = &ctx->depth; } else if (loc == FRAG_RESULT_STENCIL) { - UNREACHABLE("todo"); out = &ctx->stencil; } else if (loc == FRAG_RESULT_SAMPLE_MASK) { UNREACHABLE("todo"); diff --git a/src/intel/compiler/jay/jay_opcodes.py b/src/intel/compiler/jay/jay_opcodes.py index 80bacd8442b..548557ae3c5 100644 --- a/src/intel/compiler/jay/jay_opcodes.py +++ b/src/intel/compiler/jay/jay_opcodes.py @@ -166,6 +166,12 @@ op('extract_byte_per_8lanes', 2, 'u32') op('shr_odd_subspans_by_4', 1, 'u16') op('and_u32_u16', 2, 'u32') +# Copy the first byte of each lane, treating the destination as if it were +# effectively JAY_STRIDE_1 (which doesn't exist). Because the destination +# doesn't follow proper lane alignments, this should not write to GPRs. +# This is used for stencil outputs in render target write messages. +op('byte_pack', 1, 'u32') + # Pixel coord calculations. expand_quad replicates out the per-2x2 values from # its source g0.[10...13] and - in the case of SIMD32 - g1.[10...13] into a # per-lane value. Then offset_packed_pixel_coords adds the appropriate packed diff --git a/src/intel/compiler/jay/jay_opt_propagate.c b/src/intel/compiler/jay/jay_opt_propagate.c index 3718e4b83db..cf37c8369f6 100644 --- a/src/intel/compiler/jay/jay_opt_propagate.c +++ b/src/intel/compiler/jay/jay_opt_propagate.c @@ -153,7 +153,9 @@ propagate_forwards(jay_function *f) } /* Don't propagate into phis yet - TODO: File awareness */ - if (I->op == JAY_OPCODE_PHI_SRC || I->op == JAY_OPCODE_SEND) + if (I->op == JAY_OPCODE_PHI_SRC || + I->op == JAY_OPCODE_SEND || + I->op == JAY_OPCODE_BYTE_PACK) continue; jay_foreach_ssa_src(I, s) { diff --git a/src/intel/compiler/jay/jay_to_binary.c b/src/intel/compiler/jay/jay_to_binary.c index 2aa6408b033..6fdfef4780c 100644 --- a/src/intel/compiler/jay/jay_to_binary.c +++ b/src/intel/compiler/jay/jay_to_binary.c @@ -488,6 +488,11 @@ emit(struct brw_codegen *p, brw_MOV(p, dst, stride(retype(SRC(simd_offs), BRW_TYPE_UB), 1, 8, 0)); break; + case JAY_OPCODE_BYTE_PACK: + brw_MOV(p, stride(retype(dst, BRW_TYPE_UB), 1, 1, 0), + stride(retype(SRC(0), BRW_TYPE_UB), 4, 1, 0)); + break; + case JAY_OPCODE_SHR_ODD_SUBSPANS_BY_4: brw_SHR(p, dst, SRC(0), brw_imm_uv(0x44440000)); break;