diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c index e94adf7b89c..c98a061ab11 100644 --- a/src/intel/compiler/jay/jay_from_nir.c +++ b/src/intel/compiler/jay/jay_from_nir.c @@ -98,7 +98,6 @@ struct nir_to_jay_state { jay_def sampler_state_pointer, scratch_surface; jay_def inline_data; jay_def push_data[512]; - jay_def lane_id; jay_def urb_handle; union { @@ -1365,11 +1364,19 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) break; } - case nir_intrinsic_load_subgroup_invocation: + case nir_intrinsic_load_subgroup_invocation: { + jay_def lid = jay_alloc_def(b, UGPR, s->dispatch_width / 2); + jay_LANE_ID_8(b, jay_extract_range(lid, 0, 4)); + + for (unsigned i = 8; i < s->dispatch_width; i *= 2) { + jay_ADD(b, JAY_TYPE_U16, jay_extract_range(lid, i / 2, i / 2), + jay_extract_range(lid, 0, i / 2), i); + } + /* TODO: Lower this in NIR? */ - jay_CVT(b, JAY_TYPE_U32, dst, nj->payload.lane_id, JAY_TYPE_U16, - JAY_ROUND, 0); + jay_CVT(b, JAY_TYPE_U32, dst, lid, JAY_TYPE_U16, JAY_ROUND, 0); break; + } case nir_intrinsic_demote: case nir_intrinsic_demote_if: @@ -1422,20 +1429,17 @@ jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) } /* We prefer to inverse_ballot by copying a UGPR to the flag. If we have a - * GPR input, we could uniformize (as behaviour is undefined for - * non-uniform inputs) but a lowered bit extract is cheaper than uniformize. + * GPR input, behaviour is undefined for non-uniform inputs. TODO: a lowered + * bit extract is cheaper than uniformize, but maybe lower in NIR..? */ case nir_intrinsic_inverse_ballot: { assert(dst.file == FLAG); jay_def x = nj_src(intr->src[0]); if (x.file == GPR) { - jay_def shr = jay_SHR_u32(b, x, nj->payload.lane_id); - jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(), shr, 1); - jay_set_conditional_mod(b, and, dst, GEN_CONDITION_NE); - } else { - jay_MOV(b, dst, x)->type = JAY_TYPE_U | b->shader->dispatch_width; + x = emit_uniformize(nj, x); } + jay_MOV(b, dst, x)->type = JAY_TYPE_U | b->shader->dispatch_width; break; } @@ -2765,18 +2769,6 @@ jay_setup_payload(struct nir_to_jay_state *nj) } s->payload_gprs = p.offsets[GPR]; - - /* Lane ID calculations require &W and therefore are calculated in - * uniform control flow to sidestep RA problems. The easy solution is - * calculating the lane ID in the first block. - * - * XXX: This doesn't work for multi-function. Reconsider. - */ - nj->payload.lane_id = jay_LANE_ID_8_u16(b); - - for (unsigned i = 8; i < s->dispatch_width; i *= 2) { - nj->payload.lane_id = jay_LANE_ID_EXPAND_u16(b, nj->payload.lane_id, i); - } } /* diff --git a/src/intel/compiler/jay/jay_ir.h b/src/intel/compiler/jay/jay_ir.h index 0a7fc576f5d..163767cd013 100644 --- a/src/intel/compiler/jay/jay_ir.h +++ b/src/intel/compiler/jay/jay_ir.h @@ -945,7 +945,7 @@ jay_simd_width_logical(const jay_shader *s, const jay_inst *I) bool simd1 = jay_inst_is_uniform(I) && !I->broadcast_flag; unsigned base = simd1 ? 1 : s->dispatch_width; - /* Handle vectors-of-UGPR operations with special care for 64-bit */ + /* Handle vectors-of-UGPR operations with special care for bitsizes */ unsigned vec_per_channel = jay_type_vector_length(I->type); unsigned dst_size = jay_num_values(I->dst); assert(util_is_aligned(dst_size, vec_per_channel)); @@ -953,6 +953,12 @@ jay_simd_width_logical(const jay_shader *s, const jay_inst *I) if (base == 1 && dst_size > vec_per_channel && I->op != JAY_OPCODE_SEND) { assert(util_is_power_of_two_nonzero(dst_size) && vec_per_channel == 1); base = dst_size; + + if (jay_type_size_bits(I->type) == 8) { + base *= 4; + } else if (jay_type_size_bits(I->type) == 16) { + base *= 2; + } } return base; @@ -985,9 +991,7 @@ jay_is_no_mask(const jay_inst *I) I->op == JAY_OPCODE_QUAD_SWIZZLE || I->op == JAY_OPCODE_DESWIZZLE_EVEN || I->op == JAY_OPCODE_DESWIZZLE_ODD || - I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS || - I->op == JAY_OPCODE_LANE_ID_8 || - I->op == JAY_OPCODE_LANE_ID_EXPAND; + I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS; } /** diff --git a/src/intel/compiler/jay/jay_lower_spill.c b/src/intel/compiler/jay/jay_lower_spill.c index 76f15b4cdc9..48071637b35 100644 --- a/src/intel/compiler/jay/jay_lower_spill.c +++ b/src/intel/compiler/jay/jay_lower_spill.c @@ -50,17 +50,9 @@ void jay_lower_spill(jay_function *func) { jay_builder b = jay_init_builder(func, jay_before_function(func)); + signed ugpr_reservation = -1; /* We reserved a block of UGPRs for our use */ - signed ugpr_reservation = -1, gpr2 = -1; - for (unsigned i = 0; i < func->shader->partition.nr_blocks[GPR]; ++i) { - struct jay_register_block B = func->shader->partition.blocks[GPR][i]; - - if (B.stride == JAY_STRIDE_2) { - gpr2 = B.start_gpr; - } - } - for (unsigned i = 0; i < func->shader->partition.nr_blocks[UGPR]; ++i) { struct jay_register_block B = func->shader->partition.blocks[UGPR][i]; @@ -70,7 +62,6 @@ jay_lower_spill(jay_function *func) } assert(ugpr_reservation >= 0 && "must have reserved something"); - assert(gpr2 >= 0 && "must have a stride-2 gpr"); jay_def sp = jay_bare_reg(UGPR, ugpr_reservation); sp.num_values_m1 = func->shader->dispatch_width - 1; @@ -97,14 +88,17 @@ jay_lower_spill(jay_function *func) jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, tmpu, 4); /* We use a 32-bit strided stack: SP = scratch + (lane ID * 4) */ - jay_def tmp2 = jay_bare_reg(GPR, gpr2); - jay_LANE_ID_8(&b, tmp2); - for (unsigned i = 8; i < b.shader->dispatch_width; i *= 2) { - jay_LANE_ID_EXPAND(&b, tmp2, tmp2, i); + unsigned disp_width = b.shader->dispatch_width; + jay_LANE_ID_8(&b, jay_extract_range_post_ra(sp, 0, 4)); + + for (unsigned i = 8; i < disp_width; i *= 2) { + jay_ADD(&b, JAY_TYPE_U16, jay_extract_range_post_ra(sp, i / 2, i / 2), + jay_extract_range_post_ra(sp, 0, i / 2), i); } - jay_SHL(&b, JAY_TYPE_U16, tmp2, tmp2, util_logbase2(4)); - jay_CVT(&b, JAY_TYPE_U32, sp, tmp2, JAY_TYPE_U16, JAY_ROUND, 0); + jay_def lid = jay_extract_range_post_ra(sp, 0, disp_width / 2); + jay_SHL(&b, JAY_TYPE_U16, lid, lid, util_logbase2(4)); + jay_CVT(&b, JAY_TYPE_U32, sp, lid, JAY_TYPE_U16, JAY_ROUND, 0); if (b.shader->scratch_size) { jay_ADD(&b, JAY_TYPE_U32, sp, sp, b.shader->scratch_size); } diff --git a/src/intel/compiler/jay/jay_opcodes.py b/src/intel/compiler/jay/jay_opcodes.py index 13c218279a0..f3a15e7096b 100644 --- a/src/intel/compiler/jay/jay_opcodes.py +++ b/src/intel/compiler/jay/jay_opcodes.py @@ -156,10 +156,8 @@ op('preload', 0, 'u32', 0, ['unsigned reg']) op('deswizzle_odd', 2, 'f32', 0, ['bool src2_hi']) op('deswizzle_even', 1, 'f32', 0, ['bool src_hi']) -# Calculating the lane ID requires multiple power-of-two steps each involving -# complex architectural features not modelled in the IR. +# Return the UGPR[4] vector (0, 1, 2, 3, 4, 5, 6, 7) as packed 16-bit. op('lane_id_8', 0, 'u16') -op('lane_id_expand', 1, 'u16', 0, ['unsigned width']) # Fill a scalar GPR from a contiguous UGPR[16] range containing words or bytes. # src_type can be either U8 or U16 (only). For U8, stride can be 1 or 2, and diff --git a/src/intel/compiler/jay/jay_register_allocate.c b/src/intel/compiler/jay/jay_register_allocate.c index 40d09cd9a53..d793cdd90b0 100644 --- a/src/intel/compiler/jay/jay_register_allocate.c +++ b/src/intel/compiler/jay/jay_register_allocate.c @@ -643,9 +643,6 @@ pick_regs_from_block(jay_ra_state *ra, unsigned *best_reg, unsigned first) { - bool must_tie = I->op == JAY_OPCODE_LANE_ID_EXPAND; - must_tie &= !is_src; - /* Cross-lane access cannot be SIMD split if the source/destination registers * overlap, but as long as we don't tie those destinations, we're ok. */ @@ -657,9 +654,7 @@ pick_regs_from_block(jay_ra_state *ra, unsigned cost = block_cost; bool tied = !is_src && BITSET_TEST(ra->killed[file], r); - - if (tied ? !may_tie : - (must_tie || BITSET_TEST_COUNT(ra->pinned[file], r, size))) + if (tied ? !may_tie : BITSET_TEST_COUNT(ra->pinned[file], r, size)) continue; /* Try to tie predicated default values, otherwise post-RA lowering needs diff --git a/src/intel/compiler/jay/jay_stride.c b/src/intel/compiler/jay/jay_stride.c index 87d565143ec..ecdfa504c5f 100644 --- a/src/intel/compiler/jay/jay_stride.c +++ b/src/intel/compiler/jay/jay_stride.c @@ -110,13 +110,6 @@ jay_src_stride_minmax(jay_inst *I, unsigned s, bool do_max) max = JAY_STRIDE_4; } - /* "add.u16 r0.8, g1<2>" is not legal. We don't generate this normally yet - * (preferring to burn the upper bits) but it is used internally. - */ - if (I->op == JAY_OPCODE_LANE_ID_EXPAND) { - max = JAY_STRIDE_2; - } - if (restrict_mixed_strides(I, s) && jay_type_size_bits(jay_src_type(I, s)) < jay_type_size_bits(I->type)) { diff --git a/src/intel/compiler/jay/jay_to_binary.c b/src/intel/compiler/jay/jay_to_binary.c index 75f1f2a41b2..41a91fb3ca5 100644 --- a/src/intel/compiler/jay/jay_to_binary.c +++ b/src/intel/compiler/jay/jay_to_binary.c @@ -195,7 +195,7 @@ to_gen_operand(jay_function *f, R = gen_retype(gen_restride(R, 0, 1, 0), GEN_TYPE_UD); /* Handle 3-src restrictions and vectorized uniform code. */ - if (is_dest || jay_num_values(d) >= 8) { + if (is_dest || jay_num_values(d) >= 4) { R = gen_restride(R, 8, 8, 1); } @@ -203,7 +203,7 @@ to_gen_operand(jay_function *f, * but if we write a single UGPR the stride is ignored.. Specify * whatever stride is needed to satisfy the rules. */ - if (is_dest) { + if (is_dest && I->num_srcs > 0) { /* BSpec 56640 "Special Restrictions" says: * * "Conversion between HF and Integer must be DWord-aligned @@ -636,19 +636,9 @@ emit(struct jay_codegen *jc, break; case JAY_OPCODE_LANE_ID_8: - jc->state.exec_size = 8; jc_MOV(jc, dst, gen_imm_uv(0x76543210)); break; - case JAY_OPCODE_LANE_ID_EXPAND: { - unsigned width = jay_lane_id_expand_width(I); - jc->state.exec_size = width; - jc_append2(GEN_OP_ADD, - gen_element_offset(jc->devinfo, dst, width), - SRC(0), gen_imm_uw(width)); - break; - } - case JAY_OPCODE_GPR_FROM_UGPRS: jc_MOV(jc, dst, gen_byte_offset(jc->devinfo, diff --git a/src/intel/compiler/jay/jay_validate.c b/src/intel/compiler/jay/jay_validate.c index c21bcb232b4..d6ef88b2b18 100644 --- a/src/intel/compiler/jay/jay_validate.c +++ b/src/intel/compiler/jay/jay_validate.c @@ -78,9 +78,16 @@ validate_flagness(struct validate_state *validate, CHECK(type != JAY_TYPE_U1 || jay_is_flag(def) || jay_is_null(def)); } +static unsigned +adjust_width_for_type(unsigned width, enum jay_type type) +{ + return (width * jay_type_size_bits(type)) / 32; +} + static unsigned get_src_words(struct validate_state *validate, jay_inst *I, unsigned s) { + /* TODO: I think this can be simplified */ if (I->op == JAY_OPCODE_EXPAND_QUAD) { return 4; } @@ -89,19 +96,19 @@ get_src_words(struct validate_state *validate, jay_inst *I, unsigned s) return jay_ugpr_per_grf(validate->func->shader); } - bool vectorized = I->dst.file == UGPR && - jay_num_values(I->dst) > jay_type_vector_length(I->type) && - I->op != JAY_OPCODE_SEND && - jay_num_values(I->src[s]) > 1; - + unsigned simd_width = jay_simd_width_logical(validate->func->shader, I); unsigned elsize = jay_type_vector_length(jay_src_type(I, s)); - unsigned words = elsize * (vectorized ? jay_num_values(I->dst) : 1); - if (vectorized && I->src[s].file == GPR) { - CHECK(words == validate->func->shader->dispatch_width); + if (I->src[s].file == GPR && I->dst.file == UGPR) { + CHECK(jay_num_values(I->dst) == + adjust_width_for_type(simd_width, I->type) || + I->op == JAY_OPCODE_SEND); + return 1; + } else if (I->src[s].file == UGPR && jay_num_values(I->src[s]) > elsize) { + return adjust_width_for_type(simd_width, jay_src_type(I, s)); } else { - return words; + return elsize; } }