diff --git a/src/.clang-format b/src/.clang-format index 7e22bed1676..d2df8c5b55d 100644 --- a/src/.clang-format +++ b/src/.clang-format @@ -300,6 +300,52 @@ ForEachMacros: - foreach_bo - foreach_bo_safe +# intel + - jay_foreach_ssa_file + - jay_foreach_function + - jay_foreach_block + - jay_foreach_block_safe + - jay_foreach_block_rev + - jay_foreach_block_from + - jay_foreach_block_from_rev + - jay_foreach_dst + - jay_foreach_dst_index + - jay_foreach_inst_in_block + - jay_foreach_inst_in_block_rev + - jay_foreach_inst_in_block_safe + - jay_foreach_inst_in_block_safe_rev + - jay_foreach_inst_in_block_from + - jay_foreach_inst_in_block_from_rev + - jay_foreach_inst_in_shader + - jay_foreach_inst_in_shader_rev + - jay_foreach_inst_in_shader_safe + - jay_foreach_inst_in_shader_safe_rev + - jay_foreach_inst_in_func + - jay_foreach_inst_in_func_rev + - jay_foreach_inst_in_func_safe + - jay_foreach_inst_in_func_safe_rev + - jay_foreach_successor + - jay_foreach_predecessor + - jay_foreach_comp + - jay_foreach_comp_rev + - jay_foreach_src + - jay_foreach_src_rev + - jay_foreach_ssa_src + - jay_foreach_ssa_src_rev + - jay_foreach_ssa_src_comp + - jay_foreach_index + - jay_foreach_index_rev + - jay_foreach_src_index + - jay_foreach_src_index_rev + - jay_repair_foreach_phi + - jay_foreach_phi_src_in_block + - jay_foreach_phi_dst_in_block + - jay_foreach_preload + - jay_foreach_killed + - jay_foreach_ra_src + - jay_foreach_ra_file + - jay_foreach_pipe + # Disable clang formatting by default. Drivers that use clang-format # inherit from this .clang-format file and re-enable formatting: # diff --git a/src/intel/compiler/jay/.clang-format b/src/intel/compiler/jay/.clang-format new file mode 100644 index 00000000000..04cf17f20bb --- /dev/null +++ b/src/intel/compiler/jay/.clang-format @@ -0,0 +1,31 @@ +BasedOnStyle: InheritParentConfig +DisableFormat: false + +AlignConsecutiveBitFields: Consecutive +BitFieldColonSpacing: None + +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: + Enabled: true + AcrossComments: true +AlignArrayOfStructures: Left + +ColumnLimit: 80 + +BreakStringLiterals: false +SpaceBeforeParens: ControlStatementsExceptControlMacros +SpaceAfterCStyleCast: true +BinPackParameters: OnePerLine +AllowAllArgumentsOnNextLine: false +PenaltyBreakBeforeFirstCallParameter: 100 +ReferenceAlignment: Middle + +BreakBeforeBinaryOperators: None +PenaltyBreakAssignment: 0 + +SpacesInContainerLiterals: true +Cpp11BracedListStyle: false + +AlignOperands: Align +BreakBinaryOperations: RespectPrecedence +BreakBeforeTernaryOperators: false diff --git a/src/intel/compiler/jay/README.md b/src/intel/compiler/jay/README.md new file mode 100644 index 00000000000..8ac3ed0897b --- /dev/null +++ b/src/intel/compiler/jay/README.md @@ -0,0 +1,3 @@ +Xe2 compiler experiments. 
+ +**Work-in-progress, not ready for users/benchmarks.** diff --git a/src/intel/compiler/jay/jay.h b/src/intel/compiler/jay/jay.h new file mode 100644 index 00000000000..914c0d8ea71 --- /dev/null +++ b/src/intel/compiler/jay/jay.h @@ -0,0 +1,25 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "compiler/brw/brw_compiler.h" +#include "util/shader_stats.h" +#include "nir.h" + +struct intel_device_info; +struct nir_shader_compiler_options; + +struct jay_shader_bin { + const uint32_t *kernel; + uint32_t size; + struct genisa_stats stats; +}; + +struct jay_shader_bin *jay_compile(const struct intel_device_info *devinfo, + void *mem_ctx, + nir_shader *nir, + union brw_any_prog_data *prog_data, + union brw_any_prog_key *key); diff --git a/src/intel/compiler/jay/jay_assign_flags.c b/src/intel/compiler/jay/jay_assign_flags.c new file mode 100644 index 00000000000..5442eb154a1 --- /dev/null +++ b/src/intel/compiler/jay/jay_assign_flags.c @@ -0,0 +1,365 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* + * Instruction selection works on SSA FLAG and UFLAG variables. This pass + * implements a flag register allocator, assigning each FLAG/UFLAG either to a + * hardware flag register and/or spilling to a GPR/UGPR. + * + * As a simplification, hardware flags are block-local. At block boundaries, + * 32-bit 0/~0 (U)GPRs are our canonical representation for (U)FLAGs. + * + * Producers: CMP produce both 0/~0 GPRs and flags, while conditional modifiers + * produce only flags. Boolean arithmetic is lowered to GPRs. + * + * Consumers: SEL/CSEL consumes both GPRs and flags, while predication consumes + * only flags. Boolean arithmetic again requires GPRs. + * + * Our strategy is to turn flags into GPR representations globally while keeping + * copies in flags where it makes sense locally. + */ + +static inline jay_def +canonicalize_flag(jay_def x) +{ + assert(jay_is_flag(x)); + x.file = x.file == UFLAG ? UGPR : GPR; + return x; +} + +struct var_info { + unsigned flag :3; + bool uniform :1; + bool read_by_predication:1; + bool free_canonical :1; + unsigned pad :2; +} PACKED; +static_assert(sizeof(struct var_info) == 1); + +struct flag_ra { + jay_builder *b; + struct var_info *vars; + uint32_t flag_to_global[JAY_MAX_FLAGS]; + uint32_t flag_to_local[JAY_MAX_FLAGS]; + unsigned roundrobin; + unsigned ballots:JAY_MAX_FLAGS; +}; + +static jay_def +assign_flag(struct flag_ra *ra, + jay_def flag, + enum jay_file file, + bool free_canonical, + bool ballot) +{ + jay_def canonical = canonicalize_flag(flag); + jay_def tmp = jay_alloc_def(ra->b, file, 1); + + /* Dedicate a flag for ballot since uniform access would clobber the zeroing. + * TODO: We could optimize this with more tracking. + */ + unsigned num_flags = jay_num_regs(ra->b->shader, FLAG); + tmp.reg = ballot ? 
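+              /* Illustrative numbering, assuming four flag registers:
+               * ballots pin flag 0, the round-robin below cycles through
+               * flags 1..2, and the last flag stays out of rotation
+               * (assign_block borrows it as a stand-in for a null flag).
+               */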
0 : (1 + (ra->roundrobin++) % (num_flags - 2)); + + ra->vars[jay_index(canonical)] = (struct var_info) { + .uniform = tmp.file == UFLAG, + .flag = tmp.reg, + .free_canonical = free_canonical, + }; + + ra->flag_to_global[tmp.reg] = jay_index(canonical); + ra->flag_to_local[tmp.reg] = jay_index(tmp); + + if (ballot) { + ra->ballots |= BITFIELD_BIT(tmp.reg); + } + + return tmp; +} + +static bool +rewrite_sel_with_zero(jay_inst *I, unsigned zero) +{ + jay_def flag = I->src[2]; + unsigned other = 1 - zero; + + if (!jay_defs_equivalent(I->src[zero], jay_imm(0)) || + I->src[other].abs || + I->src[other].negate || + jay_type_size_bits(I->type) != 32) { + return false; + } + + if (jay_defs_equivalent(I->src[other], jay_imm(0xffffffff)) && zero == 1) { + /* (c ? 0xffffffff : 0) -> canonical(c) */ + I->op = JAY_OPCODE_MOV; + I->src[0] = canonicalize_flag(flag); + jay_shrink_sources(I, 1); + } else { + /* ([!]c ? a : 0) --> (a & [~]canonical(c)) and + * ([!]c ? 0 : a) --> (a & ~[~]canonical(c)) + */ + I->op = JAY_OPCODE_AND; + I->src[0] = I->src[other]; + I->src[1] = canonicalize_flag(flag); + I->src[1].negate ^= (zero == 0); + jay_shrink_sources(I, 2); + } + + return true; +} + +static bool +rewrite_sel_to_csel(jay_inst *I) +{ + if (jay_type_size_bits(I->type) != 32) { + return false; + } + + /* SEL.f32 lowers to CSEL.f32 to preserve source modifiers & float controls. + * That works since we reinterpret 0/~0 as 0.0/NaN. + */ + jay_def flag = I->src[2]; + I->op = JAY_OPCODE_CSEL; + I->conditional_mod = flag.negate ? JAY_CONDITIONAL_EQ : JAY_CONDITIONAL_NE; + I->src[2] = canonicalize_flag(flag); + I->src[2].negate = false; + return true; +} + +static bool +rewrite_without_flag(struct flag_ra *ra, jay_inst *I, unsigned s, bool in_flag) +{ + if (I->op == JAY_OPCODE_PHI_SRC) { + I->src[s] = canonicalize_flag(I->src[s]); + return true; + } + + if (jay_debug & JAY_DBG_NOOPT) { + return false; + } + + if (I->op == JAY_OPCODE_SEL && + (!in_flag || ra->vars[jay_index(I->src[s])].free_canonical) && + !I->predication) { + + return rewrite_sel_with_zero(I, 0) || + rewrite_sel_with_zero(I, 1) || + (!in_flag && rewrite_sel_to_csel(I)); + } + + return false; +} + +static void +assign_block(jay_function *func, jay_block *block, struct var_info *var_to_flag) +{ + jay_builder b = { .shader = func->shader, .func = func }; + struct flag_ra ra_ = { .b = &b, .vars = var_to_flag }, *ra = &ra_; + + jay_foreach_inst_in_block_safe(block, I) { + if (I->op == JAY_OPCODE_CAST_CANONICAL_TO_FLAG) { + /* Assume the source is already 0/~0 canonical and use it. */ + I->op = JAY_OPCODE_MOV; + I->type = JAY_TYPE_U32; + I->dst = canonicalize_flag(I->dst); + continue; + } else if (I->type == JAY_TYPE_U1) { + /* Boolean logic turns into bitwise logic on the canonical form */ + if (!jay_is_null(I->dst)) { + I->dst = canonicalize_flag(I->dst); + } + + jay_foreach_src(I, s) { + if (!(s == 2 && I->op == JAY_OPCODE_SEL) && + jay_src_type(I, s) == JAY_TYPE_U1) { + if (jay_is_imm(I->src[s])) { + /* Convert 1-bit boolean to 0/~0 */ + assert(jay_is_imm(I->src[s]) && jay_as_uint(I->src[s]) <= 1); + I->src[s] = jay_imm(jay_as_uint(I->src[s]) ? ~0 : 0); + } else { + I->src[s] = canonicalize_flag(I->src[s]); + } + } + } + + I->type = JAY_TYPE_U32; + } + + /* Handle flag sources */ + jay_foreach_src(I, s) { + if (!jay_is_flag(I->src[s])) { + continue; + } + + unsigned index = jay_index(I->src[s]); + bool ballot = jay_src_type(I, s) != JAY_TYPE_U1; + enum jay_file file = I->dst.file == UGPR && !ballot ? 
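+                                 /* Uniform consumers can read a scalar
+                                  * UFLAG; ballots read the whole lane mask,
+                                  * so they need a per-lane FLAG even when
+                                  * the destination is uniform.
+                                  */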
UFLAG : FLAG; + bool in_flag = ra->flag_to_global[var_to_flag[index].flag] == index && + ((file == UFLAG) == var_to_flag[index].uniform); + + /* If we don't actually need the flag, we're done. */ + if (rewrite_without_flag(ra, I, s, in_flag)) { + continue; + } + + /* Otherwise, ensure we have the value in a flag. */ + if (!in_flag) { + jay_def tmp = assign_flag(ra, I->src[s], file, false, ballot); + + /* XXX: We need a more systematic approach to modifiers :/ */ + b.cursor = jay_before_inst(I); + jay_def d = I->src[s]; + d.negate = false; + jay_CMP(&b, JAY_TYPE_U32, JAY_CONDITIONAL_NE, tmp, + canonicalize_flag(d), 0); + } + + /* ...and rewrite to use the flag */ + unsigned reg = var_to_flag[index].flag; + jay_def flag = jay_scalar(file, ra->flag_to_local[reg]); + flag.reg = reg; + jay_replace_src(&I->src[s], flag); + } + + /* Handle flag writes */ + b.cursor = jay_after_inst(I); + + /* If the flag is written directly (for an inverse ballot), recover the + * canonical representation with a SEL. + */ + if (!jay_is_null(I->dst) && jay_is_flag(I->dst)) { + jay_def canonical = canonicalize_flag(I->dst); + I->dst = assign_flag(ra, I->dst, I->dst.file, false, false); + jay_SEL(&b, JAY_TYPE_U32, canonical, ~0, 0, I->dst); + } + + if (!jay_is_null(I->cond_flag)) { + I->broadcast_flag = + var_to_flag[jay_index(I->cond_flag)].read_by_predication && + I->cond_flag.file == UFLAG && + I->op == JAY_OPCODE_CMP; + + jay_def canonical = canonicalize_flag(I->cond_flag); + I->cond_flag = + assign_flag(ra, I->cond_flag, + I->broadcast_flag ? FLAG : I->cond_flag.file, + I->op == JAY_OPCODE_CMP, false); + + if (I->op == JAY_OPCODE_CMP) { + assert(jay_is_null(I->dst)); + + if (I->broadcast_flag) { + /* We need to recover the UGPR from the replicated FLAG. Thanks + * to our write-masking and broadcasting, the flag is already + * 0/~0. We simply need to sign-extend. + */ + jay_i2i32(&b, canonical, b.shader->dispatch_width, I->cond_flag); + } else if (jay_type_size_bits(I->type) != 32) { + I->dst = jay_alloc_def(&b, canonical.file, + jay_type_vector_length(I->type)); + jay_i2i32(&b, canonical, jay_type_size_bits(I->type), I->dst); + } else { + /* 32-bit CMP returns the canonical form */ + I->dst = canonical; + } + } else { + assert(jay_type_size_bits(I->type) == 32 && "limited cmod prop"); + + if (jay_is_null(I->dst)) { + I->dst = jay_alloc_def(&b, canonical.file, + jay_type_vector_length(I->type)); + } + + /* Recover the canonical representation with a CMP. Hopefully, + * either the CMP or the cmod will be eliminated by a later DCE. 
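+          *
+          * e.g. (illustrative): for an "add.lt dst, x, y", we emit
+          * "cmp.lt canonical, dst, 0" here and let DCE keep whichever of
+          * the two forms is actually consumed.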
+ */ + jay_CMP(&b, I->type, I->conditional_mod, canonical, I->dst, 0) + ->cond_flag.reg = + jay_num_regs(b.shader, FLAG) - 1; // TODO: no null flag + } + } + } + + /* Ballots require zeroing flags */ + b.cursor = jay_before_block(block); + u_foreach_bit(i, ra->ballots) { + jay_ZERO_FLAG(&b, i); + } +} + +static void +copyprop(jay_function *f) +{ + jay_inst **defs = calloc(f->ssa_alloc, sizeof(defs[0])); + + jay_foreach_inst_in_func_safe(f, block, I) { + jay_foreach_dst_index(I, _, d) { + defs[d] = I; + } + + if (I->op == JAY_OPCODE_PHI_SRC || I->op == JAY_OPCODE_SEND) + continue; + + jay_foreach_ssa_src(I, s) { + jay_def src = I->src[s]; + if (src.collect) + continue; + + jay_inst *def = defs[jay_base_index(src)]; + if (jay_defs_equivalent(def->dst, src) && + !def->predication && + def->op == JAY_OPCODE_MOV && + (I->src[s].file == def->src[0].file || + (I->op == JAY_OPCODE_CMP && jay_is_imm(def->src[0])))) { + + jay_replace_src(&I->src[s], def->src[0]); + } + } + } + + free(defs); +} + +void +jay_assign_flags(jay_shader *s) +{ + jay_foreach_function(s, f) { + struct var_info *map = calloc(f->ssa_alloc, sizeof(map[0])); + uint32_t *def_to_block = calloc(f->ssa_alloc, sizeof(def_to_block)); + + jay_foreach_inst_in_func(f, block, I) { + if (!jay_is_null(I->cond_flag)) { + def_to_block[jay_index(I->cond_flag)] = block->index + 1; + } + + if (I->predication) { + jay_def predicate = *jay_inst_get_predicate(I); + if (def_to_block[jay_index(predicate)] == block->index + 1) { + map[jay_index(predicate)].read_by_predication = true; + } + } + } + + jay_foreach_block(f, b) { + assign_block(f, b, map); + } + + free(map); + free(def_to_block); + + /* Flag RA leaves moves. Clean up after ourselves. */ + copyprop(f); + } +} +/* TODO: revisit + * dEQP-GLES3.functional.shaders.arrays.compare.equal_highp_vec4_highp_vec4_vertex + */ diff --git a/src/intel/compiler/jay/jay_builder.h b/src/intel/compiler/jay/jay_builder.h new file mode 100644 index 00000000000..a65b826e9f2 --- /dev/null +++ b/src/intel/compiler/jay/jay_builder.h @@ -0,0 +1,643 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "compiler/brw/brw_eu.h" +#include "compiler/brw/brw_eu_defines.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "jay_ir.h" +#include "jay_opcodes.h" + +/* Like in NIR, for use with the builder */ +enum jay_cursor_option { + jay_cursor_after_block, + jay_cursor_before_inst, + jay_cursor_after_inst +}; + +typedef struct PACKED { + union { + jay_block *block; + jay_inst *inst; + }; + + enum jay_cursor_option option; +} jay_cursor; + +static inline bool +jay_cursors_equal(jay_cursor a, jay_cursor b) +{ + return !memcmp(&a, &b, sizeof(a)); +} + +static inline jay_cursor +jay_after_block(jay_block *block) +{ + return (jay_cursor) { .block = block, .option = jay_cursor_after_block }; +} + +static inline jay_cursor +jay_before_inst(jay_inst *I) +{ + return (jay_cursor) { .inst = I, .option = jay_cursor_before_inst }; +} + +static inline jay_cursor +jay_after_inst(jay_inst *I) +{ + return (jay_cursor) { .inst = I, .option = jay_cursor_after_inst }; +} + +static inline jay_cursor +jay_before_block(jay_block *block) +{ + jay_foreach_inst_in_block(block, I) { + if (I->op != JAY_OPCODE_PHI_DST && + I->op != JAY_OPCODE_PRELOAD && + I->op != JAY_OPCODE_ELSE) + return jay_before_inst(I); + } + + /* Whole block is phis, so insert at the end */ + return jay_after_block(block); +} + +static inline jay_cursor +jay_after_block_logical(jay_block *block) +{ + 
+   jay_foreach_inst_in_block_rev(block, I) {
+      if (I->op != JAY_OPCODE_PHI_SRC && !jay_op_is_control_flow(I->op))
+         return jay_after_inst(I);
+   }
+
+   /* Whole block is phis, so insert at the start */
+   return jay_before_block(block);
+}
+
+static inline jay_cursor
+jay_before_jump(jay_block *block)
+{
+   jay_inst *jump = jay_block_ending_jump(block);
+   return jump ? jay_before_inst(jump) : jay_after_block(block);
+}
+
+/* Get a cursor at the start of a function, after any preloads */
+static inline jay_cursor
+jay_before_function(jay_function *f)
+{
+   jay_block *block = jay_first_block(f);
+
+   jay_foreach_inst_in_block(block, I) {
+      if (I->op != JAY_OPCODE_PRELOAD)
+         return jay_before_inst(I);
+   }
+
+   /* The whole block is preloads, so insert at the end */
+   return jay_after_block(block);
+}
+
+/*
+ * Map a control flow edge to a block. If the predecessor has a single
+ * successor, the edge is the predecessor's only exit, so map to the
+ * predecessor. Otherwise, map to the successor, which must not have other
+ * predecessors since there are no critical edges.
+ */
+static inline jay_block *
+jay_edge_to_block(jay_block *pred, jay_block *succ)
+{
+   assert(jay_num_successors(pred) == 1 || jay_num_predecessors(succ) == 1);
+   return jay_num_successors(pred) == 1 ? pred : succ;
+}
+
+/*
+ * Get a cursor to insert along a control flow edge: either at the start of
+ * the successor or the end of the predecessor. This relies on the control
+ * flow graph having no critical edges.
+ */
+static inline jay_cursor
+jay_along_edge(jay_block *pred, jay_block *succ)
+{
+   jay_block *to = jay_edge_to_block(pred, succ);
+
+   if (to == pred)
+      return jay_after_block_logical(pred);
+   else
+      return jay_before_block(succ);
+}
+
+typedef struct {
+   jay_shader *shader;
+   jay_function *func;
+   jay_cursor cursor;
+} jay_builder;
+
+static inline jay_builder
+jay_init_builder(jay_function *f, jay_cursor cursor)
+{
+   return (jay_builder) { .shader = f->shader, .func = f, .cursor = cursor };
+}
+
+static inline void
+jay_builder_insert(jay_builder *b, jay_inst *I)
+{
+   jay_cursor *cursor = &b->cursor;
+
+   if (cursor->option == jay_cursor_after_inst) {
+      list_add(&I->link, &cursor->inst->link);
+   } else if (cursor->option == jay_cursor_after_block) {
+      list_addtail(&I->link, &cursor->block->instructions);
+   } else {
+      assert(cursor->option == jay_cursor_before_inst);
+      list_addtail(&I->link, &cursor->inst->link);
+   }
+
+   cursor->option = jay_cursor_after_inst;
+   cursor->inst = I;
+}
+
+static inline jay_def
+jay_alloc_def(jay_builder *b, enum jay_file file, unsigned size)
+{
+   unsigned idx = b->func->ssa_alloc;
+   b->func->ssa_alloc += size;
+   return jay_contiguous_def(file, idx, size);
+}
+
+/*
+ * Collect SSA indices into a source. If the indices are not contiguous, this
+ * uses a heap-allocated collect. Otherwise, a contiguous def is used.
+ */
+static inline jay_def
+jay_collect(jay_builder *b,
+            enum jay_file file,
+            const uint32_t *indices,
+            unsigned nr)
+{
+   if (nr == 0)
+      return jay_null();
+
+   for (unsigned i = 1; i < nr; ++i) {
+      if (indices[i] != (indices[0] + i)) {
+         static_assert(sizeof(uintptr_t) <= sizeof(uint64_t) &&
+                       "sorry, no Morello support");
+         void *dup =
+            linear_memdup(b->shader->lin_ctx, indices, sizeof(uint32_t) * nr);
+         uint64_t payload = (uintptr_t) dup;
+
+         /* We require pointers to fit within (32+JAY_REG_BITS) bits. Luckily
+          * this will always be the case on common architectures.
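+          *
+          * e.g. (illustrative): with 48-bit user-space addresses on LP64,
+          * the low 32 bits land in _payload and the remaining high bits fit
+          * in .reg.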
+ */ + assert(payload < (1ull << (32 + JAY_REG_BITS))); + + return (jay_def) { + ._payload = (uint32_t) payload, + .reg = (uint32_t) (payload >> 32), + .file = file, + .num_values_m1 = nr - 1, + .collect = true, + }; + } + } + + return jay_contiguous_def(file, indices[0], nr); +} + +/* + * Set the n'th channel of a def to index. This requires a copy-on-write. + * + * This implementation could likely be optimized. + */ +static inline void +jay_insert_channel(jay_builder *b, jay_def *d, unsigned c, jay_def scalar) +{ + uint32_t indices[JAY_MAX_DEF_LENGTH]; + uint32_t count = jay_num_values(*d); + + assert(scalar.file == d->file && !scalar.negate && !scalar.abs); + assert(c < count && count <= ARRAY_SIZE(indices)); + + /* First, decompress the def. */ + jay_foreach_comp(*d, i) { + indices[i] = jay_channel(*d, i); + } + + /* Next, update the indices in place */ + indices[c] = jay_index(scalar); + + /* Now collect it back. */ + jay_replace_src(d, jay_collect(b, d->file, indices, count)); +} + +/* + * Concatenate a list of vectors, collecting all the indices in order. + */ +static inline jay_def +jay_collect_vectors(jay_builder *b, jay_def *vecs, uint32_t nr) +{ + uint32_t indices[JAY_MAX_DEF_LENGTH]; + uint32_t nr_indices = 0; + + for (unsigned i = 0; i < nr; ++i) { + assert(vecs[i].file == vecs[0].file && jay_is_ssa(vecs[i])); + assert(!vecs[i].negate && !vecs[i].abs); + + jay_foreach_comp(vecs[i], c) { + assert(nr_indices < ARRAY_SIZE(indices)); + indices[nr_indices++] = jay_channel(vecs[i], c); + } + } + + return jay_collect(b, vecs[0].file, indices, nr_indices); +} + +static inline jay_def +jay_collect_two(jay_builder *b, jay_def u, jay_def v) +{ + jay_def vecs[] = { u, v }; + return jay_collect_vectors(b, vecs, 2); +} + +static inline jay_inst * +jay_alloc_inst(jay_builder *b, + enum jay_opcode op, + uint8_t num_srcs, + unsigned extra_bytes) +{ + const size_t size = + offsetof(jay_inst, src) + num_srcs * sizeof(jay_def) + extra_bytes; + + jay_inst *I = (jay_inst *) linear_zalloc_child(b->shader->lin_ctx, size); + I->op = op; + I->num_srcs = num_srcs; + I->dst = jay_null(); + I->cond_flag = jay_null(); + + return I; +} + +static inline void +jay_shrink_sources(jay_inst *I, uint8_t new_num_srcs) +{ + assert(new_num_srcs < I->num_srcs); + unsigned info_size = jay_inst_info_size(I); + + memmove(&I->src[new_num_srcs], &I->src[I->num_srcs], info_size); + I->num_srcs = new_num_srcs; +} + +static inline jay_inst * +jay_clone_inst(jay_builder *b, jay_inst *I, uint8_t new_num_srcs) +{ + assert(new_num_srcs >= I->num_srcs); + unsigned info_size = jay_inst_info_size(I); + + jay_inst *clone = jay_alloc_inst(b, I->op, new_num_srcs, info_size); + + memcpy((uint8_t *) clone + sizeof(struct list_head), + (uint8_t *) I + sizeof(struct list_head), + sizeof(jay_inst) - sizeof(struct list_head)); + + clone->num_srcs = new_num_srcs; + + memcpy(clone->src, I->src, I->num_srcs * sizeof(jay_def)); + memcpy(&clone->src[new_num_srcs], &I->src[I->num_srcs], info_size); + return clone; +} + +static inline jay_inst * +jay_grow_sources(jay_builder *b, jay_inst *I, uint8_t new_num_srcs) +{ + jay_inst *clone = jay_clone_inst(b, I, new_num_srcs); + + if ((b->cursor.option == jay_cursor_before_inst || + b->cursor.option == jay_cursor_after_inst) && + b->cursor.inst == I) { + + b->cursor.inst = clone; + } + + jay_builder b_ = jay_init_builder(b->func, jay_before_inst(I)); + jay_builder_insert(&b_, clone); + jay_remove_instruction(I); + return clone; +} + +static inline jay_inst * +jay_add_predicate_else(jay_builder *b, + 
jay_inst *I, + jay_def predicate, + jay_def default_value) +{ + assert(!I->predication && "pre-condition"); + assert(jay_is_flag(predicate) && jay_is_ssa(default_value)); + + unsigned pred_index = I->num_srcs; + I = jay_grow_sources(b, I, pred_index + 2); + I->src[pred_index] = predicate; + I->src[pred_index + 1] = default_value; + I->predication = JAY_PREDICATED_DEFAULT; + return I; +} + +static inline jay_inst * +jay_add_predicate(jay_builder *b, jay_inst *I, jay_def predicate) +{ + assert(!I->predication && "pre-condition"); + assert(jay_is_flag(predicate)); + + unsigned pred_index = I->num_srcs; + I = jay_grow_sources(b, I, pred_index + 1); + I->src[pred_index] = predicate; + I->predication = JAY_PREDICATED; + return I; +} + +static inline jay_inst * +jay_set_cond_flag(jay_builder *b, jay_inst *I, jay_def cond_flag) +{ + assert(jay_is_flag(cond_flag) && jay_is_null(I->cond_flag)); + + I->cond_flag = cond_flag; + return I; +} + +static inline jay_inst * +jay_set_conditional_mod(jay_builder *b, + jay_inst *I, + jay_def cond_flag, + enum jay_conditional_mod cmod) +{ + I->conditional_mod = cmod; + return jay_set_cond_flag(b, I, cond_flag); +} + +static inline jay_def +jay_identity_def(jay_def x) +{ + return x; +} + +#ifdef __cplusplus +static inline jay_def +JAY_BUILD_SRC(jay_def x) +{ + return x; +} +static inline jay_def +JAY_BUILD_SRC(uint32_t x) +{ + return jay_imm(x); +} +#else +#define JAY_BUILD_SRC(X) \ + _Generic((X), \ + jay_def: jay_identity_def, \ + uint32_t: jay_imm, \ + int32_t: jay_imm, \ + uint8_t: jay_imm)(X) +#endif + +/* Include generated builder helpers */ +#include "jay_builder_opcodes.h" + +static inline jay_inst * +_jay_CMP(jay_builder *b, + enum jay_type src_type, + enum jay_conditional_mod cmod, + jay_def dst, + jay_def src0, + jay_def src1) +{ + jay_inst *I = jay_alloc_inst(b, JAY_OPCODE_CMP, 2, 0); + I->type = src_type; + I->src[0] = src0; + I->src[1] = src1; + + /* Even if we want to write a 32-bit 0/~0 result, we still need to + * register-allocate a flag, since the hardware will implicitly clobber one + * regardless. + */ + if (!jay_is_flag(dst)) { + I->dst = dst; + dst = jay_alloc_def(b, dst.file == UGPR ? UFLAG : FLAG, 1); + } + + jay_set_conditional_mod(b, I, dst, cmod); + jay_builder_insert(b, I); + return I; +} + +#define jay_CMP(b, st, cmod, dst, src0, src1) \ + _jay_CMP(b, st, cmod, dst, JAY_BUILD_SRC(src0), JAY_BUILD_SRC(src1)) + +struct jayb_send_params { + enum brw_sfid sfid; + uint64_t msg_desc; + jay_def dst; + jay_def header; + jay_def *srcs; + jay_def desc, ex_desc; + enum jay_type type; + enum jay_type src_type[2]; + unsigned nr_srcs; + uint32_t ex_desc_imm; + bool eot; + bool check_tdr; + bool uniform; + bool bindless; +}; + +static inline jay_inst * +_jay_SEND(jay_builder *b, const struct jayb_send_params p) +{ + const struct intel_device_info *devinfo = b->shader->devinfo; + jay_inst *I = jay_alloc_inst(b, JAY_OPCODE_SEND, 4, sizeof(jay_send_info)); + jay_send_info *info = jay_get_send_info(I); + bool has_header = !jay_is_null(p.header); + + I->dst = p.dst; + I->type = p.type; + + assert(I->type); + info->type_0 = p.src_type[0] ? p.src_type[0] : I->type; + info->type_1 = p.src_type[1] ? p.src_type[1] : info->type_0; + + if (has_header) { + assert(p.nr_srcs == 1 || info->type_0 == info->type_1); + + /* If there is a message header, split the send into
+       * header and payload parts, since the header is UGPR but the payload
+       * is GPR.
+       */
+      I->src[2] = p.header;
+      I->src[3] = jay_collect_vectors(b, &p.srcs[0], p.nr_srcs);
+      info->type_1 = info->type_0;
+      info->type_0 = JAY_TYPE_U32 /* header type */;
+   } else if (jay_type_size_bits(info->type_0) == 16 &&
+              !p.uniform &&
+              b->shader->dispatch_width == 32) {
+      /* Pack 16-bit vectors to match the hardware's data model.
+       *
+       * XXX: This is a hack. Move to NIR for better
+       * codegen in tests like
+       * dEQP-GLES31.functional.texture.multisample.samples_4.use_texture_int_2d_array.
+       */
+      assert(info->type_0 == info->type_1);
+      jay_def srcs[8];
+      unsigned n = 0, i;
+      for (i = 0; i + 2 <= p.nr_srcs; i += 2) {
+         assert(p.srcs[i].file == p.srcs[i + 1].file);
+         assert(jay_num_values(p.srcs[i]) == jay_num_values(p.srcs[i + 1]));
+
+         for (unsigned c = 1; c < jay_num_values(p.srcs[i]); ++c) {
+            assert(jay_channel(p.srcs[i], c) == 0);
+            assert(jay_channel(p.srcs[i + 1], c) == 0);
+         }
+
+         jay_def lo = jay_extract(p.srcs[i], 0),
+                 hi = jay_extract(p.srcs[i + 1], 0);
+         jay_def bfi = jay_BFI2_u32(b, 0xffff0000, hi, lo);
+
+         if (p.srcs[i].file == UGPR) {
+            uint32_t defs[16] = { jay_index(bfi) };
+            srcs[n++] = jay_collect(b, UGPR, defs, jay_ugpr_per_grf(b->shader));
+         } else {
+            srcs[n++] = bfi;
+         }
+      }
+      if (i < p.nr_srcs) {
+         srcs[n++] = p.srcs[i++];
+      }
+      assert(i == p.nr_srcs);
+
+      I->src[2] = jay_collect_vectors(b, srcs, n);
+      I->src[3] = jay_null();
+   } else if (p.nr_srcs <= 2) {
+      /* Easy case: keep everything scalar */
+      I->src[2] = p.nr_srcs > 0 ? p.srcs[0] : jay_null();
+      I->src[3] = p.nr_srcs > 1 ? p.srcs[1] : jay_null();
+   } else {
+      /* Otherwise, we need to pick a point to split at.
+       *
+       * Heuristic: don't split render target writes because RA gets confused
+       * with the EOT requirements. Split everything else in half.
+       *
+       * TODO: Come up with a better heuristic.
+       */
+      assert(info->type_0 == info->type_1);
+      unsigned split = !p.check_tdr ? DIV_ROUND_UP(p.nr_srcs, 2) : p.nr_srcs;
+      I->src[2] = jay_collect_vectors(b, &p.srcs[0], split);
+      I->src[3] = jay_collect_vectors(b, &p.srcs[split], p.nr_srcs - split);
+   }
+
+   /* For message headers we pack a UGPR vector as a single GRF */
+   unsigned lens[3];
+   for (unsigned i = 0; i < 3; ++i) {
+      jay_def x = i == 0 ? I->dst : I->src[1 + i];
+      lens[i] = jay_num_values(x);
+
+      /* XXX: For the non-transpose uniform case, do we need to pad out
+       * with undefs for correctness so we don't fall off the side of the
+       * regfile? for sends like:
+       *
+       *    (1&W) mov.u32 u10.0, u0.8 | A@1
+       *    (1&W) mov.u32 u10.1, u0.9 | A@1
+       *    (1&W) send.u32 u12, g10, _, 0x04403580, 0x00000000
+       *          ugm MsgDesc: ( load, a64, d32, V4, L1STATE_L3MOCS dst_len =
+       *          4, src0_len = 2, src1_len = 0 flat ) base_offset 0 | A@1 $0
+       *
+       * We don't care what's in g11, but it has to *exist*. But that is
+       * probably implicitly correct as long as the reg file ends with GRFs.
+       * Which it has to.
+       */
+      if (x.file == UGPR) {
+         lens[i] = DIV_ROUND_UP(lens[i], jay_ugpr_per_grf(b->shader));
+      } else {
+         lens[i] *= jay_grf_per_gpr(b->shader);
+      }
+
+      lens[i] *= reg_unit(devinfo);
+   }
+
+   info->sfid = p.sfid;
+   info->eot = p.eot;
+   info->check_tdr = p.check_tdr;
+   info->uniform = p.uniform;
+   info->bindless = p.bindless;
+   info->ex_desc_imm = p.ex_desc_imm;
+   info->ex_mlen = lens[2];
+   I->src[0] = jay_imm(((uint32_t) p.msg_desc) |
+                       brw_message_desc(devinfo, lens[1], lens[0], has_header));
+
+   if (!jay_is_null(p.desc)) {
+      jay_def a = jay_alloc_def(b, J_ADDRESS, 1);
+      jay_OR(b, JAY_TYPE_U32, a, p.desc, I->src[0]);
+      I->src[0] = a;
+   }
+
+   if (jay_is_null(p.ex_desc)) {
+      I->src[1] =
+         jay_imm(brw_message_ex_desc(devinfo, lens[2]) | (p.msg_desc >> 32));
+   } else if (p.ex_desc.file == J_ADDRESS) {
+      I->src[1] = p.ex_desc;
+   } else {
+      I->src[1] = jay_alloc_def(b, J_ADDRESS, 1);
+      if (info->bindless) {
+         jay_MOV(b, I->src[1], p.ex_desc);
+      } else {
+         jay_OR(b, JAY_TYPE_U32, I->src[1], p.ex_desc,
+                brw_message_ex_desc(devinfo, info->ex_mlen));
+      }
+   }

+   assert(!info->uniform || jay_is_null(I->dst) || I->dst.file == UGPR);
+   jay_builder_insert(b, I);
+   return I;
+}
+
+#define jay_SEND(b, ...) _jay_SEND(b, (struct jayb_send_params) { __VA_ARGS__ })
+
+static inline void
+jay_copy_strided(jay_builder *b, jay_def dst, jay_def src, bool src_strided)
+{
+   unsigned src_stride = src_strided ? jay_ugpr_per_grf(b->shader) : 1;
+   uint32_t n = MIN2(jay_num_values(dst), jay_num_values(src) / src_stride);
+
+   for (unsigned i = 0; i < n; ++i) {
+      jay_MOV(b, jay_extract(dst, i), jay_extract(src, i * src_stride));
+   }
+}
+
+static inline void
+jay_copy(jay_builder *b, jay_def dst, jay_def src)
+{
+   jay_copy_strided(b, dst, src, false);
+}
+
+static inline jay_def
+jay_as_gpr(jay_builder *b, jay_def src)
+{
+   if (src.file == GPR || jay_is_null(src))
+      return src;
+
+   jay_def def = jay_alloc_def(b, GPR, jay_num_values(src));
+   jay_copy(b, def, src);
+   return def;
+}
+
+static inline void
+jay_i2i32(jay_builder *b, jay_def dst, unsigned src_bits, jay_def src)
+{
+   if (src_bits < 32) {
+      jay_CVT(b, JAY_TYPE_S32, dst, src, jay_type(JAY_TYPE_S, src_bits),
+              JAY_ROUND, 0);
+   } else if (src_bits == 32) {
+      jay_MOV(b, dst, src);
+   } else {
+      assert(src.reg == 0 && ".reg not preserved in this path but that's OK");
+      jay_MOV(b, dst, jay_extract(src, 0));
+   }
+}
diff --git a/src/intel/compiler/jay/jay_builder_opcodes.h.py b/src/intel/compiler/jay/jay_builder_opcodes.h.py
new file mode 100644
index 00000000000..735a653f08e
--- /dev/null
+++ b/src/intel/compiler/jay/jay_builder_opcodes.h.py
@@ -0,0 +1,153 @@
+# Copyright 2026 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+from typing import TYPE_CHECKING
+import argparse
+import sys
+
+from mako import exceptions
+from mako.template import Template
+
+from jay_opcodes import OPCODES
+
+if TYPE_CHECKING:
+    from jay_opcodes import Opcode
+
+
+def infer_type(op: 'Opcode') -> bool:
+    return op.has_dest and (set(op.types) <= set(["u1", "u32", "u64"]) or
+                            op.name == 'mov')
+
+
+def signature(op: 'Opcode', with_dest: bool = True, with_types: bool = False,
+              mode: str = 'prototype', type_: str = 't', src: str = '{}') -> str:
+    arr = [('jay_builder *', 'b')]
+
+    if with_types and len(op.types) > 1 and not infer_type(op):
+        arr += [('enum jay_type', type_)]
+
+    if with_dest and op.has_dest:
+        arr += [('jay_def', 'dst')]
+
+    arr += [('jay_def', src.format(f'src{i}')) for i in range(op.num_srcs)]
+    arr += [x for x in op.extra_struct if not x[1].startswith('pad')]
+
+    return ', '.join([(t + ' ' if mode == 'prototype' else '') + v for t,
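+                      # e.g. (illustrative) for a two-source op with a dest:
+                      #   prototype: "jay_builder *b, jay_def dst, jay_def src0, jay_def src1"
+                      #   call:      "b, dst, src0, src1"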
v in arr]) + + +TEMPLATE = """ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ +#pragma once + +#include "jay_private.h" + +#ifndef NDEBUG +#define type_assert(op, ...) if (!(__VA_ARGS__)) { fprintf(stderr, "%s does not allow type: ", #op); jay_print_type(stderr, t); fprintf(stderr, "\\n"); } assert(__VA_ARGS__) +#else +#define type_assert(...) +#endif + +% for op in opcodes.values(): +<% + OPCODE = op.name.upper() + num_srcs = op.num_srcs + has_dest = op.has_dest + multi_type = len(op.types) > 1 + info_size = f'sizeof(jay_{op.name}_info)' if op.extra_struct else '0' + operands = ["dst"] + [f"src{i}" for i in range(num_srcs)] + if num_srcs > 0: + uniform = " && " .join([f"jay_is_uniform(src{i})" for i in range(num_srcs)]) + reg_file = f"({uniform}) ? UGPR : GPR" + else: + reg_file = "GPR" + if not op.types: + continue + # Ignore the lane index when determining the type of a shuffle + infer_operands = operands[0:-1] if op.name == "shuffle" else operands +%> +static inline jay_inst * +_jay_${OPCODE}(${signature(op, with_types = True)}) +{ +% if infer_type(op): + enum jay_type t = jay_num_values(dst) == 2 ? JAY_TYPE_U64 : + ${" && ".join([f"(jay_is_flag({x}) || jay_is_imm({x}))" for x in infer_operands])} + ? JAY_TYPE_U1 : JAY_TYPE_U32; +% elif multi_type: + type_assert(${OPCODE}, 0 +% for type in op.types: + || t == JAY_TYPE_${type.upper()} +% endfor + ); + +% else: + enum jay_type t = JAY_TYPE_${op.types[0].upper()}; + +% endif + jay_inst *inst = jay_alloc_inst(b, JAY_OPCODE_${OPCODE}, ${num_srcs}, ${info_size}); +% for _, prop in op.extra_struct: +% if not prop.startswith('pad'): + jay_set_${op.name}_${prop}(inst, ${prop}); +% endif +% endfor + + inst->type = t; +% if op.has_dest: + inst->dst = dst; +% endif +% for i in range(num_srcs): + inst->src[${i}] = src${i}; +% endfor + + jay_builder_insert(b, inst); + return inst; +} + +#define jay_${OPCODE}(${signature(op, with_types = True, mode = 'call')}) _jay_${OPCODE}(${signature(op, with_types = True, src = 'JAY_BUILD_SRC({})', mode='call')}) + +% for type in op.types: +static inline ${'jay_def' if op.has_dest else 'void'} +_jay_${OPCODE}_${type}(${signature(op, with_dest = False)}) +{ +% if op.has_dest: + jay_def dst = jay_alloc_def(b, ${reg_file}, ${2 if '64' in type else 1}); +%endif + jay_${OPCODE}(${signature(op, with_types = True, type_ = 'JAY_TYPE_'+type.upper(), mode = 'call')}); +% if op.has_dest: + return dst; +% endif +} +#define jay_${OPCODE}_${type}(${signature(op, with_dest = False, mode = +'call')}) _jay_${OPCODE}_${type}(${signature(op, src='JAY_BUILD_SRC({})', mode = 'call', with_dest = False)}) +% endfor + +% endfor + +#undef type_assert +""" + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument('output', action='store') + args = parser.parse_args() + + ops = {op: v for (op, v) in OPCODES.items() if op not in {'cmp', 'send'}} + + try: + with open(args.output, 'w', encoding='utf-8') as f: + f.write(Template(TEMPLATE).render( + opcodes=ops, + signature=signature, + infer_type=infer_type)) + except Exception: + print(exceptions.text_error_template().render()) + return 1 + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/intel/compiler/jay/jay_extra_info.h.py b/src/intel/compiler/jay/jay_extra_info.h.py new file mode 100644 index 00000000000..cffe74fe5eb --- /dev/null +++ b/src/intel/compiler/jay/jay_extra_info.h.py @@ -0,0 +1,153 @@ +# Copyright 2026 Intel Corporation +# SPDX-License-Identifier: MIT + +import argparse +import sys + +from 
mako import exceptions +from mako.template import Template + +from jay_opcodes import OPCODES, ENUMS + +TEMPLATE = """/* Do not include directly */ +PRAGMA_DIAGNOSTIC_PUSH +PRAGMA_DIAGNOSTIC_ERROR(-Wpadded) + +% for enum, (prefix, values) in enums.items(): +% if enum.startswith('jay'): +enum PACKED ${enum} { +% for v in values: + ${prefix}_${v.upper()}, +% endfor +}; +% endif +% endfor + +% for name, op in opcodes: +typedef struct jay_${name}_info { +% for T, prop in op.extra_struct: + ${T} ${prop}; +% endfor +} jay_${name}_info; + +% for prefix, _suffix in [('const ', '_const'), ('', '')]: +static inline ${prefix} struct jay_${name}_info * +jay_get_${name}_info${_suffix}(${prefix}jay_inst *I) +{ + assert(I->op == JAY_OPCODE_${name.upper()}); + return (${prefix}struct jay_${name}_info *) &I->src[I->num_srcs]; +} + +% endfor +% for T, prop in op.extra_struct: +% if not prop.startswith('pad'): +static inline ${T} +jay_${name}_${prop}(const jay_inst *I) +{ + return jay_get_${name}_info_const(I)->${prop}; +} + +static inline void +jay_set_${name}_${prop}(jay_inst *I, ${T} value) +{ + jay_get_${name}_info(I)->${prop} = value; +} + +% endif +% endfor +% endfor + +static inline unsigned +jay_inst_info_size(jay_inst *I) +{ + switch (I->op) { +% for name, op in opcodes: + case JAY_OPCODE_${name.upper()}: return sizeof(struct jay_${name}_info); +% endfor + default: return 0; + } +} + +#ifndef __cplusplus +static inline const char * +jay_print_inst_info(FILE *fp, const jay_inst *I, const char *sep) +{ + switch (I->op) { +% for name, op in opcodes: + case JAY_OPCODE_${name.upper()}: { +% for T, prop in op.extra_struct: +% if not (prop.startswith('pad') or name == 'bfn' or T == 'enum jay_type'): +<% + value = f"jay_{name}_{prop}(I)" + spec = '0x%"PRIx64"' if T == 'uint64_t' else "%u" +%> +% if T.startswith('enum') and T[5:] in enums: +<% + bare = T[5:] + prefix, values = enums[bare] +%> + const char *${bare}_str[] = { +% for v in values: + [${prefix}_${v.upper()}] = "${v}", +% endfor + }; + assert(${value} < ARRAY_SIZE(${bare}_str)); +<% + spec = "%s" + value = f'{T[5:]}_str[{value}]' +%> +% endif +% if T == 'enum jay_rounding_mode': + if (strcmp(${value}, "round")) { + fprintf(fp, "%s%s", sep, ${value}); + sep = ", "; + } +% elif T == 'bool': + if (${value}) { + fprintf(fp, "%s${prop}", sep); + sep = ", "; + } +% elif T.startswith('enum') or len(op.extra_struct) == 1: + fprintf(fp, "%s${spec}", sep, ${value}); + sep = ", "; +% else: + if (${value}) { + fprintf(fp, "%s${prop}=${spec}", sep, ${value}); + sep = ", "; + } +% endif +% endif +% endfor + break; + } +% endfor + default: break; + } + + return sep; +} +#endif + +PRAGMA_DIAGNOSTIC_POP +""" + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument('output', action='store') + args = parser.parse_args() + + try: + with open(args.output, 'w', encoding='utf-8') as f: + f.write(Template(TEMPLATE).render( + opcodes=[(k, v) for k, v in OPCODES.items() if v.extra_struct], + enums=ENUMS)) + except Exception: + print(exceptions.text_error_template().render()) + return 1 + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c new file mode 100644 index 00000000000..de24701b7ad --- /dev/null +++ b/src/intel/compiler/jay/jay_from_nir.c @@ -0,0 +1,3838 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "compiler/brw/brw_compiler.h" +#include "compiler/brw/brw_eu.h" +#include 
"compiler/brw/brw_eu_defines.h" +#include "compiler/brw/brw_nir.h" +#include "compiler/brw/brw_private.h" +#include "compiler/brw/brw_sampler.h" +#include "compiler/intel_nir.h" +#include "compiler/intel_shader_enums.h" +#include "compiler/list.h" +#include "intel/dev/intel_debug.h" +#include "util/bitpack_helpers.h" +#include "util/bitscan.h" +#include "util/bitset.h" +#include "util/lut.h" +#include "util/macros.h" +#include "util/u_math.h" +#include "intel_device_info_gen.h" +#include "jay.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" +#include "nir.h" +#include "nir_builder.h" +#include "nir_builder_opcodes.h" +#include "nir_defines.h" +#include "nir_intrinsics.h" +#include "nir_intrinsics_indices.h" +#include "nir_opcodes.h" +#include "shader_enums.h" +#include "shader_stats.h" + +static const struct debug_named_value jay_debug_options[] = { + { "noopt", JAY_DBG_NOOPT, "Disable backend optimizer" }, + { "printdemand", JAY_DBG_PRINTDEMAND, "Print demand per instruction" }, + { "spill", JAY_DBG_SPILL, "Shrink register file to test spilling" }, + { "sync", JAY_DBG_SYNC, "Sync after every instruction" }, + DEBUG_NAMED_VALUE_END +}; + +DEBUG_GET_ONCE_FLAGS_OPTION(jay_debug, "JAY_DEBUG", jay_debug_options, 0) +int jay_debug = 0; + +typedef struct jay_vs_payload { + /* "the maximum limit is 30 elements per vertex" (bspec 56124) */ + jay_def attributes[30 * 4]; +} jay_vs_payload; + +typedef struct jay_cs_payload { + jay_def local_invocation_ids; +} jay_cs_payload; + +typedef struct jay_fs_payload { + jay_def bary[INTEL_BARYCENTRIC_MODE_COUNT]; + + struct { + jay_def xy, z, w; + } coord; + + jay_def pixel_sample_mask; + jay_def deltas[64]; +} jay_fs_payload; + +struct nir_to_jay_state { + jay_shader *s; + jay_function *f; + const nir_shader *nir; + const struct intel_device_info *devinfo; + + jay_builder bld; + + jay_block *current_block; + jay_block *after_block; + jay_block *break_block; + + unsigned indent; + + /* We cache ballot(true), ctz(ballot(true)), and 4*ctz(ballot(true)) within a + * block. If we had competent backend CSE - or emitted uniformize in NIR and + * taught NIR's CSE about ballots - we could remove this kludge. + */ + jay_def active_lane_mask, active_lane, active_lane_x4; + + /* These defs contain the extracted payload. They are only valid while + * translating NIR->Jay since they aren't maintained by Jay passes. + */ + struct { + jay_def u0, u1; + jay_def sampler_state_pointer, scratch_surface; + jay_def inline_data; + jay_def push_data[512]; + jay_def lane_id; + jay_def urb_handle; + + union { + jay_vs_payload vs; + jay_cs_payload cs; + jay_fs_payload fs; + }; + } payload; +}; + +static jay_def +payload_u1(struct nir_to_jay_state *nj, unsigned idx, unsigned len) +{ + if (jay_is_null(nj->payload.u1)) + return jay_null(); + else + return jay_extract_range(nj->payload.u1, idx, len); +} + +static jay_def +emit_active_lane_mask(struct nir_to_jay_state *nj) +{ + /* TODO: We don't use jay_exec_mask yet due to hardware issues */ + if (jay_is_null(nj->active_lane_mask)) { + nj->active_lane_mask = jay_alloc_def(&nj->bld, FLAG, 1); + jay_MOV(&nj->bld, nj->active_lane_mask, 1); + } + + return nj->active_lane_mask; +} + +static jay_def +emit_active_lane(struct nir_to_jay_state *nj) +{ + /* For this instruction to execute, some lane must be active. 
Therefore there + * is a 1 in the lower [dispatch width] bits of the lane mask, so we may + * equivalently use fbl.u32 instead of fbl.u[dispatch width]. + */ + if (jay_is_null(nj->active_lane)) { + nj->active_lane = jay_alloc_def(&nj->bld, UGPR, 1); + jay_FBL(&nj->bld, nj->active_lane, emit_active_lane_mask(nj)); + } + + return nj->active_lane; +} + +static jay_def +emit_uniformize(struct nir_to_jay_state *nj, jay_def x) +{ + jay_builder *b = &nj->bld; + if (x.file != GPR && x.file != FLAG) { + return x; + } + + if (jay_is_null(nj->active_lane_x4)) { + nj->active_lane_x4 = jay_SHL_u32(b, emit_active_lane(nj), 2); + } + + jay_def u = jay_alloc_def(b, x.file == FLAG ? UFLAG : UGPR, 1); + jay_SHUFFLE(b, u, x, nj->active_lane_x4); + return u; +} + +static jay_block *jay_emit_cf_list(struct nir_to_jay_state *nj, + struct exec_list *list); + +/** Returns true if the entire compute workgroup fits in a single subgroup. */ +static bool +jay_workgroup_is_one_subgroup(jay_builder *b, const nir_shader *nir) +{ + return mesa_shader_stage_uses_workgroup(nir->info.stage) && + !nir->info.workgroup_size_variable && + nir_static_workgroup_size(nir) <= b->shader->dispatch_width; +} + +static enum jay_type +jay_base_type_for_nir(nir_alu_type nir_type) +{ + /* clang-format off */ + switch (nir_alu_type_get_base_type(nir_type)) { + case nir_type_int: return JAY_TYPE_S; + case nir_type_uint: return JAY_TYPE_U; + case nir_type_bool: return JAY_TYPE_S; + case nir_type_float: return JAY_TYPE_F; + default: UNREACHABLE("invalid NIR type"); + } + /* clang-format on */ +} + +static enum jay_file +jay_file_for_def(const nir_def *def) +{ + return def->bit_size == 1 ? (def->divergent ? FLAG : UFLAG) : + (def->divergent ? GPR : UGPR); +} + +/** + * Returns an jay_type for the ALU op's i-th source. + * (Useful for conversions and comparisons.) + */ +static enum jay_type +jay_alu_source_type(nir_alu_instr *alu, unsigned i) +{ + return jay_type(jay_base_type_for_nir(nir_op_infos[alu->op].input_types[i]), + nir_src_bit_size(alu->src[i].src)); +} + +static inline jay_def +nj_def(nir_def *def) +{ + unsigned bits = def->num_components * MAX2(def->bit_size, 32); + unsigned words = DIV_ROUND_UP(bits, 32); + + return jay_contiguous_def(jay_file_for_def(def), def->index, words); +} + +static inline jay_def +nj_src(nir_src src) +{ + return nj_def(src.ssa); +} + +static void +jay_emit_alu(struct nir_to_jay_state *nj, nir_alu_instr *alu) +{ + jay_builder *b = &nj->bld; + jay_def dst = nj_def(&alu->def); + + nir_alu_type nir_type = nir_op_infos[alu->op].output_type; + enum jay_type base_type = jay_base_type_for_nir(nir_type); + enum jay_type type = jay_type(base_type, alu->def.bit_size); + + jay_def src[NIR_ALU_MAX_INPUTS]; + for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { + unsigned len = nir_src_bit_size(alu->src[i].src) == 64 ? 
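+         /* 64-bit values occupy two 32-bit words in the IR (see nj_def),
+          * so source swizzles are scaled by the word count.
+          */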
2 : 1; + src[i] = jay_extract_range(nj_src(alu->src[i].src), + len * alu->src[i].swizzle[0], len); + } + + switch (alu->op) { +#define CMP(op, jay) \ + case nir_op_##op: \ + jay_CMP(b, jay_alu_source_type(alu, 0), JAY_CONDITIONAL_##jay, dst, \ + src[0], src[1]); \ + break; + +#define UNOP(nir, jay_op) \ + case nir_op_##nir: \ + jay_##jay_op(b, type, dst, src[0]); \ + break; + +#define MATH(nir, jay_op) \ + case nir_op_##nir: \ + jay_MATH(b, type, dst, src[0], JAY_MATH_##jay_op); \ + break; + +#define UNOP_UNTYPED(nir, jay_op) \ + case nir_op_##nir: \ + jay_##jay_op(b, dst, src[0]); \ + break; + +#define BINOP(nir, jay_op) \ + case nir_op_##nir: \ + jay_##jay_op(b, type, dst, src[0], src[1]); \ + break; + +#define DP4A(nir, jay_op, sat_) \ + case nir_op_##nir: \ + jay_DP4A_##jay_op(b, dst, src[2], src[0], src[1])->saturate = sat_; \ + break; + + CMP(flt, LT) + CMP(ilt, LT) + CMP(ult, LT) + CMP(fge, GE) + CMP(ige, GE) + CMP(uge, GE) + CMP(feq, EQ) + CMP(ieq, EQ) + CMP(fneu, NE) + CMP(ine, NE) + + MATH(frcp, INV) + MATH(fexp2, EXP) + MATH(flog2, LOG) + MATH(fsin, SIN) + MATH(fcos, COS) + MATH(fsqrt, SQRT) + MATH(frsq, RSQ) + UNOP(ffract, FRC) + UNOP(ftrunc, RNDZ) + UNOP(ffloor, RNDD) + UNOP(fround_even, RNDE) + + UNOP_UNTYPED(mov, copy) + UNOP_UNTYPED(unpack_32_2x16_split_x, MOV) + UNOP_UNTYPED(b2b1, CAST_CANONICAL_TO_FLAG) + UNOP_UNTYPED(inot, NOT) + UNOP_UNTYPED(bitfield_reverse, BFREV) + UNOP_UNTYPED(bit_count, CBIT) + UNOP_UNTYPED(uclz, LZD) + UNOP_UNTYPED(find_lsb, FBL) + + BINOP(imin, MIN) + BINOP(umin, MIN) + BINOP(fmin, MIN) + BINOP(imax, MAX) + BINOP(umax, MAX) + BINOP(fmax, MAX) + BINOP(fadd, ADD) + BINOP(iadd, ADD) + BINOP(fmul, MUL) + BINOP(imul_32x16, MUL_32X16) + BINOP(umul_32x16, MUL_32X16) + BINOP(ishl, SHL) + BINOP(ishr, ASR) + BINOP(ushr, SHR) + BINOP(urol, ROL) + BINOP(uror, ROR) + BINOP(urhadd, AVG) + BINOP(irhadd, AVG) + BINOP(iand, AND) + BINOP(ior, OR) + BINOP(ixor, XOR) + + DP4A(sdot_4x8_iadd, SS, false) + DP4A(sdot_4x8_iadd_sat, SS, true) + DP4A(udot_4x8_uadd, UU, false) + DP4A(udot_4x8_uadd_sat, UU, true) + DP4A(sudot_4x8_iadd, SU, false) + DP4A(sudot_4x8_iadd_sat, SU, true) + +#undef CMP +#undef UNOP +#undef UNOP_UNTYPED +#undef BINOP +#undef DP4A + + case nir_op_imul: + if (jay_type_size_bits(type) == 32) { + jay_MUL_32(b, type, dst, src[0], src[1], false); + } else { + jay_MUL(b, type, dst, src[0], src[1]); + } + + break; + + case nir_op_imul_high: + case nir_op_umul_high: + jay_MUL_32(b, type, dst, src[0], src[1], true); + break; + + case nir_op_bfm: + jay_BFI1(b, dst, src[0], src[1]); + break; + + case nir_op_b2f64: + jay_SEL(b, JAY_TYPE_U32, jay_extract(dst, 1), 0x3ff00000, 0, src[0]); + jay_MOV(b, jay_extract(dst, 0), 0); + break; + + case nir_op_ufind_msb_rev: + case nir_op_ifind_msb_rev: + jay_FBH(b, jay_alu_source_type(alu, 0), dst, src[0]); + break; + + case nir_op_u2u8: + case nir_op_u2u16: + case nir_op_u2u32: + case nir_op_i2i8: + case nir_op_i2i16: + case nir_op_i2i32: + assert(nir_src_bit_size(alu->src[0].src) > 1 && + "predicate conversions are lowered"); + + if (alu->def.bit_size <= nir_src_bit_size(alu->src[0].src)) { + /* Downconversion. Upper bits garbage convention makes this a no-op. + * The extract handles 64->32 narrowing conversions. 
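+       *
+       * e.g. (illustrative): u2u16 of a u32 value becomes a plain MOV;
+       * consumers only ever look at the low 16 bits.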
+ */ + jay_MOV(b, dst, jay_extract(src[0], 0)); + break; + } + + FALLTHROUGH; + case nir_op_i2f64: + case nir_op_i2i64: + case nir_op_u2u64: + case nir_op_u2f64: + case nir_op_f2f64: + case nir_op_f2i64: + case nir_op_f2u64: + case nir_op_f2i32: + case nir_op_f2u32: + case nir_op_f2i32_sat: + case nir_op_f2u32_sat: + case nir_op_i2f32: + case nir_op_u2f32: + case nir_op_f2f32: + case nir_op_i2f16: + case nir_op_u2f16: + case nir_op_f2f16: + case nir_op_f2i16: + case nir_op_f2u16: + case nir_op_f2i8: + case nir_op_f2u8: { + enum jay_type src_type = jay_alu_source_type(alu, 0); + + /* UGPR byte to float is not supported. Do it in 2 steps. */ + if (jay_type_size_bits(src_type) == 8 && + jay_base_type(type) == JAY_TYPE_F && + dst.file == UGPR) { + + enum jay_type integer = jay_type_rebase(type, jay_base_type(src_type)); + jay_def tmp = jay_alloc_def(b, UGPR, 1); + jay_CVT(b, integer, tmp, src[0], src_type, JAY_ROUND, 0); + jay_CVT(b, type, dst, tmp, integer, JAY_ROUND, 0); + } else { + jay_CVT(b, type, dst, src[0], src_type, JAY_ROUND, 0); + } + + break; + } + + case nir_op_f2f16_rtne: + case nir_op_f2f16_rtz: + jay_CVT(b, JAY_TYPE_F16, dst, src[0], jay_alu_source_type(alu, 0), + alu->op == nir_op_f2f16_rtz ? JAY_RTZ : JAY_RNE, 0); + break; + + case nir_op_fsat: + jay_MODIFIER(b, type, dst, src[0])->saturate = true; + break; + + case nir_op_fneg: + case nir_op_ineg: + jay_MODIFIER(b, type, dst, jay_negate(src[0])); + break; + + case nir_op_fabs: + case nir_op_iabs: + jay_MODIFIER(b, type, dst, jay_abs(src[0])); + break; + + case nir_op_iadd3: + jay_ADD3(b, type, dst, src[0], src[1], src[2]); + break; + + case nir_op_uadd_sat: + case nir_op_iadd_sat: + jay_ADD(b, type, dst, src[0], src[1])->saturate = true; + break; + + case nir_op_usub_sat: + case nir_op_isub_sat: + jay_ADD(b, type, dst, src[0], jay_negate(src[1]))->saturate = true; + break; + + case nir_op_ihadd: + case nir_op_uhadd: { + /* AVG(x, y) - ((x ^ y) & 1) */ + jay_def avg = jay_alloc_def(b, dst.file, 1); + jay_def bfn = jay_alloc_def(b, dst.file, 1); + jay_AVG(b, type, avg, src[0], src[1]); + jay_BFN(b, bfn, 1, src[0], src[1], UTIL_LUT3(a & (b ^ c))); + jay_ADD(b, type, dst, avg, jay_negate(bfn)); + break; + } + + case nir_op_unpack_64_2x32_split_x: + jay_MOV(b, dst, jay_extract(src[0], 0)); + break; + case nir_op_unpack_64_2x32_split_y: + jay_MOV(b, dst, jay_extract(src[0], 1)); + break; + case nir_op_unpack_32_2x16_split_y: + jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U16, JAY_ROUND, 1); + break; + + case nir_op_pack_32_4x8_split: { + /* TODO: Optimize */ + jay_def r = jay_BFI2_u32(b, 0x0000ff00, src[1], src[0]); + r = jay_BFI2_u32(b, 0x00ff0000, src[2], r); + jay_BFI2(b, dst, 0xff000000, src[3], r); + break; + } + + case nir_op_pack_32_2x16_split: + /* TODO: Optimize */ + jay_BFI2(b, dst, 0xffff0000, src[1], src[0]); + break; + + case nir_op_pack_64_2x32_split: + jay_MOV(b, jay_extract(dst, 0), src[0]); + jay_MOV(b, jay_extract(dst, 1), src[1]); + break; + + case nir_op_bitfield_select: + assert(jay_type_size_bits(type) <= 32); + jay_BFN(b, dst, src[0], src[1], src[2], UTIL_LUT3((a & b) | (~a & c))); + break; + + case nir_op_ubfe: + case nir_op_ibfe: + jay_BFE(b, type, dst, src[0], src[1], src[2]); + break; + case nir_op_bfi: + jay_BFI2(b, dst, src[0], src[1], src[2]); + break; + + case nir_op_ffma: + jay_MAD(b, type, dst, src[0], src[1], src[2]); + break; + + case nir_op_fcsel: + jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod = + JAY_CONDITIONAL_NE; + break; + + case nir_op_fcsel_gt: + case 
nir_op_i32csel_gt: + jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod = + JAY_CONDITIONAL_GT; + break; + + case nir_op_fcsel_ge: + case nir_op_i32csel_ge: + jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod = + JAY_CONDITIONAL_GE; + break; + + case nir_op_bcsel: + assert(alu->def.bit_size < 64); + assert(jay_is_flag(src[0])); + + /* b2i8 gets lowered into 8-bit csel. Just use the upper bits garbage + * convention to implement with SEL.u16 instead. + */ + if (type == JAY_TYPE_U8) { + type = JAY_TYPE_U16; + } + + jay_SEL(b, type, dst, src[1], src[2], src[0]); + break; + + case nir_op_extract_u8: + jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U8, JAY_ROUND, + nir_alu_src_as_uint(alu->src[1])); + break; + + case nir_op_extract_i8: + jay_CVT(b, JAY_TYPE_S32, dst, src[0], JAY_TYPE_S8, JAY_ROUND, + nir_alu_src_as_uint(alu->src[1])); + break; + + case nir_op_extract_u16: + jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U16, JAY_ROUND, + nir_alu_src_as_uint(alu->src[1])); + break; + + case nir_op_extract_i16: + jay_CVT(b, JAY_TYPE_S32, dst, src[0], JAY_TYPE_S16, JAY_ROUND, + nir_alu_src_as_uint(alu->src[1])); + break; + + default: + if (nir_op_is_vec(alu->op)) { + for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { + unsigned len = jay_type_vector_length(type); + jay_copy(b, jay_extract_range(dst, len * i, len), src[i]); + } + + break; + } + + nir_print_instr(&alu->instr, stderr); + fprintf(stderr, "\n"); + UNREACHABLE("unhandled instruction"); + } +} + +static void +jay_emit_load_const(struct nir_to_jay_state *nj, nir_load_const_instr *lc) +{ + jay_builder *b = &nj->bld; + jay_def dst = nj_def(&lc->def); + assert(lc->def.num_components == 1 && "must be scalarized"); + + if (lc->def.bit_size == 64 && lc->value[0].u64 >> 32) { + jay_MOV_IMM64(b, dst, lc->value[0].u64); + } else { + jay_MOV(b, dst, lc->value[0].u32); + } +} + +static jay_def +jay_resource_handle(jay_builder *b, + nir_src *nsrc, + unsigned *bti_const, + bool *internal, + bool *bindless) +{ + if (!nsrc) { + return jay_null(); + } + + nir_intrinsic_instr *rin = nir_src_as_intrinsic(*nsrc); + + if (nir_src_is_const(*nsrc)) { + *bti_const = nir_src_as_uint(*nsrc); + return jay_null(); + } else if (!rin || rin->intrinsic != nir_intrinsic_resource_intel) { + return nj_src(*nsrc); + } + + uint32_t flags = nir_intrinsic_resource_access_intel(rin); + if (internal) { + *internal = !!(flags & nir_resource_intel_internal); + } + if (bindless) { + *bindless = !!(flags & nir_resource_intel_bindless); + } + + if (nir_src_is_const(rin->src[1])) { + *bti_const = nir_src_as_uint(rin->src[1]); + return jay_null(); + } else { + return nj_src(rin->src[1]); + } +} + +static inline enum lsc_flush_type +translate_flush_type(nir_intrinsic_instr *intr) +{ + switch (nir_intrinsic_memory_semantics(intr)) { + case NIR_MEMORY_ACQUIRE: + return LSC_FLUSH_TYPE_INVALIDATE; + case NIR_MEMORY_RELEASE: + return LSC_FLUSH_TYPE_CLEAN; + case NIR_MEMORY_ACQ_REL: + return LSC_FLUSH_TYPE_EVICT; + case NIR_MEMORY_MAKE_AVAILABLE: + case NIR_MEMORY_MAKE_VISIBLE: + default: + UNREACHABLE("unexpected memory semantic"); + } +} + +static void +emit_lsc_fence(struct nir_to_jay_state *nj, + nir_intrinsic_instr *intr, + enum brw_sfid sfid) +{ + bool device = nir_intrinsic_memory_scope(intr) >= SCOPE_QUEUE_FAMILY; + enum lsc_fence_scope scope = device ? LSC_FENCE_TILE : LSC_FENCE_THREADGROUP; + enum lsc_flush_type type = + sfid == BRW_SFID_SLM ? 
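+      /* SLM fences take no flush type; only image/buffer/URB traffic needs
+       * cache maintenance here.
+       */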
+      LSC_FLUSH_TYPE_NONE : translate_flush_type(intr);
+
+   jay_def notif = jay_alloc_def(&nj->bld, UGPR, jay_ugpr_per_grf(nj->s));
+   uint32_t desc = lsc_fence_msg_desc(nj->s->devinfo, scope, type, false);
+
+   jay_SEND(&nj->bld, .sfid = sfid, .msg_desc = desc, .srcs = &nj->payload.u0,
+            .nr_srcs = 1, .type = JAY_TYPE_U32, .uniform = true, .dst = notif);
+}
+
+static void
+jay_emit_memory_barrier(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr)
+{
+   nir_variable_mode modes = nir_intrinsic_memory_modes(intr);
+
+   jay_SYNC(&nj->bld, TGL_SYNC_ALLWR);
+
+   if (modes & nir_var_image) {
+      emit_lsc_fence(nj, intr, BRW_SFID_TGM);
+      assert(!nj->nir->info.use_lowered_image_to_global && "fix common code");
+   }
+
+   if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) {
+      emit_lsc_fence(nj, intr, BRW_SFID_UGM);
+   }
+
+   if (modes & (nir_var_shader_out | nir_var_mem_task_payload)) {
+      emit_lsc_fence(nj, intr, BRW_SFID_URB);
+   }
+
+   if ((modes & nir_var_mem_shared) &&
+       !jay_workgroup_is_one_subgroup(&nj->bld, nj->nir)) {
+      emit_lsc_fence(nj, intr, BRW_SFID_SLM);
+   }
+}
+
+static void
+jay_emit_signal_barrier(struct nir_to_jay_state *nj)
+{
+   jay_builder *b = &nj->bld;
+
+   /* Signal barrier / Active threads only (BSpec 72052).
+    *
+    * Source 0 is the number of subgroups in [31:24], which comes from the u0.2
+    * payload in [31:24]. Mask out the other bits, then replicate to [23:16].
+    *
+    * TODO: This can be done faster with a SIMD2 8-bit move.
+    */
+   jay_def a = jay_AND_u32(b, jay_extract(nj->payload.u0, 2), 0xff000000);
+   jay_def m2 = jay_OR_u32(b, a, jay_SHR_u32(b, a, 8));
+
+   /* Use an active threads only barrier. TODO: I think we can optimize. */
+   if (b->shader->devinfo->ver >= 20) {
+      m2 = jay_OR_u32(b, m2, BITFIELD_BIT(8));
+   }
+
+   uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 };
+   indices[2] = jay_index(m2);
+   jay_def zipped = jay_collect(b, UGPR, indices, 3);
+
+   jay_SEND(b, .sfid = BRW_SFID_MESSAGE_GATEWAY,
+            .msg_desc = BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG, .srcs = &zipped,
+            .nr_srcs = 1, .type = JAY_TYPE_U32, .uniform = true);
+
+   jay_SYNC(b, TGL_SYNC_BAR);
+}
+
+static void
+jay_emit_derivative(jay_builder *b,
+                    jay_def dst,
+                    nir_intrinsic_instr *intr,
+                    enum jay_quad_swizzle swz0,
+                    enum jay_quad_swizzle swz1)
+{
+   assert(intr->def.bit_size == 32 && "todo");
+   jay_def val = nj_src(intr->src[0]);
+
+   jay_ADD(b, JAY_TYPE_F32, dst, jay_QUAD_SWIZZLE_u32(b, val, swz1),
+           jay_negate(jay_QUAD_SWIZZLE_u32(b, val, swz0)));
+}
+
+static void
+jay_emit_fb_write(jay_builder *b, nir_intrinsic_instr *intr)
+{
+   jay_def data = nj_src(intr->src[0]);
+   jay_def srcs[8];
+
+   /* Optimize unconditional discards. Should probably do this in NIR. */
+   bool trivial =
+      nir_src_is_const(intr->src[2]) && nir_src_as_bool(intr->src[2]);
+
+   for (unsigned i = 0; i < nir_src_num_components(intr->src[0]); ++i) {
+      srcs[i] = trivial ? jay_INDETERMINATE_u32(b) :
+                jay_as_gpr(b, jay_extract(data, i));
+   }
+
+   jay_inst *send =
+      jay_SEND(b, .sfid = BRW_SFID_RENDER_CACHE, .check_tdr = true,
+               .msg_desc = nir_scalar_as_uint(nir_scalar_chase_movs(
+                              nir_get_scalar(intr->src[1].ssa, 0))) |
+                           (nir_scalar_as_uint(nir_scalar_chase_movs(
+                               nir_get_scalar(intr->src[1].ssa, 1)))
+                            << 32),
+               .srcs = srcs, .nr_srcs = nir_src_num_components(intr->src[0]),
+               .type = JAY_TYPE_U32, .eot = nir_intrinsic_eot(intr));
+
+   /* Handle the disable predicate. It is logically inverted.
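+    * i.e. the send only executes for lanes where src[2] is false, hence the
+    * jay_negate below.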
*/ + if (!nir_src_is_const(intr->src[2]) || nir_src_as_bool(intr->src[2])) { + jay_add_predicate(b, send, jay_negate(nj_src(intr->src[2]))); + } +} + +static enum lsc_data_size +lsc_bits_to_data_size(unsigned bit_size) +{ + /* clang-format off */ + switch (bit_size / 8) { + case 1: return LSC_DATA_SIZE_D8U32; + case 2: return LSC_DATA_SIZE_D16U32; + case 4: return LSC_DATA_SIZE_D32; + case 8: return LSC_DATA_SIZE_D64; + default: UNREACHABLE("Unsupported data size."); + } + /* clang-format on */ +} + +static enum lsc_opcode +lsc_op_for_atomic(nir_atomic_op op) +{ + /* clang-format off */ + switch (op) { + case nir_atomic_op_iadd: return LSC_OP_ATOMIC_ADD; + case nir_atomic_op_imin: return LSC_OP_ATOMIC_MIN; + case nir_atomic_op_umin: return LSC_OP_ATOMIC_UMIN; + case nir_atomic_op_imax: return LSC_OP_ATOMIC_MAX; + case nir_atomic_op_umax: return LSC_OP_ATOMIC_UMAX; + case nir_atomic_op_iand: return LSC_OP_ATOMIC_AND; + case nir_atomic_op_ior: return LSC_OP_ATOMIC_OR; + case nir_atomic_op_ixor: return LSC_OP_ATOMIC_XOR; + case nir_atomic_op_xchg: return LSC_OP_ATOMIC_STORE; + case nir_atomic_op_cmpxchg: return LSC_OP_ATOMIC_CMPXCHG; + case nir_atomic_op_fmin: return LSC_OP_ATOMIC_FMIN; + case nir_atomic_op_fmax: return LSC_OP_ATOMIC_FMAX; + case nir_atomic_op_fcmpxchg: return LSC_OP_ATOMIC_FCMPXCHG; + case nir_atomic_op_fadd: return LSC_OP_ATOMIC_FADD; + default: UNREACHABLE("Unsupported NIR atomic"); + } + /* clang-format on */ +} + +static jay_def +jay_src_as_strided(jay_builder *b, + jay_def x, + unsigned element_sz, + enum jay_file dst_file) +{ + if (dst_file == UGPR) { + assert(jay_is_uniform(x) && "Uniform dests require uniform sources"); + + if (x.file != UGPR) { + jay_def tmp = jay_alloc_def(b, UGPR, jay_num_values(x)); + jay_copy(b, tmp, x); + x = tmp; + } + + uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 }; + unsigned nr = jay_num_values(x) * jay_ugpr_per_grf(b->shader); + assert(nr < ARRAY_SIZE(indices)); + + for (unsigned i = 0; i < jay_num_values(x) / element_sz; ++i) { + for (unsigned j = 0; j < element_sz; ++j) { + indices[(i * jay_ugpr_per_grf(b->shader)) + j] = + jay_channel(x, (i * element_sz) + j); + } + } + + return jay_collect(b, UGPR, indices, nr); + } else { + /* Could be a GPR or UGPR source */ + assert(dst_file == GPR); + return jay_as_gpr(b, x); + } +} + +static jay_def +jay_scratch_surface(struct nir_to_jay_state *nj) +{ + if (jay_is_null(nj->payload.scratch_surface)) { + jay_function *func = nj->f; + assert(func->is_entrypoint && "todo: this needs ABI"); + + jay_builder b = jay_init_builder(func, jay_before_function(func)); + nj->payload.scratch_surface = jay_alloc_def(&b, J_ADDRESS, 1); + + jay_def u0_5 = jay_extract(nj->payload.u0, 5); + jay_def state = jay_AND_u32(&b, u0_5, ~BITFIELD_MASK(10)); + jay_SHR(&b, JAY_TYPE_U32, nj->payload.scratch_surface, state, 4); + } + + return nj->payload.scratch_surface; +} + +static void +jay_emit_mem_access(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) +{ + jay_builder *b = &nj->bld; + bool slm = nir_is_shared_access(intr); + bool tgm = nir_intrinsic_has_image_dim(intr); + bool urb = intr->intrinsic == nir_intrinsic_store_urb_lsc_intel || + intr->intrinsic == nir_intrinsic_store_urb_vec4_intel; + enum brw_sfid sfid = slm ? BRW_SFID_SLM : + tgm ? BRW_SFID_TGM : + urb ? 
BRW_SFID_URB : + BRW_SFID_UGM; + + nir_src *data_src = nir_get_io_data_src(intr); + bool scratch = intr->intrinsic == nir_intrinsic_load_scratch_intel || + intr->intrinsic == nir_intrinsic_store_scratch_intel; + + enum lsc_opcode op; + if (nir_intrinsic_has_atomic_op(intr)) + op = lsc_op_for_atomic(nir_intrinsic_atomic_op(intr)); + else if (sfid == BRW_SFID_TGM) + op = data_src ? LSC_OP_STORE_CMASK : LSC_OP_LOAD_CMASK; + else + op = data_src ? LSC_OP_STORE : LSC_OP_LOAD; + + nir_src *bti = nir_get_io_index_src(intr), *ubo = NULL; + nir_src *offset_src = tgm ? &intr->src[1] : nir_get_io_offset_src(intr); + + if (intr->intrinsic == nir_intrinsic_load_ubo || + intr->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel) { + ubo = bti; + bti = NULL; + b->shader->prog_data->base.has_ubo_pull = true; + } + + const struct intel_device_info *devinfo = b->shader->devinfo; + bool has_dest = nir_intrinsic_infos[intr->intrinsic].has_dest; + jay_def data = data_src ? nj_src(*data_src) : jay_null(); + unsigned bti_const = 0; + bool internal = false; + bool bindless = false; + jay_def bti_indirect = + jay_resource_handle(b, bti ?: ubo, &bti_const, &internal, &bindless); + jay_def offset = nj_src(*offset_src); + nir_def *ndata = data_src ? data_src->ssa : &intr->def; + jay_def dst = has_dest ? nj_def(&intr->def) : jay_null(); + int32_t base_offset = + nir_intrinsic_has_base(intr) ? nir_intrinsic_base(intr) : 0; + + /* Optimize increment/decrement */ + if (op == LSC_OP_ATOMIC_ADD && nir_src_is_const(*data_src)) { + int64_t add_val = nir_src_as_int(*data_src); + if (add_val == 1 || add_val == -1) { + op = add_val == 1 ? LSC_OP_ATOMIC_INC : LSC_OP_ATOMIC_DEC; + data = jay_null(); + } + } + + /* Pack the coordinates. TODO: MSAA */ + if (tgm) { + unsigned nr = nir_image_intrinsic_coord_components(intr); + offset = jay_extract_range(offset, 0, nr); + } + + internal |= scratch; + enum lsc_addr_surface_type surf_type = internal ? LSC_ADDR_SURFTYPE_SS : + bindless ? LSC_ADDR_SURFTYPE_BSS : + (bti || ubo) ? LSC_ADDR_SURFTYPE_BTI : + LSC_ADDR_SURFTYPE_FLAT; + + bool a64 = surf_type == LSC_ADDR_SURFTYPE_FLAT && sfid == BRW_SFID_UGM; + enum lsc_addr_size addr_size = a64 ? LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32; + enum jay_type offset_type = a64 ? JAY_TYPE_U64 : JAY_TYPE_U32; + + bool cmask = op == LSC_OP_LOAD_CMASK || op == LSC_OP_STORE_CMASK; + bool uniform = !(has_dest && dst.file != UGPR); + + if (nir_intrinsic_has_align(intr)) { + assert(nir_intrinsic_align(intr) >= (ndata->bit_size / 8)); + } + + if (!has_dest) { + uniform &= jay_is_null(data) || data.file == UGPR; + uniform &= jay_is_null(offset) || offset.file == UGPR; + uniform &= !(cmask || urb); + } + + /* Per bspec 57330, 8-bit/16-bit are not supported for transpose */ + bool transpose = uniform && !cmask && ndata->bit_size >= 32; + bool scalar_uniform = uniform && !cmask && ndata->bit_size < 32; + + if (!uniform) { + offset = jay_as_gpr(b, offset); + } else if (!transpose) { + offset = jay_src_as_strided(b, offset, a64 ? 2 : 1, UGPR); + } + + if (!jay_is_null(data) && !transpose && !scalar_uniform) + data = jay_as_gpr(b, data); + + unsigned access = + nir_intrinsic_has_access(intr) ? nir_intrinsic_access(intr) : 0; + + bool volatile_access = access & ACCESS_VOLATILE; + bool coherent_access = access & ACCESS_COHERENT; + + /* Bspec: Atomic instruction -> Cache section: + * + * Atomic messages are always forced to "un-cacheable" in the L1 + * cache. 
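+    *
+    * (Hence the L1UC cache policies chosen for atomics in the cache mode
+    * selection below.)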
+ * + * Bspec: Overview of memory Access: + * + * If a read from a Null tile gets a cache-hit in a virtually-addressed + * GPU cache, then the read may not return zeroes. + * + * If a shader writes to a null tile and wants to be able to read it back + * as zero, it will use the 'volatile' decoration for the access, otherwise + * the compiler may choose to optimize things out, breaking the + * residencyNonResidentStrict guarantees. Due to the above, we need to make + * these operations uncached. + */ + unsigned cache = + urb ? LSC_CACHE(devinfo, STORE, L1UC_L3UC) : + lsc_opcode_is_atomic(op) ? + LSC_CACHE(devinfo, STORE, L1UC_L3WB) : + volatile_access ? + (devinfo->ver >= 20 ? + /* Xe2 has a better L3 that can deal with null tiles.*/ + (!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3WB) : + LSC_CACHE(devinfo, LOAD, L1UC_L3C)) : + /* On older platforms, all caches have to be bypassed. */ + (!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3UC) : + LSC_CACHE(devinfo, LOAD, L1UC_L3UC))) : + /* Skip L1 for coherent accesses */ + coherent_access ? (!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3WB) : + LSC_CACHE(devinfo, LOAD, L1UC_L3C)) : + !has_dest ? LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS) : + LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS); + + unsigned max_imm_bits = brw_max_immediate_offset_bits(surf_type); + assert(base_offset >= u_intN_min(max_imm_bits)); + assert(base_offset <= u_intN_max(max_imm_bits)); + assert(base_offset == 0 || sfid != BRW_SFID_TGM); + + const unsigned base_offs_bits = + util_bitpack_sint(base_offset, 0, max_imm_bits - 1); + + unsigned nr = ndata->num_components; + uint64_t desc = + lsc_msg_desc(devinfo, op, surf_type, addr_size, + lsc_bits_to_data_size(ndata->bit_size), + cmask ? BITFIELD_MASK(nr) : nr, transpose, cache); + + jay_def tmp = dst; + + if (dst.file == UGPR) { + if (transpose) { + /* Transpose writes whole GRFs, so round up */ + tmp = jay_alloc_def(b, UGPR, + ALIGN_POT(jay_num_values(dst), + jay_ugpr_per_grf(b->shader))); + } else { + /* Without transpose we write at GRF granularity. Pad out. */ + tmp = jay_alloc_def(b, UGPR, + jay_ugpr_per_grf(b->shader) * jay_num_values(dst)); + } + } + + jay_def srcs[] = { offset, data }; + + /* Second data source immediately follows the first */ + if (op == LSC_OP_ATOMIC_CMPXCHG || op == LSC_OP_ATOMIC_FCMPXCHG) { + jay_def data2 = nj_src(*(data_src + 1)); + + if (!transpose) { + data2 = jay_as_gpr(b, data2); + } + + srcs[1] = jay_collect_two(b, data, data2); + } + + jay_def ex_desc = jay_null(); + uint32_t ex_desc_imm = 0; + if (scratch) { + ex_desc = jay_scratch_surface(nj); + + if (has_dest) { + b->shader->fills++; + } else { + b->shader->spills++; + } + } else if (surf_type == LSC_ADDR_SURFTYPE_FLAT) { + desc |= ((uint64_t) lsc_flat_ex_desc(devinfo, base_offs_bits) << 32); + } else if (jay_is_null(bti_indirect)) { + desc |= + ((uint64_t) lsc_bti_ex_desc(devinfo, bti_const, base_offs_bits) << 32); + } else if (!jay_is_null(bti_indirect)) { + ex_desc = bti_indirect; + + if (surf_type == LSC_ADDR_SURFTYPE_SS || + surf_type == LSC_ADDR_SURFTYPE_BSS) { + ex_desc_imm = SET_BITS(GET_BITS(base_offs_bits, 16, 4), 31, 19) | + SET_BITS(GET_BITS(base_offs_bits, 3, 0), 15, 12); + } else { + /* TODO: Move the SHL to NIR for CSE? */ + assert(surf_type == LSC_ADDR_SURFTYPE_BTI); + assert(base_offs_bits == 0); + ex_desc = jay_SHL_u32(b, bti_indirect, 24); + } + } + + enum jay_type data_type = jay_type(JAY_TYPE_U, MAX2(ndata->bit_size, 32)); + jay_SEND(b, .sfid = sfid, .msg_desc = desc, .srcs = srcs, + .nr_srcs = jay_is_null(data) ? 
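+                           /* address only */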
1 : 2, .dst = tmp, .type = data_type, + .src_type = { offset_type, data_type }, .uniform = uniform, + .bindless = surf_type == LSC_ADDR_SURFTYPE_BSS, .ex_desc = ex_desc, + .ex_desc_imm = ex_desc_imm); + + if (has_dest && !jay_defs_equivalent(tmp, dst)) { + jay_copy_strided(b, dst, tmp, !transpose); + } +} + +static void +jay_emit_barycentric(struct nir_to_jay_state *nj, + nir_intrinsic_instr *intr, + enum intel_barycentric_mode mode) +{ + assert(nj->s->stage == MESA_SHADER_FRAGMENT); + enum glsl_interp_mode glsl_mode = nir_intrinsic_interp_mode(intr); + + if (glsl_mode == INTERP_MODE_NOPERSPECTIVE) { + mode += INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL; + } else { + assert(glsl_mode == INTERP_MODE_SMOOTH); + } + + jay_copy(&nj->bld, nj_def(&intr->def), nj->payload.fs.bary[mode]); +} + +static void +jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) +{ + jay_shader *s = nj->s; + jay_function *f = nj->f; + jay_builder *b = &nj->bld; + jay_cs_payload *cs = + mesa_shader_stage_is_compute(s->stage) ? &nj->payload.cs : NULL; + + const bool has_dest = nir_intrinsic_infos[intr->intrinsic].has_dest; + jay_def dst = has_dest ? nj_def(&intr->def) : jay_null(); + + switch (intr->intrinsic) { + case nir_intrinsic_resource_intel: + /* No code to generate here */ + break; + + case nir_intrinsic_global_atomic: + case nir_intrinsic_global_atomic_swap: + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_load_global: + case nir_intrinsic_load_global_constant: + case nir_intrinsic_load_global_constant_uniform_block_intel: + case nir_intrinsic_load_scratch_intel: + case nir_intrinsic_load_shared: + case nir_intrinsic_load_shared_uniform_block_intel: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_ssbo_intel: + case nir_intrinsic_load_ssbo_uniform_block_intel: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ubo_uniform_block_intel: + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + case nir_intrinsic_store_global: + case nir_intrinsic_store_urb_lsc_intel: + case nir_intrinsic_store_scratch_intel: + case nir_intrinsic_store_shared: + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_ssbo_intel: + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_bindless_image_atomic: + case nir_intrinsic_bindless_image_atomic_swap: + jay_emit_mem_access(nj, intr); + break; + + case nir_intrinsic_load_push_data_intel: { + unsigned sz = intr->def.bit_size / 8; + unsigned base_offset = nir_intrinsic_base(intr); + assert(util_is_aligned(base_offset, sz)); + + if (nir_src_is_const(intr->src[0])) { + unsigned load_offset = nir_src_as_uint(intr->src[0]); + unsigned offs = base_offset + load_offset; + assert(util_is_aligned(load_offset, sz)); + + if (sz >= 4) { + jay_foreach_comp(dst, c) { + jay_MOV(b, jay_extract(dst, c), + nj->payload.push_data[(offs / 4) + c]); + } + } else { + jay_foreach_comp(dst, c) { + unsigned comp_offs = offs + c * sz; + if (util_is_aligned(comp_offs, 4)) { + jay_MOV(b, jay_extract(dst, c), + nj->payload.push_data[comp_offs / 4]); + } else { + jay_CVT(b, JAY_TYPE_U32, jay_extract(dst, c), + nj->payload.push_data[comp_offs / 4], + JAY_TYPE_U | intr->def.bit_size, JAY_ROUND, + (comp_offs % 4) / sz); + } + } + } + } else { + UNREACHABLE("todo: indirect push data"); + } + break; + } + + case 
nir_intrinsic_barrier: + if (nir_intrinsic_memory_scope(intr) != SCOPE_NONE) { + jay_emit_memory_barrier(nj, intr); + } + + if (cs) { + if (nir_intrinsic_execution_scope(intr) == SCOPE_WORKGROUP) { + if (jay_workgroup_is_one_subgroup(b, nj->nir)) { + // XXX: when we have a scheduler, jay_SCHEDULE_BARRIER(b); + } else { + jay_emit_signal_barrier(nj); + s->prog_data->cs.uses_barrier = true; + } + } + } else { + // XXX: when we have a scheduler, jay_SCHEDULE_BARRIER(b); + } + break; + + case nir_intrinsic_begin_invocation_interlock: + case nir_intrinsic_end_invocation_interlock: + UNREACHABLE("TODO"); + + case nir_intrinsic_load_reloc_const_intel: + jay_RELOC(b, dst, nir_intrinsic_param_idx(intr), + nir_intrinsic_base(intr)); + break; + + case nir_intrinsic_store_render_target_intel: + assert(nj->nir->info.stage == MESA_SHADER_FRAGMENT); + jay_emit_fb_write(b, intr); + break; + + case nir_intrinsic_shader_clock: + /* We must access the timestamp register atomically, but 64-bit + * instructions cannot read ARF. Instead use a 2x32-bit vectorized move. + */ + assert(dst.file == UGPR && "required for vectorization"); + jay_MOV(b, dst, jay_contiguous_def(J_ARF, JAY_ARF_TIMESTAMP, 2))->type = + JAY_TYPE_U32; + break; + + case nir_intrinsic_load_sample_mask_in: { + jay_def mask = jay_extract(nj->payload.u0, 15); + + if (nj->s->dispatch_width == 32) { + /* TODO: Optimize */ + jay_def hi = jay_extract(nj->payload.u1, 15); + mask = jay_BFI2_u32(b, 0xffff0000, hi, mask); + } + + jay_MOV(b, dst, mask); + break; + } + + case nir_intrinsic_load_subgroup_invocation: + /* TODO: Lower this in NIR? */ + jay_CVT(b, JAY_TYPE_U32, dst, nj->payload.lane_id, JAY_TYPE_U16, + JAY_ROUND, 0); + break; + + case nir_intrinsic_demote: + case nir_intrinsic_demote_if: + /* TODO: Already lowered, but need to implement for performance. */ + break; + + case nir_intrinsic_ddx: + case nir_intrinsic_ddx_coarse: + jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXXX, + JAY_QUAD_SWIZZLE_YYYY); + break; + case nir_intrinsic_ddx_fine: + jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXZZ, + JAY_QUAD_SWIZZLE_YYWW); + break; + + case nir_intrinsic_ddy: + case nir_intrinsic_ddy_coarse: + jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXXX, + JAY_QUAD_SWIZZLE_ZZZZ); + break; + case nir_intrinsic_ddy_fine: + jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XYXY, + JAY_QUAD_SWIZZLE_ZWZW); + break; + + case nir_intrinsic_first_invocation: + jay_MOV(b, dst, emit_active_lane(nj)); + break; + + case nir_intrinsic_read_first_invocation: + jay_MOV(b, dst, emit_uniformize(nj, nj_src(intr->src[0]))); + break; + + case nir_intrinsic_ballot: + case nir_intrinsic_ballot_relaxed: { + jay_def val = nj_src(intr->src[0]); + if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) { + val = emit_active_lane_mask(nj); + } else if (val.file == UFLAG) { + /* Move to a FLAG temporary so we can ballot it. */ + val = jay_MOV(b, jay_alloc_def(b, FLAG, 1), val)->dst; + } else { + assert(val.file == FLAG); + } + + assert(intr->def.bit_size == b->shader->dispatch_width); + jay_MOV(b, dst, val); + break; + } + + /* We prefer to inverse_ballot by copying a UGPR to the flag. If we have a + * GPR input, we could uniformize (as behaviour is undefined for + * non-uniform inputs) but a lowered bit extract is cheaper than uniformize. 
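+    *
+    * The GPR path below amounts to, per lane:
+    *
+    *    flag = ((x >> lane_id) & 1) != 0
+    *
+    * with the comparison folded into a conditional modifier on the AND.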
+ */ + case nir_intrinsic_inverse_ballot: { + assert(dst.file == FLAG); + jay_def x = nj_src(intr->src[0]); + if (x.file == GPR) { + jay_def shr = jay_SHR_u32(b, x, nj->payload.lane_id); + jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(), shr, 1); + jay_set_conditional_mod(b, and, dst, JAY_CONDITIONAL_NE); + } else { + jay_MOV(b, dst, x)->type = JAY_TYPE_U | b->shader->dispatch_width; + } + + break; + } + + case nir_intrinsic_load_local_invocation_id: + assert(cs); + UNREACHABLE("todo: implement me from payload"); + jay_copy(b, dst, cs->local_invocation_ids); + break; + + case nir_intrinsic_load_barycentric_pixel: + jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL); + break; + + case nir_intrinsic_load_barycentric_sample: + jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE); + break; + + case nir_intrinsic_load_barycentric_centroid: + jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID); + break; + + case nir_intrinsic_load_pixel_coord_intel: + jay_MOV(b, dst, nj->payload.fs.coord.xy); + break; + + case nir_intrinsic_load_frag_coord_z: + jay_MOV(b, dst, nj->payload.fs.coord.z); + break; + + case nir_intrinsic_load_frag_coord_w_rcp: + jay_MOV(b, dst, nj->payload.fs.coord.w); + break; + + case nir_intrinsic_load_urb_output_handle_intel: + jay_MOV(b, dst, nj->payload.urb_handle); + break; + + case nir_intrinsic_load_layer_id: + jay_EXTRACT_LAYER(b, dst, jay_extract(nj->payload.u0, 9), + payload_u1(nj, 9, 1)); + break; + + case nir_intrinsic_load_front_face: { + /* Bit 11 is facingness for the first polygon. TODO: Multipolygon. */ + jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(), + jay_extract(nj->payload.u0, 9), BITFIELD_BIT(11)); + + /* The bit is actually backfacingness so check for equality with 0 */ + jay_set_conditional_mod(b, and, dst, JAY_CONDITIONAL_EQ); + break; + } + + /* Sample ID comes in as 4-bit numbers in g1.0: + * + * 15:12 Slot 3 SampleID + * 11:8 Slot 2 SampleID + * 7:4 Slot 1 SampleID + * 3:0 Slot 0 SampleID + * + * Each slot corresponds to four channels, so we want to replicate each + * half-byte value to 4 channels in a row: + * + * dst+0: .7 .6 .5 .4 .3 .2 .1 .0 + * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0 + * + * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 + * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8 + * + * First, we read g1.0 with a <1,8,0>UB region, causing the first 8 + * channels to read the first byte (7:0), and the second group of 8 + * channels to read the second byte (15:8). Then, we shift right by + * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3 + * values into place. Finally, we AND with 0xf to keep the low nibble. + * + * According to the "PS Thread Payload for Normal Dispatch" + * pages on the BSpec, the sample ids are stored in R0.8/R1.8 + * on gfx20+ and in R1.0/R2.0 on gfx8+. 
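+    *
+    * Worked example (16 lanes): if the packed nibbles read 0x3210 (slots
+    * 0-3 holding sample IDs 0-3), lanes 0-7 load byte 0x10 and lanes 8-15
+    * load byte 0x32; after the shift and mask, lanes 0-3 get sample ID 0,
+    * lanes 4-7 get 1, lanes 8-11 get 2 and lanes 12-15 get 3.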
+ */ + case nir_intrinsic_load_sample_id: { + jay_def x = jay_alloc_def(b, GPR, 1); + jay_EXTRACT_BYTE_PER_8LANES(b, x, jay_extract(nj->payload.u0, 8), + payload_u1(nj, 8, 1)); + jay_AND_U32_U16(b, dst, jay_SHR_ODD_SUBSPANS_BY_4_u16(b, x), 0xF); + break; + } + + case nir_intrinsic_load_input: + if (s->stage == MESA_SHADER_VERTEX) { + unsigned offs = nir_intrinsic_base(intr) * 4; + offs += nir_intrinsic_component(intr); + assert(intr->def.bit_size == 32 && "todo"); + + jay_copy(b, dst, + jay_collect_vectors(b, nj->payload.vs.attributes + offs, + intr->def.num_components)); + break; + } + + FALLTHROUGH; + case nir_intrinsic_load_fs_input_interp_deltas: { + assert(s->stage == MESA_SHADER_FRAGMENT); + unsigned location = nir_intrinsic_io_semantics(intr).location + + nir_src_as_uint(intr->src[0]); + unsigned i = (s->prog_data->fs.urb_setup[location] * 4) + + nir_intrinsic_component(intr); + + if (intr->intrinsic == nir_intrinsic_load_input) { + assert(intr->def.num_components == 1 && "should be scalarized"); + } + + /* Zeroth delta is the flat value */ + jay_copy(b, dst, nj->payload.fs.deltas[i]); + break; + } + + case nir_intrinsic_load_subgroup_id: + assert(cs && f->is_entrypoint && "todo: this needs ABI"); + /* Subgroup ID in Thread Group is u0.2 bits 7:0 */ + jay_AND(b, JAY_TYPE_U32, dst, jay_extract(nj->payload.u0, 2), 0xFF); + break; + + case nir_intrinsic_load_num_subgroups: + assert(cs && f->is_entrypoint && "todo: this needs ABI"); + /* Number of subgroups in Thread Group is u0.2 bits 31:24 */ + jay_SHR(b, JAY_TYPE_U32, dst, jay_extract(nj->payload.u0, 2), 24); + break; + + case nir_intrinsic_load_workgroup_id: + assert(cs && f->is_entrypoint && "todo: this needs ABI"); + jay_MOV(b, jay_extract(dst, 0), jay_extract(nj->payload.u0, 1)); + jay_MOV(b, jay_extract(dst, 1), jay_extract(nj->payload.u0, 6)); + jay_MOV(b, jay_extract(dst, 2), jay_extract(nj->payload.u0, 7)); + break; + + case nir_intrinsic_shuffle_intel: { + jay_def data = nj_src(intr->src[0]); + + if (nir_src_is_const(intr->src[1])) { + /* Broadcast takes a lane index, with only 32-bit registers */ + jay_BROADCAST_IMM(b, dst, data, nir_src_as_uint(intr->src[1]) / 4); + } else { + /* Shuffle takes a byte index */ + jay_SHUFFLE(b, dst, data, nj_src(intr->src[1])); + } + + break; + } + + case nir_intrinsic_quad_broadcast: + jay_QUAD_SWIZZLE(b, dst, nj_src(intr->src[0]), + JAY_QUAD_SWIZZLE_XXXX + nir_src_as_uint(intr->src[1])); + break; + + case nir_intrinsic_load_inline_data_intel: { + assert(cs && f->is_entrypoint && "todo: this needs ABI"); + b->shader->prog_data->cs.uses_inline_data = true; + + unsigned offset = nir_intrinsic_base(intr) / 4; + unsigned nr = jay_num_values(dst); + jay_copy(b, dst, jay_extract_range(nj->payload.inline_data, offset, nr)); + break; + } + + default: +#ifndef NDEBUG + assert(intr->intrinsic < nir_num_intrinsics); + fprintf(stdout, "intrinsic: %s\n", + nir_intrinsic_infos[intr->intrinsic].name); +#endif + UNREACHABLE("unknown intrinsic"); + } +} + +static bool +sampler_needs_header(enum brw_sampler_opcode op, + nir_texop nir_op, + const struct intel_device_info *devinfo) +{ + switch (op) { + case BRW_SAMPLER_OPCODE_SAMPLEINFO: + return true; + case BRW_SAMPLER_OPCODE_LD: + case BRW_SAMPLER_OPCODE_LD_LZ: + /* Xe3 HW does not seem to work unless we force a header. 
+    */
+      return devinfo->ver >= 30;
+   default:
+      return nir_op == nir_texop_tg4;
+   }
+}
+
+static void
+jay_emit_texture(struct nir_to_jay_state *nj, nir_tex_instr *tex)
+{
+   /* SKL PRMs: Volume 7: 3D-Media-GPGPU:
+    *
+    *    "The Pixel Null Mask field, when enabled via the Pixel Null Mask
+    *     Enable will be incorrect for sample_c when applied to a surface with
+    *     64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask
+    *     Enable may incorrectly report pixels as referencing a Null surface."
+    *
+    * We'll take care of this in NIR.
+    */
+   assert(!tex->is_sparse ||
+          nir_tex_instr_src_index(tex, nir_tex_src_comparator) == -1);
+
+   jay_builder *b = &nj->bld;
+   jay_def dst = nj_def(&tex->def);
+   jay_def tmp = dst;
+
+   const enum brw_sampler_opcode op = (enum brw_sampler_opcode)(
+      tex->backend_flags & ~BRW_TEX_INSTR_FUSED_EU_DISABLE);
+   const struct brw_sampler_payload_desc *payload_desc =
+      brw_get_sampler_payload_desc(op);
+
+   /* First deal with surface & sampler */
+   unsigned payload_type_bit_size = 0;
+   bool surface_bindless = false;
+   bool sampler_bindless = false;
+   jay_def surface, sampler, packed_offsets = jay_null();
+   jay_def payload[JAY_MAX_SAMPLER_MESSAGE_SIZE];
+   int i;
+   if ((i = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle)) >= 0) {
+      unsigned x;
+      surface =
+         jay_resource_handle(b, &tex->src[i].src, &x, NULL, &surface_bindless);
+      if (jay_is_null(surface))
+         surface = jay_imm(x);
+      assert(tex->texture_index == 0);
+   } else if ((i = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset)) >=
+              0) {
+      unsigned x;
+      surface =
+         jay_resource_handle(b, &tex->src[i].src, &x, NULL, &surface_bindless);
+      if (jay_is_null(surface))
+         surface = jay_imm(x + tex->texture_index);
+      else if (tex->texture_index)
+         surface = jay_ADD_u32(b, surface, tex->texture_index);
+   } else {
+      surface = jay_imm(tex->texture_index);
+   }
+
+   if ((i = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle)) >= 0) {
+      unsigned x;
+      sampler =
+         jay_resource_handle(b, &tex->src[i].src, &x, NULL, &sampler_bindless);
+      if (jay_is_null(sampler))
+         sampler = jay_imm(x);
+      assert(tex->sampler_index == 0);
+   } else if ((i = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset)) >=
+              0) {
+      unsigned x;
+      sampler =
+         jay_resource_handle(b, &tex->src[i].src, &x, NULL, &sampler_bindless);
+      if (jay_is_null(sampler))
+         sampler = jay_imm(x + tex->sampler_index);
+      else
+         sampler = jay_ADD_u32(b, sampler, tex->sampler_index);
+   } else {
+      sampler = jay_imm(tex->sampler_index);
+   }
+
+   surface = emit_uniformize(nj, surface);
+   sampler = emit_uniformize(nj, sampler);
+
+   /* Now the sampler payload */
+   bool has_offset_in_payload = false;
+   bool payload_uniform = true;
+   uint32_t n_sources = TEX_LOGICAL_SRC_PAYLOAD0;
+   for (uint32_t i = 0;
+        payload_desc->sources[i].param != BRW_SAMPLER_PAYLOAD_PARAM_INVALID;
+        i++) {
+      nir_tex_src_type nir_source;
+      unsigned nir_comp;
+
+#define P(name) BRW_SAMPLER_PAYLOAD_PARAM_##name
+#define S(name, component) \
+   do { \
+      nir_source = nir_tex_src_##name; \
+      nir_comp = component; \
+   } while (0)
+
+      struct brw_sampler_payload_src sampler_src = payload_desc->sources[i];
+
+      switch (sampler_src.param) {
+      case P(U):
+         S(coord, 0);
+         break;
+      case P(V):
+         S(coord, 1);
+         break;
+      case P(R):
+         S(coord, 2);
+         break;
+      case P(AI):
+         S(coord, 3);
+         break;
+      case P(BIAS):
+         S(bias, 0);
+         break;
+      case P(LOD):
+         S(lod, 0);
+         break;
+      case P(MLOD):
+         S(min_lod, 0);
+         break;
+      case P(REF):
+         S(comparator, 0);
+         break;
+      case P(DUDX):
+         S(ddx, 0);
+         break;
+      case P(DUDY):
+         S(ddy, 0);
+         break;
+      case P(DVDX):
+         S(ddx, 1);
+         break;
+      case P(DVDY):
+         S(ddy, 1);
+         break;
+      case P(DRDX):
+         S(ddx, 2);
+         break;
+      case P(DRDY):
+         S(ddy, 2);
+         break;
+      case P(SI):
+         S(ms_index, 0);
+         break;
+      case P(MCSL):
+         S(ms_mcs_intel, 0);
+         break;
+      case P(MCSH):
+         S(ms_mcs_intel, 1);
+         break;
+      case P(MCS0):
+         S(ms_mcs_intel, 0);
+         break;
+      case P(MCS1):
+         S(ms_mcs_intel, 1);
+         break;
+      case P(MCS2):
+         S(ms_mcs_intel, 2);
+         break;
+      case P(MCS3):
+         S(ms_mcs_intel, 3);
+         break;
+
+      case P(OFFU):
+         S(offset, 0);
+         has_offset_in_payload = true;
+         break;
+      case P(OFFV):
+         S(offset, 1);
+         has_offset_in_payload = true;
+         break;
+      case P(OFFUV4):
+      case P(OFFUVR4):
+      case P(OFFUV6):
+      case P(OFFUVR6):
+      case P(BIAS_OFFUV6):
+      case P(BIAS_OFFUVR4):
+      case P(LOD_OFFUV6):
+      case P(LOD_OFFUVR4):
+      case P(OFFUV4_R):
+      case P(OFFUV6_R):
+      case P(OFFUVR4_R):
+         /* There is no payload with two packed entries, so backend1 is
+          * always the single packed payload parameter. */
+         S(backend1, 0);
+         has_offset_in_payload = true;
+         break;
+
+      case P(BIAS_AI):
+      case P(LOD_AI):
+      case P(MLOD_R):
+         /* There is no payload with two packed entries, so backend1 is
+          * always the single packed payload parameter. */
+         S(backend1, 0);
+         break;
+
+      default:
+         UNREACHABLE("unhandled sampler param");
+      }
+
+#undef P
+#undef S
+
+      jay_def param_val = jay_null();
+
+      int j = nir_tex_instr_src_index(tex, nir_source);
+      if (j >= 0 && nir_comp < tex->src[j].src.ssa->num_components) {
+         param_val = jay_extract(nj_src(tex->src[j].src), nir_comp);
+
+         unsigned bitsize = nir_src_bit_size(tex->src[j].src);
+         assert(payload_type_bit_size == 0 || payload_type_bit_size == bitsize);
+         payload_type_bit_size = bitsize;
+      }
+
+      /* The hardware requires a LOD for buffer textures */
+      if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF &&
+          sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_LOD) {
+         sampler_src.optional = false;
+      }
+
+      /* Wa_14012688258:
+       *
+       *    Don't trim zeros at the end of payload for sample operations
+       *    in cube and cube arrays.
+       *
+       * Compiler should send U,V,R parameters even if V,R are 0.
+       */
+      if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
+          intel_needs_workaround(nj->devinfo, 14012688258) &&
+          (sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_U ||
+           sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_V ||
+           sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_R)) {
+         sampler_src.optional = false;
+      }
+
+      /* The number of sources is dictated by the last source that is either
+       * present or required; trailing optional sources that are zero can be
+       * dropped.
+       */
+      if (!sampler_src.optional || !jay_is_null(param_val))
+         n_sources = i + 1;
+
+      if (jay_is_null(param_val)) {
+         param_val = jay_alloc_def(b, dst.file, 1);
+         jay_MOV(b, param_val, 0);
+      }
+
+      payload[i] = param_val;
+      payload_uniform &= jay_is_uniform(payload[i]);
+   }
+
+   i = nir_tex_instr_src_index(tex, nir_tex_src_backend2);
+   if (i >= 0) {
+      packed_offsets = nj_src(tex->src[i].src);
+   }
+
+   /* Xe2+ should never use packed offsets since it has enough opcodes to
+    * handle any programmable offset.
+    */
+   assert(jay_is_null(packed_offsets) || nj->devinfo->ver < 20);
+
+   /* If the NIR instruction has an offset param but the sampler payload
+    * doesn't, we can put the offset into the header of the message.
+    *
+    * The restriction though is that it should be a constant value.
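+    *
+    * For example, a constant offset of (1, -2) is packed into the header
+    * dword built below as u = 0x1 in bits 11:8 and v = 0xe in bits 7:4.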
+ */ + int offs_idx = nir_tex_instr_src_index(tex, nir_tex_src_offset); + bool has_const_offsets = offs_idx != -1 && !has_offset_in_payload; + + bool is_high_sampler = !jay_is_imm(sampler) || jay_as_uint(sampler) >= 16; + bool residency = tex->is_sparse; + unsigned null_mask_component = 0; + + const bool needs_header = sampler_needs_header(op, tex->op, nj->devinfo) || + has_const_offsets || + !jay_is_null(packed_offsets) || + sampler_bindless || + is_high_sampler || + residency; + + uint8_t component_mask; + if (tex->op == nir_texop_tg4) { + component_mask = WRITEMASK_XYZW; + } else if (residency) { + /* intel_nir_lower_sparse guarantees that texturing operations only + * read the data, or the sparse residency code, but not both at once. + * + * We need to use UGPRs for the residency result because the sampler + * returns the null pixel mask in lane 0, regardless of lanemasking. + * + * Unfortunately, the sampler doesn't allow us to writemask out all + * four colour channels, so we have to needlessly return red. This + * isn't uniform data, but we store it in an array of UGPRs anyway + * in order to have a consistent def file. The colour data will be + * immediately dead anyway. + */ + assert(tex->op == nir_texop_sparse_residency_intel || + tex->op == nir_texop_sparse_residency_txf_intel); + assert(nir_def_components_read(&tex->def) == WRITEMASK_Y); + component_mask = WRITEMASK_X; + unsigned red_grfs = payload_uniform ? 1 : jay_grf_per_gpr(b->shader); + unsigned grfs = red_grfs + 1; + tmp = jay_alloc_def(b, UGPR, grfs * jay_ugpr_per_grf(b->shader)); + null_mask_component = red_grfs * jay_ugpr_per_grf(b->shader); + } else { + component_mask = nir_def_components_read(&tex->def); + + /* We can reduce the return length of the message to drop unused + * trailing components, but shrinking with a discontiguous mask + * requires a message header. We only do that if we need a header + * for other reasons, as it's more expensive than writing extra data. + */ + if (!needs_header) { + component_mask = + (uint8_t) BITFIELD_MASK(util_last_bit(component_mask)); + } + + /* TODO: Shrink 16-bit textures too. Shrinking is problematic for some + * component masks due to 32-bit granularity of ISA registers. 
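+    *
+    * For the 32-bit path above: a .xy read (mask 0b0011) just shortens the
+    * response length, while a .xz read (mask 0b0101) is widened to 0b0111
+    * unless a header is already present to writemask .y out.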
+    */
+      if (tex->def.bit_size != 32 || (jay_debug & JAY_DBG_NOOPT))
+         component_mask = nir_component_mask(tex->def.num_components);
+
+      /* If we shrunk the destination, we need a temporary */
+      if (component_mask != BITFIELD_MASK(tex->def.num_components)) {
+         tmp = jay_alloc_def(b, GPR, util_bitcount(component_mask));
+      }
+   }
+
+   /* SENDs always write entire GRFs so we need to pad out for uniform dests */
+   if (dst.file == UGPR && !residency) {
+      unsigned nr = jay_ugpr_per_grf(b->shader) * jay_num_values(tmp);
+      tmp = jay_alloc_def(b, UGPR, nr);
+   }
+
+   if (tex->op == nir_texop_texture_samples) {
+      assert(needs_header);
+      payload_type_bit_size = 32;
+      n_sources = 0;
+   }
+
+   jay_def header = jay_null();
+   if (needs_header) {
+      uint32_t header2;
+      if (tex->op == nir_texop_tg4) {
+         /* Gathers have a component but no write mask */
+         header2 = (tex->component << 16);
+      } else {
+         /* If present, the header write mask is inverted compared to NIR */
+         header2 = (~component_mask & 0xf) << 12;
+      }
+
+      if (residency)
+         header2 |= 1 << 23; /* g0.2 bit 23: Pixel Null Mask Enable */
+
+      if (has_const_offsets) {
+         const unsigned num_components = nir_tex_instr_src_size(tex, offs_idx);
+         for (unsigned i = 0; i < num_components; i++) {
+            nir_scalar s = nir_get_scalar(tex->src[offs_idx].src.ssa, i);
+            s = nir_scalar_chase_movs(s);
+            assert(nir_scalar_is_const(s));
+            int offset = nir_scalar_as_int(s);
+
+            /* Offsets are 4 bits each, in reversed order */
+            header2 |= (offset & 0xf) << ((2 - i) * 4);
+         }
+      }
+
+      /* Vectorized zeroing of the header. TODO: This can be optimized more. */
+      jay_def zeroes = jay_alloc_def(b, UGPR, jay_ugpr_per_grf(b->shader));
+      jay_MOV(b, zeroes, 0);
+
+      jay_def ugprs[JAY_MAX_DEF_LENGTH];
+      jay_foreach_comp(zeroes, i) {
+         ugprs[i] = jay_extract(zeroes, i);
+      }
+
+      /* Set the main immediate part of the header */
+      if (header2 != 0) {
+         ugprs[2] = jay_MOV_u32(b, header2);
+      }
+
+      if (sampler_bindless) {
+         /* Bindless sampler handles aren't relative to the sampler state
+          * pointer passed into the shader through SAMPLER_STATE_POINTERS_*.
+          * Instead, it's an absolute pointer relative to dynamic state base
+          * address.
+          *
+          * Sampler states are 16 bytes each and the pointer we give here has
+          * to be 32-byte aligned. In order to avoid more indirect messages
+          * than required, we assume that all bindless sampler states are
+          * 32-byte aligned. This sacrifices a bit of general state base
+          * address space but means we can do something more efficient in the
+          * shader.
+          */
+         ugprs[3] = sampler;
+      } else {
+         /* Select the default dynamic state base address + offset */
+         jay_def sampler_ptr = nj->payload.sampler_state_pointer;
+
+         /* Gfx11+ sampler message headers include bits in 4:0 which conflict
+          * with the ones included in g0.3 bits 4:0. Mask them out.
+          */
+         if (b->shader->devinfo->ver >= 11) {
+            sampler_ptr = jay_AND_u32(b, sampler_ptr, INTEL_MASK(31, 5));
+         }
+
+         /* TODO: We should probably lower this in NIR.
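+          *
+          * E.g. for sampler index 20, the descriptor gets 20 % 16 = 4 and
+          * the pointer below is advanced by 16 * 16 B = 256 bytes.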
*/ + if (is_high_sampler) { + if (jay_is_imm(sampler)) { + unsigned s = jay_as_uint(sampler); + const int sampler_state_size_B = 16; + unsigned offs_B = ROUND_DOWN_TO(s, 16) * sampler_state_size_B; + assert(offs_B > 0 && "since s > 0"); + sampler_ptr = jay_ADD_u32(b, sampler_ptr, offs_B); + } else { + jay_def offs_B = + jay_SHL_u32(b, jay_AND_u32(b, sampler, 0xf0), 4); + sampler_ptr = jay_ADD_u32(b, sampler_ptr, offs_B); + } + } + + ugprs[3] = sampler_ptr; + } + /* Zip it all up into a vector of UGPRs which will RA to a single GRF */ + header = jay_collect_vectors(b, ugprs, jay_num_values(zeroes)); + } + + assert(payload_type_bit_size == 16 || payload_type_bit_size == 32); + unsigned simd_mode = 0; + unsigned simd_width = payload_uniform ? 1 : nj->s->dispatch_width; + if (nj->devinfo->ver < 20) { + if (payload_type_bit_size == 16) { + assert(nj->devinfo->ver >= 11); + simd_mode = simd_width <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H : + GFX10_SAMPLER_SIMD_MODE_SIMD16H; + } else { + simd_mode = simd_width <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 : + BRW_SAMPLER_SIMD_MODE_SIMD16; + } + } else { + if (payload_type_bit_size == 16) { + simd_mode = simd_width <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H : + XE2_SAMPLER_SIMD_MODE_SIMD32H; + } else { + simd_mode = simd_width <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 : + XE2_SAMPLER_SIMD_MODE_SIMD32; + } + } + + uint64_t desc = 0; + jay_def desc_src = jay_null(), desc_ex_src = jay_null(); + + unsigned sampler_imm = 0; + if (jay_is_imm(sampler) && !sampler_bindless) { + sampler_imm = jay_as_uint(sampler) % 16; + } + + const unsigned msg_type = brw_get_sampler_hw_opcode(op); + bool is_16 = false; /* TODO */ + unsigned ret_type = is_16 ? GFX8_SAMPLER_RETURN_FORMAT_16BITS : + GFX8_SAMPLER_RETURN_FORMAT_32BITS; + + if (!surface_bindless && + jay_is_imm(surface) && + (jay_is_imm(sampler) || sampler_bindless)) { + desc = brw_sampler_desc(nj->devinfo, jay_as_uint(surface), sampler_imm, + msg_type, simd_mode, ret_type); + } else if (surface_bindless) { + /* Bindless surface */ + desc = brw_sampler_desc(nj->devinfo, GFX9_BTI_BINDLESS, sampler_imm, + msg_type, simd_mode, ret_type); + + /* For bindless samplers, the entire address is included in the message + * header so we can leave the portion in the message descriptor 0. + */ + if (!sampler_bindless && !jay_is_imm(sampler)) { + desc_src = jay_SHL_u32(b, sampler, 8); + } + + /* We assume that the driver provided the handle in the top 20 bits so + * we can use the surface handle directly as the extended descriptor. + */ + desc_ex_src = jay_alloc_def(b, J_ADDRESS, 1); + jay_MOV(b, desc_ex_src, surface); + } else { + /* Immediate portion of the descriptor */ + desc = brw_sampler_desc(nj->devinfo, 0, 0, msg_type, simd_mode, ret_type); + + if (sampler_bindless) { + desc_src = surface; + } else if (!sampler_bindless && jay_is_imm(sampler)) { + desc_src = jay_OR_u32(b, surface, jay_as_uint(sampler) << 8); + } else { + desc_src = jay_OR_u32(b, jay_SHL_u32(b, sampler, 8), surface); + } + + desc_src = jay_AND_u32(b, desc_src, 0xfff); + } + + if (n_sources > 2 || !jay_is_null(header)) { + for (unsigned i = 0; i < n_sources; ++i) { + payload[i] = + jay_src_as_strided(b, payload[i], 1, payload_uniform ? 
UGPR : GPR); + } + } + + enum jay_type src_type = jay_type(JAY_TYPE_U, payload_type_bit_size); + jay_SEND(b, .sfid = BRW_SFID_SAMPLER, .msg_desc = desc, .desc = desc_src, + .ex_desc = desc_ex_src, .header = header, .srcs = payload, + .nr_srcs = n_sources, .type = JAY_TYPE_U32, + .src_type = { src_type }, .dst = tmp, .uniform = payload_uniform, + .bindless = surface_bindless); + + /* If we sampled into a temporary, copy out to the final */ + if (residency) { + jay_MOV(b, jay_extract(dst, 1), jay_extract(tmp, null_mask_component)); + } else if (!jay_defs_equivalent(dst, tmp)) { + unsigned i = 0; + unsigned tmp_stride = dst.file == UGPR ? jay_ugpr_per_grf(b->shader) : 1; + + u_foreach_bit(c, component_mask) { + jay_MOV(b, jay_extract(dst, c), jay_extract(tmp, (i++) * tmp_stride)); + } + } + + if (mesa_shader_stage_is_compute(b->shader->stage)) { + b->shader->prog_data->cs.uses_sampler |= !nir_tex_instr_is_query(tex); + } +} + +static void +jay_emit_jump(struct nir_to_jay_state *nj, nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: + jay_block_add_successor(nj->current_block, nj->break_block); + jay_BREAK(&nj->bld); + break; + case nir_jump_halt: + // TODO: Do we want a predicated EOT here, or a jump to the end? + assert(!"TODO: implement HALT"); + break; + case nir_jump_return: + /* Should be lowered */ + default: + UNREACHABLE("unknown jump"); + } +} + +static void +jay_emit_instr(struct nir_to_jay_state *nj, jay_block *block, nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_alu: + jay_emit_alu(nj, nir_instr_as_alu(instr)); + break; + + case nir_instr_type_intrinsic: + jay_emit_intrinsic(nj, nir_instr_as_intrinsic(instr)); + break; + + case nir_instr_type_tex: + jay_emit_texture(nj, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_load_const: + jay_emit_load_const(nj, nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_phi: + case nir_instr_type_undef: { + jay_def def = nj_def(nir_instr_def(instr)); + + jay_foreach_comp(def, c) { + if (instr->type == nir_instr_type_phi) { + jay_PHI_DST(&nj->bld, jay_extract(def, c)); + } else { + jay_INDETERMINATE(&nj->bld, jay_extract(def, c)); + } + } + + break; + } + + case nir_instr_type_jump: + jay_emit_jump(nj, nir_instr_as_jump(instr)); + break; + + case nir_instr_type_deref: + UNREACHABLE("All derefs should've been lowered"); + + default: + UNREACHABLE("unknown instruction type"); + } +} + +static jay_block * +jay_create_block(struct nir_to_jay_state *nj) +{ + jay_block *block = jay_new_block(nj->f); + block->indent = nj->indent; + return block; +} + +static jay_inst * +jay_block_ending_unconditional_jump(jay_block *block) +{ + jay_inst *jump = jay_block_ending_jump(block); + return jump && !jump->predication ? 
jump : NULL; +} + +static void +jay_emit_if(struct nir_to_jay_state *nj, nir_if *nif) +{ + jay_builder *b = &nj->bld; + jay_def condition = nj_src(nif->condition); + + jay_block *before_block = nj->current_block; + jay_block *after_block = jay_create_block(nj); + + /* Push */ + ++nj->indent; + + jay_block *else_first = jay_create_block(nj); + + jay_block *then_first = jay_emit_cf_list(nj, &nif->then_list); + jay_block *then_last = nj->current_block; + + nj->after_block = else_first; + + jay_block *else_first_2 = jay_emit_cf_list(nj, &nif->else_list); + jay_block *else_last = nj->current_block; + assert(else_first == else_first_2); + + /* Pop */ + --nj->indent; + + jay_block_add_successor(before_block, then_first); + jay_block_add_successor(before_block, else_first); + + if (!jay_block_ending_unconditional_jump(then_last)) + jay_block_add_successor(then_last, after_block); + + if (!jay_block_ending_unconditional_jump(else_last)) + jay_block_add_successor(else_last, after_block); + + nj->after_block = after_block; + + /* Emit the if-else-endif sequence */ + b->cursor = jay_after_block(before_block); + jay_add_predicate(b, jay_IF(b), condition); + + b->cursor = jay_before_block(else_first); + jay_ELSE(b); + + b->cursor = jay_after_block(else_last); + jay_ENDIF(b); +} + +static void +jay_emit_loop(struct nir_to_jay_state *nj, nir_loop *nloop) +{ + assert(!nir_loop_has_continue_construct(nloop)); + + jay_builder *b = &nj->bld; + jay_block *saved_break = nj->break_block; + + /* Make the block that will be after the loop exit */ + nj->break_block = jay_create_block(nj); + ++nj->indent; + + /* Make a block for the loop body, which is also the loop header */ + jay_block *loop_header = jay_create_block(nj); + loop_header->loop_header = true; + + /* The current block falls through to the start of the loop */ + jay_block_add_successor(nj->current_block, loop_header); + + /* Emit the loop body */ + nj->after_block = loop_header; + jay_emit_cf_list(nj, &nloop->body); + + /* Emit the backedge */ + jay_inst *jump = jay_block_ending_jump(nj->current_block); + if (jump && jump->op == JAY_OPCODE_BREAK) { + jump->op = JAY_OPCODE_LOOP_ONCE; + } else { + jay_block_add_successor(nj->current_block, loop_header); + jay_WHILE(b); + } + + /* Pop */ + --nj->indent; + nj->after_block = nj->break_block; + nj->break_block = saved_break; + + b->cursor = jay_after_block(nj->after_block); +} + +static jay_block * +jay_emit_block(struct nir_to_jay_state *nj, nir_block *nb) +{ + jay_builder *b = &nj->bld; + + if (nj->after_block) { + nj->current_block = nj->after_block; + nj->after_block = NULL; + } else { + nj->current_block = jay_create_block(nj); + } + + jay_block *block = nj->current_block; + block->uniform = !nb->divergent; + list_addtail(&block->link, &nj->f->blocks); + + b->cursor = jay_after_block(block); + + /* Emit the contents of the block */ + nir_foreach_instr(instr, nb) { + jay_emit_instr(nj, block, instr); + } + + /* Look in the current NIR block's successors for any phis. Each of them + * should have a source corresponding to a value coming from our current + * block. Create PHI_SRC opcodes in the current block for those values. + * The corresponding PHI_DST may not have been emitted yet, but that's ok. 
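+    *
+    * E.g. the then/else blocks of an if emit PHI_SRCs for the merge block's
+    * phis before the merge block (and its PHI_DSTs) is emitted; the two are
+    * matched later through the NIR def index (nphi->def.index + c).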
+    */
+   for (unsigned bs = 0; bs < ARRAY_SIZE(nb->successors); ++bs) {
+      nir_block *nb_successor = nb->successors[bs];
+      if (!nb_successor)
+         continue;
+
+      nir_foreach_phi(nphi, nb_successor) {
+         jay_def val = nj_src(nir_phi_get_src_from_block(nphi, nb)->src);
+
+         /* The phi def might be nonuniform but have a uniform source (like a
+          * constant). Move to the correct file in the source block and
+          * reference that in PHI_SRC.
+          */
+         if (jay_file_for_def(&nphi->def) != val.file) {
+            b->cursor = jay_after_block_logical(block);
+            jay_def tmp = val;
+            val = jay_alloc_def(b, jay_file_for_def(&nphi->def),
+                                jay_num_values(val));
+            jay_copy(b, val, tmp);
+         }
+
+         jay_foreach_comp(val, c) {
+            b->cursor = jay_before_jump(block);
+            jay_PHI_SRC(b, JAY_TYPE_U32, jay_extract(val, c),
+                        nphi->def.index + c);
+         }
+      }
+   }
+
+   b->cursor = jay_after_block(block);
+   nj->active_lane_mask = jay_null();
+   nj->active_lane = jay_null();
+   nj->active_lane_x4 = jay_null();
+
+   return block;
+}
+
+static jay_block *
+jay_emit_cf_list(struct nir_to_jay_state *nj, struct exec_list *list)
+{
+   jay_block *start_block = NULL;
+
+   foreach_list_typed(nir_cf_node, node, node, list) {
+      switch (node->type) {
+      case nir_cf_node_block: {
+         jay_block *block = jay_emit_block(nj, nir_cf_node_as_block(node));
+
+         if (!start_block)
+            start_block = block;
+         break;
+      }
+
+      case nir_cf_node_if:
+         jay_emit_if(nj, nir_cf_node_as_if(node));
+         break;
+
+      case nir_cf_node_loop:
+         jay_emit_loop(nj, nir_cf_node_as_loop(node));
+         break;
+
+      default:
+         UNREACHABLE("Unknown NIR control flow node");
+      }
+   }
+
+   return start_block;
+}
+
+static void
+jay_emit_eot(struct nir_to_jay_state *nj)
+{
+   jay_builder *b = &nj->bld;
+
+   if (mesa_shader_stage_is_compute(nj->nir->info.stage)) {
+      /* Vectorized copy into the EOT register. Not necessary for correctness
+       * but keeps RA from inserting 16 scalar copies instead.
+       */
+      jay_def copy = jay_alloc_def(b, UGPR, jay_ugpr_per_grf(b->shader));
+      jay_MOV(b, copy, nj->payload.u0);
+
+      jay_SEND(b, .sfid = BRW_SFID_MESSAGE_GATEWAY, .eot = true, .msg_desc = 0,
+               .srcs = &copy, .nr_srcs = 1, .type = JAY_TYPE_U32,
+               .uniform = true);
+   } else if (nj->nir->info.stage == MESA_SHADER_VERTEX) {
+      jay_block *block = jay_last_block(nj->f);
+      jay_inst *I = jay_last_inst(block);
+
+      /* TODO: What if this isn't the case? Do we need a no-op store...? */
+      assert(I && I->op == JAY_OPCODE_SEND && jay_send_sfid(I) == BRW_SFID_URB);
+      jay_set_send_eot(I, true);
+   }
+}
+
+static void
+set_cr0(jay_function *f, jay_cursor cursor, uint32_t *cr0, uint32_t desired)
+{
+   /* Only touch cr0 if we are changing bits */
+   if ((*cr0) != desired) {
+      jay_builder b = jay_init_builder(f, cursor);
+      jay_XOR(&b, JAY_TYPE_U32, jay_control(), jay_control(), (*cr0) ^ desired);
+      *cr0 = desired;
+   }
+}
+
+static void
+jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes)
+{
+   /* First, work out the global float control mode for the shader */
+   uint32_t global = 0x0;
+
+   /* Initially fp16 denorms are flushed to zero, handle preserve. */
+   if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) && (float_sizes & 16)) {
+      global |= BRW_CR0_FP16_DENORM_PRESERVE;
+   }
+
+   /* Initially fp32 denorms are flushed to zero, handle preserve.
+    *
+    * TODO: Optimize this, we have a dispatch bit.
+    */
+   if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) && (float_sizes & 32)) {
+      global |= BRW_CR0_FP32_DENORM_PRESERVE;
+   }
+
+   /* Initially fp64 denorms are flushed to zero, handle preserve.
+    */
+   if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) && (float_sizes & 64)) {
+      global |= BRW_CR0_FP64_DENORM_PRESERVE;
+   }
+
+   /* By default, we are in round-to-even mode. Note we do not permit setting
+    * the round mode separately by bit size, but this is ok for current APIs.
+    * The Vulkan driver sets roundingModeIndependence = NONE.
+    *
+    * TODO: Optimize this, there is a command buffer bit for it.
+    */
+   if (((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) && (float_sizes & 16)) ||
+       ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) && (float_sizes & 32)) ||
+       ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) && (float_sizes & 64))) {
+      global |= (BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT);
+   }
+
+   uint32_t cr0 = 0;
+   jay_function *entrypoint = jay_shader_get_entrypoint(shader);
+   set_cr0(entrypoint, jay_before_function(entrypoint), &cr0, global);
+
+   /* Now handle per-instruction deltas to the global mode */
+   jay_foreach_function(shader, func) {
+      jay_foreach_block(func, block) {
+         uint32_t current = cr0;
+
+         jay_foreach_inst_in_block(block, I) {
+            uint32_t required = cr0;
+            enum jay_rounding_mode round =
+               (I->op == JAY_OPCODE_CVT) ? jay_cvt_rounding_mode(I) : JAY_ROUND;
+
+            if (round != JAY_ROUND) {
+               required &= ~BRW_CR0_RND_MODE_MASK;
+               required |= ((round - JAY_RNE) << BRW_CR0_RND_MODE_SHIFT);
+            }
+
+            if (jay_type_is_any_float(I->type)) {
+               set_cr0(func, jay_before_inst(I), &current, required);
+            }
+         }
+
+         /* Restore to global state on block boundaries */
+         if (jay_num_successors(block) > 0) {
+            set_cr0(func, jay_after_block(block), &current, cr0);
+         }
+      }
+   }
+}
+
+struct payload_builder {
+   jay_builder *b;
+   unsigned offsets[JAY_NUM_SSA_FILES];
+   jay_def vecs[JAY_NUM_SSA_FILES];
+};
+
+static jay_def
+read_payload(struct payload_builder *b, enum jay_file file)
+{
+   unsigned granularity = file == UGPR ? 16 : 1;
+   unsigned channel = b->offsets[file] % granularity;
+
+   if (channel == 0) {
+      b->vecs[file] = jay_alloc_def(b->b, file, granularity);
+      jay_PRELOAD(b->b, b->vecs[file], b->offsets[file]);
+   }
+
+   b->offsets[file]++;
+   return jay_extract(b->vecs[file], channel);
+}
+
+static jay_def
+read_vector_payload(struct payload_builder *b, enum jay_file file, unsigned len)
+{
+   jay_def defs[JAY_MAX_DEF_LENGTH];
+   assert(len < ARRAY_SIZE(defs));
+
+   for (unsigned i = 0; i < len; ++i) {
+      defs[i] = read_payload(b, file);
+   }
+
+   return jay_collect_vectors(b->b, defs, len);
+}
+
+static void
+setup_payload_push(struct nir_to_jay_state *nj, struct payload_builder *p)
+{
+   unsigned push_size_B = 0;
+   for (int i = 0; i < ARRAY_SIZE(nj->s->prog_data->base.push_sizes); i++) {
+      push_size_B += nj->s->prog_data->base.push_sizes[i];
+   }
+
+   assert(util_is_aligned(push_size_B, 32));
+   for (unsigned i = 0; i < (push_size_B / 4); ++i) {
+      nj->payload.push_data[i] = read_payload(p, UGPR);
+   }
+
+   nj->s->push_grfs = push_size_B / (4 * jay_ugpr_per_grf(nj->s));
+}
+
+static void
+setup_vertex_payload(struct nir_to_jay_state *nj, struct payload_builder *p)
+{
+   nj->payload.urb_handle = read_payload(p, GPR);
+
+   /* XXX: This is a hack to line up with the partition chosen in RA. This
+    * whole thing needs an overhaul. Need to think harder about partitioning.
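+    * (The URB handle read above plus the 7 GPR slots skipped below put the
+    * vertex attributes at payload offset 8, i.e. one 8-GRF chunk in.)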
+ */ + p->offsets[GPR] += 7; + + for (unsigned i = 0; i < (8 * nj->s->prog_data->vue.urb_read_length); ++i) { + assert(i < ARRAY_SIZE(nj->payload.vs.attributes)); + nj->payload.vs.attributes[i] = read_payload(p, GPR); + } + + setup_payload_push(nj, p); +} + +static void +setup_compute_payload(struct nir_to_jay_state *nj, struct payload_builder *p) +{ + assert(!nj->s->prog_data->cs.generate_local_id); + assert(!nj->s->prog_data->cs.uses_btd_stack_ids); + + nj->payload.inline_data = + read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s)); +} + +static inline enum intel_barycentric_mode +brw_barycentric_mode(const struct brw_fs_prog_key *key, + nir_intrinsic_instr *intr) +{ + const enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intr); + + /* Barycentric modes don't make sense for flat inputs. */ + assert(mode != INTERP_MODE_FLAT); + + unsigned bary; + switch (intr->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_at_offset: + /* When per sample interpolation is dynamic, assume sample interpolation. + * We'll dynamically remap things so that the FS payload is not affected. + */ + bary = key->persample_interp == INTEL_SOMETIMES ? + INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE : + INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL; + break; + case nir_intrinsic_load_barycentric_centroid: + bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID; + break; + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE; + break; + default: + UNREACHABLE("invalid intrinsic"); + } + + if (mode == INTERP_MODE_NOPERSPECTIVE) + bary += 3; + + return (enum intel_barycentric_mode) bary; +} + +struct fs_info_ctx { + const struct brw_fs_prog_key *key; + struct brw_fs_prog_data *prog_data; + const struct intel_device_info *devinfo; +}; + +static bool +gather_fs_info(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + struct fs_info_ctx *ctx = data; + struct brw_fs_prog_data *prog_data = ctx->prog_data; + + switch (intr->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + prog_data->barycentric_interp_modes |= + 1 << brw_barycentric_mode(ctx->key, intr); + break; + + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: { + unsigned mode = brw_barycentric_mode(ctx->key, intr); + prog_data->barycentric_interp_modes |= 1 << mode; + prog_data->uses_sample_offsets |= + mode == INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE || + mode == INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE; + + if ((1 << mode) & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) + prog_data->uses_npc_bary_coefficients = true; + else + prog_data->uses_pc_bary_coefficients = true; + break; + } + + case nir_intrinsic_load_frag_coord_z: + prog_data->uses_src_depth = true; + break; + + case nir_intrinsic_load_frag_coord_w_rcp: + prog_data->uses_src_w = true; + break; + + case nir_intrinsic_load_sample_mask_in: + /* TODO: Sample masks are broken and discards are broken and simd32 + * layouts are broken too. XXX. 
+ */ + // prog_data->uses_sample_mask = true; + break; + + case nir_intrinsic_load_pixel_coord_intel: + BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); + break; + + default: + break; + } + + return false; +} + +static void +brw_compute_flat_inputs(struct brw_fs_prog_data *prog_data, + const nir_shader *shader) +{ + prog_data->flat_inputs = 0; + + nir_foreach_shader_in_variable(var, shader) { + if (var->data.interpolation != INTERP_MODE_FLAT || + var->data.per_primitive) + continue; + + unsigned slots = glsl_count_attribute_slots(var->type, false); + for (unsigned s = 0; s < slots; s++) { + int input_index = prog_data->urb_setup[var->data.location + s]; + + if (input_index >= 0) + prog_data->flat_inputs |= 1 << input_index; + } + } +} + +static uint8_t +computed_depth_mode(const nir_shader *shader) +{ + if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + switch (shader->info.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_NONE: + case FRAG_DEPTH_LAYOUT_ANY: + return BRW_PSCDEPTH_ON; + case FRAG_DEPTH_LAYOUT_GREATER: + return BRW_PSCDEPTH_ON_GE; + case FRAG_DEPTH_LAYOUT_LESS: + return BRW_PSCDEPTH_ON_LE; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + /* We initially set this to OFF, but having the shader write the + * depth means we allocate register space in the SEND message. The + * difference between the SEND register count and the OFF state + * programming makes the HW hang. + * + * Removing the depth writes also leads to test failures. So use + * LesserThanOrEqual, which fits writing the same value + * (unchanged/equal). + * + */ + return BRW_PSCDEPTH_ON_LE; + } + } + return BRW_PSCDEPTH_OFF; +} + +/* + * Build up an array of indices into the urb_setup array that + * references the active entries of the urb_setup array. + * Used to accelerate walking the active entries of the urb_setup array + * on each upload. + */ +static void +brw_compute_urb_setup_index(struct brw_fs_prog_data *fs_prog_data) +{ + /* TODO(mesh): Review usage of this in the context of Mesh, we may want to + * skip per-primitive attributes here. + */ + + /* Make sure uint8_t is sufficient */ + static_assert(VARYING_SLOT_MAX <= 0xff); + uint8_t index = 0; + for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) { + if (fs_prog_data->urb_setup[attr] >= 0) { + fs_prog_data->urb_setup_attribs[index++] = attr; + } + } + fs_prog_data->urb_setup_attribs_count = index; +} + +static void +calculate_urb_setup(const struct intel_device_info *devinfo, + const struct brw_fs_prog_key *key, + struct brw_fs_prog_data *prog_data, + nir_shader *nir, + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) +{ + memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup)); + int urb_next = 0; /* in vec4s */ + + /* Figure out where the PrimitiveID lives, either in the per-vertex block + * or in the per-primitive block or both. + */ + const uint64_t per_vert_primitive_id = + key->mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID; + const uint64_t per_prim_primitive_id = + key->mesh_input == INTEL_NEVER ? 
0 : VARYING_BIT_PRIMITIVE_ID; + const uint64_t inputs_read = + nir->info.inputs_read & + (~nir->info.per_primitive_inputs | per_vert_primitive_id); + const uint64_t per_primitive_header_bits = + VARYING_BIT_PRIMITIVE_SHADING_RATE | + VARYING_BIT_LAYER | + VARYING_BIT_VIEWPORT | + VARYING_BIT_CULL_PRIMITIVE; + const uint64_t per_primitive_inputs = + nir->info.inputs_read & + (nir->info.per_primitive_inputs | per_prim_primitive_id) & + ~per_primitive_header_bits; + struct intel_vue_map vue_map; + uint32_t per_primitive_stride = 0, first_read_offset = UINT32_MAX; + + if (mue_map != NULL) { + memcpy(&vue_map, &mue_map->vue_map, sizeof(vue_map)); + memcpy(per_primitive_offsets, mue_map->per_primitive_offsets, + sizeof(mue_map->per_primitive_offsets)); + + if (!mue_map->wa_18019110168_active) { + u_foreach_bit64(location, per_primitive_inputs) { + assert(per_primitive_offsets[location] != -1); + + first_read_offset = + MIN2(first_read_offset, + (uint32_t) per_primitive_offsets[location]); + per_primitive_stride = + MAX2((uint32_t) per_primitive_offsets[location] + 16, + per_primitive_stride); + } + } else { + first_read_offset = per_primitive_stride = 0; + } + } else { + brw_compute_vue_map(devinfo, &vue_map, inputs_read, key->base.vue_layout, + 1 /* pos_slots, TODO */); + brw_compute_per_primitive_map(per_primitive_offsets, + &per_primitive_stride, &first_read_offset, + 0, nir, nir_var_shader_in, + per_primitive_inputs, + true /* separate_shader */); + } + + if (per_primitive_stride > first_read_offset) { + first_read_offset = ROUND_DOWN_TO(first_read_offset, 32); + + /* Remove the first few unused registers */ + for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) { + if (per_primitive_offsets[i] == -1) + continue; + per_primitive_offsets[i] -= first_read_offset; + } + + prog_data->num_per_primitive_inputs = + 2 * DIV_ROUND_UP(per_primitive_stride - first_read_offset, 32); + } else { + prog_data->num_per_primitive_inputs = 0; + } + + /* Now do the per-vertex stuff (what used to be legacy pipeline) */ + + /* If Mesh is involved, we cannot do any packing. Documentation doesn't say + * anything about this but 3DSTATE_SBE_SWIZ does not appear to work when + * using Mesh. + */ + if (util_bitcount64(inputs_read) <= 16 && key->mesh_input == INTEL_NEVER) { + /* When not in Mesh pipeline mode, the SF/SBE pipeline stage can do + * arbitrary rearrangement of the first 16 varying inputs, so we can put + * them wherever we want. Just put them in order. + * + * This is useful because it means that (a) inputs not used by the + * fragment shader won't take up valuable register space, and (b) we + * won't have to recompile the fragment shader if it gets paired with a + * different vertex (or geometry) shader. + */ + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (inputs_read & BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + } else { + /* We have enough input varyings that the SF/SBE pipeline stage can't + * arbitrarily rearrange them to suit our whim; we have to put them in + * an order that matches the output of the previous pipeline stage + * (geometry or vertex shader). 
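+       *
+       * A worked example with made-up numbers: if the first slot holding an
+       * FS-read varying is VUE slot 7, first_slot below rounds down to 6,
+       * and a varying living in slot 9 ends up with
+       * urb_setup[varying] = 9 - 6 = 3.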
+ */ + int first_slot = 0; + for (int i = 0; i < vue_map.num_slots; i++) { + int varying = vue_map.slot_to_varying[i]; + if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0) { + first_slot = ROUND_DOWN_TO(i, 2); + break; + } + } + + for (int slot = first_slot; slot < vue_map.num_slots; slot++) { + int varying = vue_map.slot_to_varying[slot]; + if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying))) { + prog_data->urb_setup[varying] = slot - first_slot; + } + } + urb_next = vue_map.num_slots - first_slot; + } + + prog_data->num_varying_inputs = urb_next; + prog_data->inputs = inputs_read; + prog_data->per_primitive_inputs = per_primitive_inputs; + + brw_compute_urb_setup_index(prog_data); +} + +static void +populate_fs_prog_data(nir_shader *shader, + const struct intel_device_info *devinfo, + const struct brw_fs_prog_key *key, + struct brw_fs_prog_data *prog_data, + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) +{ + struct fs_info_ctx ctx = { + .key = key, + .prog_data = prog_data, + .devinfo = devinfo, + }; + nir_shader_intrinsics_pass(shader, gather_fs_info, nir_metadata_all, &ctx); + + prog_data->uses_kill = shader->info.fs.uses_discard; + prog_data->uses_omask = + !key->ignore_sample_mask_out && + (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); + prog_data->max_polygons = 1; + prog_data->computed_depth_mode = computed_depth_mode(shader); + prog_data->computed_stencil = + shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); + + prog_data->sample_shading = shader->info.fs.uses_sample_shading; + prog_data->api_sample_shading = key->api_sample_shading; + prog_data->min_sample_shading = key->min_sample_shading; + + assert(key->multisample_fbo != INTEL_NEVER || + key->persample_interp == INTEL_NEVER); + + prog_data->persample_dispatch = key->persample_interp; + if (prog_data->sample_shading) + prog_data->persample_dispatch = INTEL_ALWAYS; + + /* We can only persample dispatch if we have a multisample FBO */ + prog_data->persample_dispatch = + MIN2(prog_data->persample_dispatch, key->multisample_fbo); + + /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If + * persample_dispatch & multisample_fbo are not dynamic, Anv should be able + * to definitively tell whether alpha_to_coverage is on or off. + */ + prog_data->alpha_to_coverage = key->alpha_to_coverage; + + assert(devinfo->verx10 >= 125 || key->mesh_input == INTEL_NEVER); + prog_data->mesh_input = key->mesh_input; + + assert(devinfo->verx10 >= 200 || key->provoking_vertex_last == INTEL_NEVER); + prog_data->provoking_vertex_last = key->provoking_vertex_last; + + /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: + * + * "MSDISPMODE_PERSAMPLE is required in order to select + * POSOFFSET_SAMPLE" + * + * So we can only really get sample positions if we are doing real + * per-sample dispatch. If we need gl_SamplePosition and we don't have + * persample dispatch, we hard-code it to 0.5. 
+ */
+   prog_data->uses_pos_offset =
+      prog_data->persample_dispatch != INTEL_NEVER &&
+      (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) ||
+       BITSET_TEST(shader->info.system_values_read,
+                   SYSTEM_VALUE_SAMPLE_POS_OR_CENTER));
+
+   prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests;
+   prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage;
+   prog_data->inner_coverage = shader->info.fs.inner_coverage;
+
+   /* From the BDW PRM documentation for 3DSTATE_WM:
+    *
+    *    "MSDISPMODE_PERSAMPLE is required in order to select Perspective
+    *     Sample or Non- perspective Sample barycentric coordinates."
+    *
+    * So clean up any potentially set sample barycentric mode when not in
+    * per-sample dispatch.
+    */
+   if (prog_data->persample_dispatch == INTEL_NEVER) {
+      prog_data->barycentric_interp_modes &=
+         ~BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE);
+   }
+
+   if (devinfo->ver >= 20) {
+      prog_data->vertex_attributes_bypass =
+         brw_needs_vertex_attributes_bypass(shader);
+   }
+
+   prog_data->uses_nonperspective_interp_modes =
+      (prog_data->barycentric_interp_modes &
+       INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) ||
+      prog_data->uses_npc_bary_coefficients;
+
+   /* The current VK_EXT_graphics_pipeline_library specification requires
+    * coarse to be specified at compile time. But per-sample interpolation
+    * can be dynamic. So we should never be in a situation where coarse &
+    * persample_interp are both respectively true & INTEL_ALWAYS.
+    *
+    * Coarse will be dynamically turned off when persample_interp is active.
+    */
+   assert(!key->coarse_pixel || key->persample_interp != INTEL_ALWAYS);
+
+   prog_data->coarse_pixel_dispatch =
+      intel_sometimes_invert(prog_data->persample_dispatch);
+   if (!key->coarse_pixel ||
+       /* DG2 should support this, but Wa_22012766191 says there are issues
+        * with CPS 1x1 + MSAA + FS writing to oMask.
+        */
+       (devinfo->verx10 < 200 &&
+        (prog_data->uses_omask || prog_data->uses_sample_mask)) ||
+       prog_data->sample_shading ||
+       (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) ||
+       prog_data->computed_stencil ||
+       devinfo->ver < 11) {
+      prog_data->coarse_pixel_dispatch = INTEL_NEVER;
+   }
+
+   /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater,
+    * Message Descriptor :
+    *
+    *    "Message Type. Specifies the type of message being sent when
+    *     pixel-rate evaluation is requested :
+    *
+    *     Format = U2
+    *       0: Per Message Offset (eval_snapped with immediate offset)
+    *       1: Sample Position Offset (eval_sindex)
+    *       2: Centroid Position Offset (eval_centroid)
+    *       3: Per Slot Offset (eval_snapped with register offset)
+    *
+    *     Message Type. Specifies the type of message being sent when
+    *     coarse-rate evaluation is requested :
+    *
+    *     Format = U2
+    *       0: Coarse to Pixel Mapping Message (internal message)
+    *       1: Reserved
+    *       2: Coarse Centroid Position (eval_centroid)
+    *       3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)"
+    *
+    * The Sample Position Offset is marked as reserved for coarse rate
+    * evaluation and leads to hangs if we try to use it. So disable coarse
+    * pixel shading if we have any intrinsic that will result in a pixel
+    * interpolater message at sample.
+    */
+   if (intel_nir_pulls_at_sample(shader))
+      prog_data->coarse_pixel_dispatch = INTEL_NEVER;
+
+   /* We choose to always enable VMask prior to XeHP, as it would cause
+    * us to lose out on the eliminate_find_live_channel() optimization.
+ */
+   prog_data->uses_vmask =
+      devinfo->verx10 < 125 ||
+      shader->info.fs.needs_coarse_quad_helper_invocations ||
+      shader->info.uses_wide_subgroup_intrinsics ||
+      prog_data->coarse_pixel_dispatch != INTEL_NEVER;
+
+   prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients;
+
+   if (prog_data->coarse_pixel_dispatch != INTEL_NEVER) {
+      prog_data->uses_depth_w_coefficients |= prog_data->uses_src_depth;
+      prog_data->uses_src_depth = false;
+   }
+
+   calculate_urb_setup(devinfo, key, prog_data, shader, mue_map,
+                       per_primitive_offsets);
+   brw_compute_flat_inputs(prog_data, shader);
+}
+
+static void
+populate_vs_prog_data(nir_shader *nir,
+                      const struct intel_device_info *devinfo,
+                      const struct brw_vs_prog_key *key,
+                      struct brw_vs_prog_data *prog_data,
+                      unsigned nr_packed_regs,
+                      bool debug)
+{
+   unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read);
+   BITSET_WORD *sysvals = nir->info.system_values_read;
+
+   /* gl_VertexID and gl_InstanceID are system values, but arrive via an
+    * incoming vertex attribute. So, add an extra slot.
+    */
+   if (BITSET_TEST(sysvals, SYSTEM_VALUE_FIRST_VERTEX) ||
+       BITSET_TEST(sysvals, SYSTEM_VALUE_BASE_INSTANCE) ||
+       BITSET_TEST(sysvals, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) ||
+       BITSET_TEST(sysvals, SYSTEM_VALUE_INSTANCE_ID)) {
+      nr_attribute_slots++;
+   }
+
+   /* gl_DrawID and IsIndexedDraw share their very own vec4 */
+   if (BITSET_TEST(sysvals, SYSTEM_VALUE_DRAW_ID) ||
+       BITSET_TEST(sysvals, SYSTEM_VALUE_IS_INDEXED_DRAW)) {
+      nr_attribute_slots++;
+   }
+
+   const struct {
+      bool *data;
+      gl_system_value val;
+   } bool_sysvals[] = {
+      { &prog_data->uses_is_indexed_draw, SYSTEM_VALUE_IS_INDEXED_DRAW },
+      { &prog_data->uses_firstvertex, SYSTEM_VALUE_FIRST_VERTEX },
+      { &prog_data->uses_baseinstance, SYSTEM_VALUE_BASE_INSTANCE },
+      { &prog_data->uses_vertexid, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE },
+      { &prog_data->uses_instanceid, SYSTEM_VALUE_INSTANCE_ID },
+      { &prog_data->uses_drawid, SYSTEM_VALUE_DRAW_ID },
+   };
+
+   for (unsigned i = 0; i < ARRAY_SIZE(bool_sysvals); ++i) {
+      *bool_sysvals[i].data = BITSET_TEST(sysvals, bool_sysvals[i].val);
+   }
+
+   unsigned nr_attribute_regs;
+   if (key->vf_component_packing) {
+      prog_data->base.urb_read_length = DIV_ROUND_UP(nr_packed_regs, 8);
+      nr_attribute_regs = nr_packed_regs;
+   } else {
+      prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2);
+      nr_attribute_regs = 4 * nr_attribute_slots;
+   }
+
+   /* Since vertex shaders reuse the same VUE entry for inputs and outputs
+    * (overwriting the original contents), we need to make sure the size is
+    * the larger of the two.
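+    *
+    * A worked example with made-up numbers: 10 attribute registers round up
+    * to DIV_ROUND_UP(10, 4) = 3 slots; with a 5-slot VUE map, vue_entries =
+    * MAX2(3, 5) = 5 and urb_entry_size = DIV_ROUND_UP(5, 4) = 2.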
+ */ + const unsigned vue_entries = MAX2(DIV_ROUND_UP(nr_attribute_regs, 4), + prog_data->base.vue_map.num_slots); + prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); + prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; + + if (unlikely(debug)) { + fprintf(stderr, "VS Output "); + brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX); + } +} + +static void +setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p) +{ + jay_fs_payload *fs = &nj->payload.fs; + + if (nj->s->dispatch_width == 32) { + nj->payload.u1 = read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s)); + } + + setup_payload_push(nj, p); + + u_foreach_bit(i, nj->s->prog_data->fs.barycentric_interp_modes) { + fs->bary[i] = read_vector_payload(p, GPR, 2); + } + + if (nj->s->prog_data->fs.uses_src_depth) { + fs->coord.z = read_payload(p, GPR); + } + + if (nj->s->prog_data->fs.uses_src_w) { + fs->coord.w = read_payload(p, GPR); + } + + unsigned nr_attribs = 16 * 4; /* TODO */ + for (unsigned i = 0; i < nr_attribs; ++i) { + jay_def comps[] = { read_payload(p, UGPR), read_payload(p, UGPR), + read_payload(p, UGPR) }; + + /* The .yz components are swizzled in the hardware compared to NIR. */ + SWAP(comps[1], comps[2]); + fs->deltas[i] = jay_collect_vectors(&nj->bld, comps, ARRAY_SIZE(comps)); + + /* Padding */ + if ((i % 5) == 4) { + read_payload(p, UGPR); + } + } + + /* XXX: I do not love this */ + if (BITSET_TEST(nj->nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) { + jay_def t = jay_alloc_def(&nj->bld, GPR, 1); + jay_def lo = jay_extract_range(nj->payload.u0, 10, 4); + jay_EXPAND_QUAD(&nj->bld, t, lo, payload_u1(nj, 10, 4)); + fs->coord.xy = jay_OFFSET_PACKED_PIXEL_COORDS_u32(&nj->bld, t); + } + + /* Due to complexities of the physical payload, the logical payload is split + * into even/odd halves. Fix up the offsets and insert copies. + */ + if (nj->s->dispatch_width == 32) { + jay_builder *b = &nj->bld; + jay_foreach_inst_in_block(nj->after_block, I) { + if (I->op == JAY_OPCODE_PRELOAD && I->dst.file == GPR) { + unsigned base = (jay_preload_reg(I) % 2) ? p->offsets[GPR] : 0; + jay_set_preload_reg(I, base + (jay_preload_reg(I) / 2)); + } + } + + b->cursor = jay_before_block(nj->after_block); + unsigned size = p->offsets[GPR]; + + /* Odd: copy both halves to contiguous pair after payload */ + for (unsigned i = 1; i < size; i += 2) { + jay_DESWIZZLE_16(b, size + size + i + 1, 2 + i); + jay_DESWIZZLE_16(b, size + size + i + 2, 2 + i + size); + } + + /* Even: leave the bottom half in place, copy top half. If size=1 (rare + * but possible), this would be a no-op move so skip it. 
+ */
+   if (size > 1) {
+      for (unsigned i = 0; i < size; i += 2) {
+         jay_inst *I = jay_DESWIZZLE_16(b, 2 + i + 1, 2 + size + i);
+
+         /* Stall in between to avoid a write-after-read hazard */
+         if (i == 0) {
+            I->dep = (struct tgl_swsb) { 1, TGL_PIPE_INT };
+         }
+      }
+   }
+}
+
+static void
+jay_setup_payload(struct nir_to_jay_state *nj)
+{
+   jay_shader *s = nj->s;
+   jay_builder *b = &nj->bld;
+   nj->after_block = jay_create_block(nj);
+   b->cursor = jay_after_block(nj->after_block);
+
+   struct payload_builder p = { .b = &nj->bld };
+   nj->payload.u0 = read_vector_payload(&p, UGPR, jay_ugpr_per_grf(s));
+   nj->payload.sampler_state_pointer = jay_extract(nj->payload.u0, 3);
+
+   switch (s->stage) {
+   case MESA_SHADER_VERTEX:
+      setup_vertex_payload(nj, &p);
+      break;
+   case MESA_SHADER_FRAGMENT:
+      setup_fragment_payload(nj, &p);
+      break;
+   case MESA_SHADER_COMPUTE:
+   case MESA_SHADER_KERNEL:
+      setup_compute_payload(nj, &p);
+      break;
+   default:
+      UNREACHABLE("unimplemented shader stages");
+   }
+
+   /* Lane ID calculations require (W) and therefore are calculated in
+    * uniform control flow to sidestep RA problems. The easy solution is
+    * calculating the lane ID in the first block.
+    *
+    * XXX: This doesn't work for multi-function. Reconsider.
+    */
+   nj->payload.lane_id = jay_LANE_ID_8_u16(b);
+
+   for (unsigned i = 8; i < s->dispatch_width; i *= 2) {
+      nj->payload.lane_id = jay_LANE_ID_EXPAND_u16(b, nj->payload.lane_id, i);
+   }
+}
+
+/*
+ * NIR sometimes contains unreachable blocks (e.g. due to infinite loops).
+ * These blocks have no predecessors, but do have successors and can
+ * contribute to phis. They are dead and violate the IR invariant:
+ *
+ *    Live-in sources are live-out in all predecessors.
+ *
+ * ...which RA (validation) depends on. The simplest solution is to delete
+ * these dead blocks. Fortunately, because they are unreachable, this does
+ * not have any ill effects. Notably, this cannot introduce critical edges.
+ *
+ * Deleting a block may cause a successor to become unreachable, so we use a
+ * fixed-point algorithm to converge.
+ */
+static void
+jay_remove_unreachable_blocks(jay_function *func)
+{
+   bool progress;
+   do {
+      progress = false;
+
+      jay_foreach_block(func, pred) {
+         if (pred != jay_first_block(func) &&
+             jay_num_predecessors(pred) == 0 &&
+             jay_num_successors(pred) > 0) {
+
+            jay_foreach_successor(pred, succ) {
+               util_dynarray_delete_unordered(&succ->predecessors, jay_block *,
+                                              pred);
+            }
+
+            pred->successors[0] = NULL;
+            pred->successors[1] = NULL;
+            progress = true;
+         }
+      }
+   } while (progress);
+}
+
+static void
+jay_from_nir_function(const struct intel_device_info *devinfo,
+                      nir_shader *nir,
+                      jay_shader *s,
+                      nir_function_impl *impl)
+{
+   jay_function *f = jay_new_function(s);
+   f->is_entrypoint = impl->function->is_entrypoint;
+
+   struct nir_to_jay_state nj = {
+      .s = s,
+      .f = f,
+      .nir = nir,
+      .devinfo = devinfo,
+      .bld = (jay_builder) { .shader = s, .func = f },
+   };
+
+   /* Jay indices match NIR indices. Therefore the first impl->ssa_alloc
+    * indices are reserved. Our own temporaries go after.
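+    *
+    * For instance (illustrative numbers only): with impl->ssa_alloc == 100,
+    * index 0 stays the null value, NIR defs own indices 1..99, and the first
+    * Jay temporary is allocated index 100.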
+ */
+   f->ssa_alloc = impl->ssa_alloc;
+
+   if (f->is_entrypoint) {
+      jay_setup_payload(&nj);
+   }
+
+   jay_emit_cf_list(&nj, &impl->body);
+   jay_emit_eot(&nj);
+   jay_remove_unreachable_blocks(f);
+}
+
+static void
+jay_gather_stats(const jay_shader *s, struct genisa_stats *stats)
+{
+   jay_foreach_inst_in_shader(s, f, I) {
+      stats->instrs += I->op != JAY_OPCODE_SYNC;
+      stats->loops += I->op == JAY_OPCODE_WHILE;
+      stats->sends += I->op == JAY_OPCODE_SEND;
+
+      /* XXX: Write a real cycle model */
+      stats->cycles++;
+
+      /* Calculate register usage */
+      if (I->dst.file == GPR)
+         stats->grf_registers =
+            MAX2(stats->grf_registers, I->dst.reg + jay_num_values(I->dst));
+   }
+
+   stats->spills = s->spills;
+   stats->fills = s->fills;
+   stats->sends -= (s->spills + s->fills);
+}
+
+/*
+ * NIR-to-Jay translation relies on a careful indexing of defs: every 32-bit
+ * word has its own index. Vectors/64-bit use contiguous indices. We therefore
+ * run a modified version of nir_index_ssa_defs right before translating
+ * NIR->Jay.
+ */
+static bool
+index_ssa_def_cb(nir_def *def, void *state)
+{
+   unsigned *index = (unsigned *) state;
+   def->index = *index;
+   *index += DIV_ROUND_UP(def->num_components * MAX2(def->bit_size, 32), 32);
+   return true;
+}
+
+static void
+nj_index_ssa_defs(nir_shader *nir)
+{
+   nir_foreach_function_impl(impl, nir) {
+      /* The zero index means null in Jay, so start SSA indices at 1 */
+      unsigned index = 1;
+
+      nir_foreach_block_unstructured(block, impl) {
+         nir_foreach_instr(instr, block)
+            nir_foreach_def(instr, index_ssa_def_cb, &index);
+      }
+
+      impl->ssa_alloc = index;
+   }
+}
+
+static bool
+lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_)
+{
+   if (intr->intrinsic != nir_intrinsic_load_helper_invocation)
+      return false;
+
+   /* TODO: Is this right for multisampling? */
+   b->cursor = nir_before_instr(&intr->instr);
+   nir_def *active =
+      nir_inot(b, nir_inverse_ballot(b, nir_load_sample_mask_in(b)));
+
+   nir_def_replace(&intr->def, active);
+   return true;
+}
+
+static bool
+lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
+{
+   if (intr->intrinsic != nir_intrinsic_load_frag_coord &&
+       intr->intrinsic != nir_intrinsic_load_pixel_coord)
+      return false;
+
+   b->cursor = nir_before_instr(&intr->instr);
+   nir_def *c = nir_unpack_32_2x16(b, nir_load_pixel_coord_intel(b));
+
+   if (intr->intrinsic == nir_intrinsic_load_frag_coord) {
+      c = nir_vec4(b, nir_u2f32(b, nir_channel(b, c, 0)),
+                   nir_u2f32(b, nir_channel(b, c, 1)),
+                   nir_load_frag_coord_z(b),
+                   nir_frcp(b, nir_load_frag_coord_w_rcp(b)));
+   }
+
+   nir_def_replace(&intr->def, c);
+   return true;
+}
+
+static bool
+jay_nir_lower_simd(nir_builder *b, nir_intrinsic_instr *intr, void *simd_)
+{
+   b->cursor = nir_after_instr(&intr->instr);
+   unsigned *simd_width = simd_;
+
+   /* mask & -mask isolates the lowest set bit in the mask.
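+    * Concretely: mask = 0b0110100 gives -mask = ...1001100, so the AND
+    * keeps only 0b0000100, the first active lane.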
*/
+   if (intr->intrinsic == nir_intrinsic_elect) {
+      nir_def *mask = nir_ballot(b, 1, *simd_width, nir_imm_true(b));
+      mask = nir_iand(b, mask, nir_ineg(b, mask));
+      nir_def_replace(&intr->def, nir_inverse_ballot(b, mask));
+      return true;
+   }
+
+   /* Ballots must match the SIMD size */
+   if (intr->intrinsic == nir_intrinsic_ballot ||
+       intr->intrinsic == nir_intrinsic_ballot_relaxed) {
+      unsigned old_bitsize = intr->def.bit_size;
+      intr->def.bit_size = *simd_width;
+      nir_def *u2uN = nir_u2uN(b, &intr->def, old_bitsize);
+      nir_def_rewrite_uses_after(&intr->def, u2uN);
+      return true;
+   }
+
+   /* Note: we don't treat read_invocation specially because there's little
+    * benefit, and doing so would require expensive uniformizing in some
+    * cases.
+    */
+   if (intr->intrinsic != nir_intrinsic_shuffle &&
+       intr->intrinsic != nir_intrinsic_read_invocation)
+      return false;
+
+   nir_def *data = intr->src[0].ssa;
+   assert(data->num_components == 1 && data->bit_size <= 32 && "scalarized");
+
+   nir_def *offset_B = nir_imul_imm(b, intr->src[1].ssa, 4);
+   nir_def_replace(&intr->def, nir_shuffle_intel(b, 1, data, offset_B));
+   return true;
+}
+
+struct frag_out_ctx {
+   nir_def *colour[8], *depth, *stencil, *sample_mask;
+};
+
+static bool
+collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_)
+{
+   struct frag_out_ctx *ctx = ctx_;
+   if (intr->intrinsic != nir_intrinsic_store_output)
+      return false;
+
+   unsigned wrmask = nir_intrinsic_write_mask(intr);
+   assert(nir_intrinsic_component(intr) == 0 && "component should be lowered");
+   assert(util_is_power_of_two_nonzero(wrmask + 1) &&
+          "complex writemasks should be lowered");
+
+   /* TODO: Optimize with write mask? */
+
+   gl_frag_result loc = nir_intrinsic_io_semantics(intr).location;
+   assert(!nir_intrinsic_io_semantics(intr).dual_source_blend_index && "todo");
+   nir_def **out;
+   if (loc == FRAG_RESULT_COLOR) {
+      out = &ctx->colour[0];
+   } else if (loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) {
+      out = &ctx->colour[loc - FRAG_RESULT_DATA0];
+   } else if (loc == FRAG_RESULT_DEPTH) {
+      out = &ctx->depth;
+   } else if (loc == FRAG_RESULT_STENCIL) {
+      UNREACHABLE("todo");
+      out = &ctx->stencil;
+   } else if (loc == FRAG_RESULT_SAMPLE_MASK) {
+      UNREACHABLE("todo");
+      out = &ctx->sample_mask;
+   } else {
+      UNREACHABLE("invalid location");
+   }
+
+   assert((*out) == NULL && "each location written exactly once");
+   *out = intr->src[0].ssa;
+
+   nir_instr_remove(&intr->instr);
+   return true;
+}
+
+static void
+append_payload(nir_builder *b,
+               nir_def **payload,
+               unsigned *len,
+               unsigned max_len,
+               nir_def *value)
+{
+   if (value != NULL) {
+      for (unsigned i = 0; i < value->num_components; ++i) {
+         payload[*len] = nir_channel(b, value, i);
+         (*len)++;
+         assert((*len) <= max_len);
+      }
+   }
+}
+
+static void
+insert_rt_store(nir_builder *b,
+                const struct intel_device_info *devinfo,
+                signed target,
+                bool last,
+                nir_def *colour,
+                nir_def *src0_alpha,
+                nir_def *depth,
+                nir_def *stencil,
+                nir_def *sample_mask,
+                unsigned dispatch_width)
+{
+   bool null_rt = target < 0;
+   target = MAX2(target, 0);
+
+   if (!colour) {
+      colour = nir_undef(b, 4, 32);
+   }
+
+   colour = nir_pad_vec4(b, colour);
+
+   if (null_rt) {
+      /* Even if we don't write a RT, we still need to write alpha for
+       * alpha-to-coverage and alpha testing. Optimize the other channels out.
+       */
+      colour = nir_vector_insert_imm(b, nir_undef(b, 4, 32),
+                                     nir_channel(b, colour, 3), 3);
+   }
+
+   /* TODO: Not sure I like this. We'll see what 2src looks like. */
+   unsigned op = dispatch_width == 32 ?
+ XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE : + BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; + uint64_t desc = + brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */); + + uint64_t ex_desc = 0; + if (devinfo->ver >= 20) { + ex_desc = target << 21 | + null_rt << 20 | + (src0_alpha ? (1 << 15) : 0) | + (stencil ? (1 << 14) : 0) | + (depth ? (1 << 13) : 0) | + (sample_mask ? (1 << 12) : 0); + } else if (devinfo->ver >= 11) { + /* Set the "Render Target Index" and "Src0 Alpha Present" fields + * in the extended message descriptor, in lieu of using a header. + */ + ex_desc = target << 12 | null_rt << 20 | (src0_alpha ? (1 << 15) : 0); + } + + /* Build the payload */ + nir_def *payload[8] = { NULL }; + unsigned len = 0; + append_payload(b, payload, &len, ARRAY_SIZE(payload), colour); + append_payload(b, payload, &len, ARRAY_SIZE(payload), depth); + /* TODO */ + + nir_def *disable = b->shader->info.fs.uses_discard ? + nir_is_helper_invocation(b, 1) : + nir_imm_false(b); + + nir_store_render_target_intel(b, nir_vec(b, payload, len), + nir_imm_ivec2(b, desc, ex_desc), disable, + .eot = last); +} + +static void +lower_fragment_outputs(nir_function_impl *impl, + const struct intel_device_info *devinfo, + unsigned nr_color_regions, + unsigned dispatch_width) +{ + struct frag_out_ctx ctx = { { NULL } }; + nir_function_intrinsics_pass(impl, collect_fragment_output, + nir_metadata_control_flow, &ctx); + nir_builder b_ = nir_builder_at(nir_after_impl(impl)); + nir_builder *b = &b_; + assert(nr_color_regions <= ARRAY_SIZE(ctx.colour)); + + signed first = -1; + for (unsigned i = 0; i < ARRAY_SIZE(ctx.colour); ++i) { + if (ctx.colour[i]) { + first = i; + break; + } + } + + /* Do the later render targets first */ + for (unsigned i = first + 1; i < nr_color_regions; ++i) { + if (ctx.colour[i]) { + insert_rt_store(b, devinfo, i, false, ctx.colour[i], NULL, NULL, NULL, + NULL, dispatch_width); + } + } + + /* Finally do render target zero attaching all the sideband things and + * setting the LastRT bit. This needs to exist even if nothing is written + * since it also signals end-of-thread. + */ + insert_rt_store(b, devinfo, first < nr_color_regions ? first : -1, true, + first >= 0 ? ctx.colour[first] : NULL, NULL, ctx.depth, + ctx.stencil, ctx.sample_mask, dispatch_width); +} + +struct jay_shader_bin * +jay_compile(const struct intel_device_info *devinfo, + void *mem_ctx, + nir_shader *nir, + union brw_any_prog_data *prog_data, + union brw_any_prog_key *key) +{ + jay_debug = debug_get_option_jay_debug(); + enum mesa_shader_stage stage = nir->info.stage; + bool debug = INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage)); + struct brw_compiler compiler = { .devinfo = devinfo }; + unsigned nr_packed_regs = 0; + + brw_pass_tracker pt_ = { + .nir = nir, + .key = &key->base, + .dispatch_width = 0, + .compiler = &compiler, + .archiver = NULL, //params->base.archiver, + }, *pt = &pt_; + + BRW_NIR_SNAPSHOT("first"); + + prog_data->base.ray_queries = nir->info.ray_queries; + prog_data->base.stage = stage; + // TODO: Make the driver do this? + // prog_data->base.source_hash = params->source_hash; + prog_data->base.total_shared = nir->info.shared_size; + + /* TODO: Real heuristic */ + bool do_simd32 = INTEL_SIMD(FS, 32); + do_simd32 &= stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_FRAGMENT; + unsigned simd_width = do_simd32 ? 
(nir->info.api_subgroup_size ?: 32) : 16; + + if (stage == MESA_SHADER_VERTEX) { + /* We only expect slot compaction to be disabled when using device + * generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS + * programming. This should always be enabled together with VF component + * packing to minimize the size of the payload. + */ + assert(!key->vs.no_vf_slot_compaction || key->vs.vf_component_packing); + + /* When using Primitive Replication for multiview, each view gets its own + * position slot. + */ + const uint32_t pos_slots = + (nir->info.per_view_outputs & VARYING_BIT_POS) ? + MAX2(1, util_bitcount(key->base.view_mask)) : + 1; + + /* Only position is allowed to be per-view */ + assert(!(nir->info.per_view_outputs & ~VARYING_BIT_POS)); + + brw_compute_vue_map(devinfo, &prog_data->vue.vue_map, + nir->info.outputs_written, key->base.vue_layout, + pos_slots); + + brw_nir_apply_key(pt, &key->base, simd_width); + + prog_data->vs.inputs_read = nir->info.inputs_read; + prog_data->vs.double_inputs_read = nir->info.vs.double_inputs; + prog_data->vs.no_vf_slot_compaction = key->vs.no_vf_slot_compaction; + + brw_nir_lower_vs_inputs(nir); + brw_nir_lower_vue_outputs(nir); + BRW_NIR_SNAPSHOT("after_lower_io"); + + memset(prog_data->vs.vf_component_packing, 0, + sizeof(prog_data->vs.vf_component_packing)); + if (key->vs.vf_component_packing) { + nr_packed_regs = brw_nir_pack_vs_input(nir, &prog_data->vs); + } + + /* Get constant offsets out of the way for proper clip/cull handling */ + BRW_NIR_PASS(nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); + BRW_NIR_PASS(nir_opt_constant_folding); + BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo, + &prog_data->vue.vue_map, 0, 0); + } else if (stage == MESA_SHADER_FRAGMENT) { + assert(key->fs.mesh_input == INTEL_NEVER && "todo"); + assert(!key->fs.force_dual_color_blend && "todo"); + brw_nir_apply_key(pt, &key->base, 32); + brw_nir_lower_fs_inputs(nir, devinfo, &key->fs); + brw_nir_lower_fs_outputs(nir); + NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL); + + if (!brw_can_coherent_fb_fetch(devinfo)) + NIR_PASS(_, nir, brw_nir_lower_fs_load_output, &key->fs); + + NIR_PASS(_, nir, nir_opt_frag_coord_to_pixel_coord); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_frag_coord, + nir_metadata_control_flow, NULL); + NIR_PASS(_, nir, nir_opt_barycentric, true); + + lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo, + key->fs.nr_color_regions, simd_width); + NIR_PASS(_, nir, nir_lower_helper_writes, true); + NIR_PASS(_, nir, nir_lower_is_helper_invocation); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_helper_invocation, + nir_metadata_control_flow, NULL); + + if (key->fs.alpha_to_coverage != INTEL_NEVER) { + /* Run constant fold optimization in order to get the correct source + * offset to determine render target 0 store instruction in + * emit_alpha_to_coverage pass. 
+ */
+      NIR_PASS(_, nir, nir_opt_constant_folding);
+      NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage);
+   }
+
+   // TODO
+   // NIR_PASS(_, nir, brw_nir_move_interpolation_to_top);
+
+   if (!brw_fs_prog_key_is_dynamic(&key->fs)) {
+      uint32_t f = 0;
+
+      if (key->fs.multisample_fbo == INTEL_ALWAYS)
+         f |= INTEL_FS_CONFIG_MULTISAMPLE_FBO;
+
+      if (key->fs.alpha_to_coverage == INTEL_ALWAYS)
+         f |= INTEL_FS_CONFIG_ALPHA_TO_COVERAGE;
+
+      if (key->fs.provoking_vertex_last == INTEL_ALWAYS)
+         f |= INTEL_FS_CONFIG_PROVOKING_VERTEX_LAST;
+
+      if (key->fs.persample_interp == INTEL_ALWAYS) {
+         f |= INTEL_FS_CONFIG_PERSAMPLE_DISPATCH |
+              INTEL_FS_CONFIG_PERSAMPLE_INTERP;
+      }
+
+      NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_config_intel,
+               f);
+   }
+   } else {
+      brw_nir_apply_key(pt, &key->base, simd_width);
+   }
+
+   brw_postprocess_nir_opts(pt);
+
+   NIR_PASS(_, nir, nir_shader_intrinsics_pass, jay_nir_lower_simd,
+            nir_metadata_control_flow, &simd_width);
+   NIR_PASS(_, nir, nir_opt_algebraic_late);
+   NIR_PASS(_, nir, intel_nir_opt_peephole_imul32x16);
+
+   /* Late postprocess while remaining in SSA */
+   /* Run fsign lowering again after the last time brw_nir_optimize is called.
+    * As is the case with conversion lowering (below), brw_nir_optimize can
+    * create additional fsign instructions.
+    */
+   NIR_PASS(_, nir, jay_nir_lower_fsign);
+   NIR_PASS(_, nir, jay_nir_lower_bool);
+   NIR_PASS(_, nir, nir_opt_cse);
+   NIR_PASS(_, nir, nir_opt_dce);
+   NIR_PASS(_, nir, jay_nir_opt_sel_zero);
+
+   /* Run nir_split_conversions only after the last time
+    * brw_nir_optimize is called. Various optimizations invoked there can
+    * rematerialize the conversions that the lowering pass eliminates.
+    */
+   const nir_split_conversions_options split_conv_opts = {
+      .callback = intel_nir_split_conversions_cb,
+   };
+   NIR_PASS(_, nir, nir_split_conversions, &split_conv_opts);
+
+   /* Do this only after the last opt_gcm. GCM will undo this lowering. */
+   if (stage == MESA_SHADER_FRAGMENT) {
+      NIR_PASS(_, nir, intel_nir_lower_non_uniform_barycentric_at_sample);
+   }
+
+   NIR_PASS(_, nir, nir_opt_constant_folding);
+   NIR_PASS(_, nir, nir_lower_load_const_to_scalar);
+   NIR_PASS(_, nir, nir_lower_all_phis_to_scalar);
+   NIR_PASS(_, nir, nir_opt_copy_prop);
+   NIR_PASS(_, nir, nir_opt_dce);
+
+   /* Run divergence analysis at the end */
+   nir_sweep(nir);
+   nj_index_ssa_defs(nir);
+   nir_divergence_analysis(nir);
+
+   if (debug) {
+      /* We can't use nir_print_shader since it reindexes SSA defs. */
+      fprintf(stdout, "NIR right before from_nir:\n\n");
+      nir_print_shader_annotated(nir, stdout, NULL);
+      fflush(stdout);
+   }
+
+   if (stage == MESA_SHADER_VERTEX) {
+      populate_vs_prog_data(nir, devinfo, &key->vs, &prog_data->vs,
+                            nr_packed_regs, debug);
+   } else if (stage == MESA_SHADER_FRAGMENT) {
+      int per_primitive_offsets[VARYING_SLOT_MAX];
+      memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets));
+
+      populate_fs_prog_data(nir, devinfo, &key->fs, &prog_data->fs,
+                            NULL /* TODO: mue_map */, per_primitive_offsets);
+   }
+
+   jay_shader *s = jay_new_shader(NULL, stage);
+   s->dispatch_width = simd_width;
+   s->scratch_size = align(nir->scratch_size, 4) * s->dispatch_width;
+   s->devinfo = devinfo;
+   s->prog_data = prog_data;
+
+   nir_foreach_function_impl(impl, nir) {
+      jay_from_nir_function(devinfo, nir, s, impl);
+   }
+
+   /* Re-number block indices to be sequential and match the NIR. This
+    * ensures block indices are ordered with respect to the control flow
+    * graph, which is a convenient IR invariant.
+ */ + jay_foreach_function(s, f) { + unsigned index = 0; + + jay_foreach_block(f, b) { + b->index = index++; + } + } + + jay_validate(s, "NIR->Jay translation"); + + if (!(jay_debug & JAY_DBG_NOOPT)) { + JAY_PASS(s, jay_opt_propagate_forwards); + JAY_PASS(s, jay_opt_propagate_backwards); + JAY_PASS(s, jay_opt_dead_code); + } + + if (debug) { + fprintf(stdout, "Jay shader:\n\n"); + jay_print(stdout, s); + } + + JAY_PASS(s, jay_assign_flags); + if (!(jay_debug & JAY_DBG_NOOPT)) { + JAY_PASS(s, jay_opt_dead_code); + } + + JAY_PASS(s, jay_lower_pre_ra); + JAY_PASS(s, jay_partition_grf); + JAY_PASS(s, jay_register_allocate); + JAY_PASS(s, jay_lower_post_ra); + JAY_PASS(s, jay_insert_fp_mode, nir->info.float_controls_execution_mode, + nir->info.bit_sizes_float); + + if (!(jay_debug & JAY_DBG_NOOPT)) { + JAY_PASS(s, jay_opt_control_flow); + } + + JAY_PASS(s, jay_lower_scoreboard); + + if (debug) { + fprintf(stdout, "Jay shader (post-RA):\n\n"); + jay_print(stdout, s); + } + + struct jay_shader_bin *bin = + jay_to_binary(s, nir->constant_data, nir->constant_data_size); + assert(bin->kernel); + ralloc_steal(mem_ctx, bin); + + jay_gather_stats(s, &bin->stats); + bin->stats.code_size = bin->size; + + if (INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage))) { + if (nir->info.label) { + printf("%s - ", nir->info.label); + } + + const char *shader_name = + ralloc_asprintf(s, "%s SIMD%u", _mesa_shader_stage_to_abbrev(stage), + s->dispatch_width); + genisa_stats_fprintf(stdout, shader_name, &bin->stats); + } + + bin->stats.workgroup_memory_size = nir->info.shared_size; + bin->stats.dispatch_width = simd_width; + + if (stage == MESA_SHADER_FRAGMENT) { + if (simd_width == 8) { + prog_data->fs.dispatch_8 = true; + } else if (simd_width == 16) { + prog_data->fs.dispatch_16 = true; + prog_data->fs.prog_offset_16 = 0; + } else if (simd_width == 32) { + prog_data->fs.dispatch_32 = true; + prog_data->fs.prog_offset_32 = 0; + } + + prog_data->fs.has_side_effects = nir->info.writes_memory; + } else if (mesa_shader_stage_is_compute(stage)) { + unsigned i = simd_width == 8 ? 0 : simd_width == 16 ? 1 : 2; + prog_data->cs.prog_offset[i] = 0; + prog_data->cs.prog_mask = BITFIELD_BIT(i); + prog_data->cs.uses_inline_push_addr = key->base.uses_inline_push_addr; + prog_data->cs.uses_inline_data |= key->base.uses_inline_push_addr; + prog_data->cs.prog_spilled = s->scratch_size > 0; /* XXX */ + } + + prog_data->base.program_size = bin->size; + + if (s->scratch_size > 0) { + /* We currently only support up to 2MB of scratch space. If we + * need to support more eventually, the documentation suggests + * that we could allocate a larger buffer, and partition it out + * ourselves. We'd just have to undo the hardware's address + * calculation by subtracting (FFTID * Per Thread Scratch Space) + * and then add FFTID * (Larger Per Thread Scratch Space). + * + * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline > + * Thread Group Tracking > Local Memory/Scratch Space. + */ + assert(s->scratch_size <= devinfo->max_scratch_size_per_thread && + "maximum scratch size"); + + /* Take the max of any previously compiled variant of the shader. In the + * case of bindless shaders with return parts, this will also take the + * max of all parts. 
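+       *
+       * A worked example with made-up sizes: a variant needing 5000 bytes of
+       * scratch rounds up to util_next_power_of_two(5000) = 8192; a later
+       * 3000-byte variant rounds to 4096 and the MAX2 keeps 8192.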
+ */ + prog_data->base.total_scratch = + MAX2(prog_data->base.total_scratch, + util_next_power_of_two(s->scratch_size)); + } + + if (stage == MESA_SHADER_VERTEX || + stage == MESA_SHADER_TESS_EVAL || + stage == MESA_SHADER_GEOMETRY || + stage == MESA_SHADER_MESH) { + + uint32_t clip_mask = BITFIELD_MASK(nir->info.clip_distance_array_size); + uint32_t cull_mask = BITFIELD_RANGE(nir->info.clip_distance_array_size, + nir->info.cull_distance_array_size); + + if (stage == MESA_SHADER_MESH) { + prog_data->mesh.clip_distance_mask = clip_mask; + prog_data->mesh.cull_distance_mask = cull_mask; + } else { + prog_data->vue.clip_distance_mask = clip_mask; + prog_data->vue.cull_distance_mask = cull_mask; + } + } + + /* Scratch is allocated in 1KiB increments. */ + prog_data->base.total_scratch = align(prog_data->base.total_scratch, 1024); + + ralloc_free(s); + return bin; +} diff --git a/src/intel/compiler/jay/jay_ir.h b/src/intel/compiler/jay/jay_ir.h new file mode 100644 index 00000000000..37d0b722319 --- /dev/null +++ b/src/intel/compiler/jay/jay_ir.h @@ -0,0 +1,1408 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "compiler/brw/brw_compiler.h" +#include "compiler/brw/brw_eu.h" +#include "compiler/brw/brw_eu_defines.h" +#include "compiler/shader_enums.h" +#include "util/bitset.h" +#include "util/list.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "util/sparse_bitset.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "jay_opcodes.h" + +/* TODO: switch to brw_conditional_mod */ +enum PACKED jay_conditional_mod { + JAY_CONDITIONAL_EQ = 1, /**< Equal to zero */ + JAY_CONDITIONAL_NE = 2, /**< Not equal to zero */ + JAY_CONDITIONAL_GT = 3, /**< Greater than zero */ + JAY_CONDITIONAL_LT = 5, /**< Less than zero */ + JAY_CONDITIONAL_GE = 4, /**< Greater than or equal to zero */ + JAY_CONDITIONAL_LE = 6, /**< Less than or equal to zero */ + JAY_CONDITIONAL_OV = 8, /**< Overflow has occurred */ + JAY_CONDITIONAL_NAN = 9, /**< Result is NaN */ +}; + +static inline enum jay_conditional_mod +jay_conditional_mod_swap_sources(enum jay_conditional_mod mod) +{ + /* clang-format off */ + switch (mod) { + case JAY_CONDITIONAL_GT: return JAY_CONDITIONAL_LT; + case JAY_CONDITIONAL_LT: return JAY_CONDITIONAL_GT; + case JAY_CONDITIONAL_GE: return JAY_CONDITIONAL_LE; + case JAY_CONDITIONAL_LE: return JAY_CONDITIONAL_GE; + default: return mod; + } + /* clang-format on */ +} + +enum PACKED jay_arf { + JAY_ARF_NULL = 0, + JAY_ARF_MASK = BRW_ARF_MASK, + JAY_ARF_CONTROL = BRW_ARF_CONTROL, + JAY_ARF_TIMESTAMP = BRW_ARF_TIMESTAMP, +}; + +enum PACKED jay_file { + /** Non-uniform general purpose registers: 32-bits per SIMT lane. */ + GPR, + + /** Uniform general purpose registers: 32-bit uniform values */ + UGPR, + + /** Memory registers representing spilled values: 32-bits per SIMT lane. 
*/
+   MEM,
+
+   /** Memory registers representing spilled values: 32-bit uniform values */
+   UMEM,
+
+   /** Non-uniform flags (predicates): 1-bit per SIMT lane */
+   FLAG,
+
+   /** Uniform flags (predicates): 1-bit uniform value */
+   UFLAG,
+
+   /** Address registers */
+   J_ADDRESS,
+
+   /* Non-SSA files below: */
+
+   /** Accumulators: 32-bits per SIMT lane */
+   ACCUM,
+
+   /** Uniform accumulators: 32-bit uniform value */
+   UACCUM,
+
+   /** Architecture registers: direct access scalar */
+   J_ARF,
+
+   /** Inputs within Jay unit tests */
+   TEST_FILE,
+
+   /* Immediate value */
+   J_IMM,
+
+   JAY_FILE_LAST = J_IMM,
+   JAY_NUM_SSA_FILES = J_ADDRESS + 1,
+
+   /* Set of files that the main RA (and not eg flag RA) allocates. */
+   JAY_NUM_RA_FILES = UMEM + 1,
+   JAY_NUM_GRF_FILES = UGPR + 1,
+};
+static_assert(JAY_FILE_LAST <= 0b1111, "must fit in 4 bits (see jay_def)");
+
+#define jay_foreach_ssa_file(file) \
+   for (enum jay_file file = 0; file < JAY_NUM_SSA_FILES; ++file)
+
+/* Value stuffed into the index field of instructions post-RA that are not
+ * null (0) but do not have an associated SSA index (as they are post-RA).
+ */
+#define JAY_SENTINEL (0xffffffffu)
+
+/* Maximum number of words in a jay_def */
+#define JAY_MAX_DEF_LENGTH (128)
+
+/* Maximum number of sources/destinations other than for phis */
+#define JAY_MAX_SRCS (16)
+#define JAY_MAX_DESTS (2)
+#define JAY_MAX_OPERANDS (JAY_MAX_SRCS + JAY_MAX_DESTS)
+#define JAY_MAX_FLAGS (8)
+#define JAY_MAX_SAMPLER_MESSAGE_SIZE (11)
+#define JAY_NUM_LAST_USE_BITS (32)
+#define JAY_NUM_PHYS_GRF (128)
+#define JAY_NUM_UGPR (1024)
+#define JAY_REG_BITS (17)
+
+/*
+ * A jay_def represents a contiguous array of registers or a 32-bit immediate.
+ * It is used for sources or (in restricted form) for destinations.
+ */
+typedef struct jay_def {
+   /* Mode-dependent payload.
+    *
+    *    File = J_IMM: Immediate.
+    *    Collect = false: Base SSA index.
+    *    Collect = true: Pointer to SSA indices.
+    *
+    * SSA indices must be unique even across register files, so that we can
+    * easily track them all in e.g. a bitfield without needing to have
+    * separate data structures for each file.
+    *
+    * Each index represents a single 32-bit (or 1-bit if a predicate) value in
+    * the specified register file. 64-bit or vec4 values use multiple indices.
+    *
+    * Index 0 is reserved as the null value.
+    */
+   uint32_t _payload;
+
+   /* After register allocation, the register assigned to this def.
+    *
+    * Also used for additional pointer bits for collect pre-RA, which is why
+    * this is as large as it is. Could be shrunk with more pointer compression.
+    */
+   unsigned reg:JAY_REG_BITS;
+
+   /* Post-RA only: access only the top half of the indexed 32-bit register */
+   bool hi:1;
+
+   /** The associated file (must be < JAY_NUM_SSA_FILES for SSA) */
+   enum jay_file file:4;
+
+   /* Represents either a negation or a bitwise inversion (depending on the
+    * instruction type).
+    */
+   bool negate:1;
+
+   /* Represents absolute value (on floating point sources) */
+   bool abs:1;
+
+   /* Number of values minus 1 */
+   unsigned num_values_m1:7;
+
+   /* If true, collects many discontiguous SSA indices into a single def.
+    * Requires file = GPR or file = UGPR. Cannot be used post-RA.
+    *
+    * Canonical form is required: the indices pointed to by the payload must
+    * NOT be contiguous. Also, the payload is not owned by the def: the def
+    * may be cheaply copied around, but mutating the payload requires
+    * copy-on-write and maintaining the canonical form.
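+    *
+    * An illustrative (hypothetical) pair of defs: gathering SSA words
+    * {7, 12, 3} needs collect = true with the payload pointing at that
+    * array, whereas {7, 8, 9} is contiguous and must instead be encoded
+    * with collect = false, _payload = 7 and num_values_m1 = 2.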
+ */
+   bool collect:1;
+} jay_def;
+static_assert(sizeof(jay_def) == 8, "packed");
+
+/*
+ * Construct a jay_def representing a bare register with no associated SSA
+ * index, for use post-RA only.
+ */
+static inline jay_def
+jay_bare_reg(enum jay_file file, uint16_t reg)
+{
+   return (jay_def) { ._payload = JAY_SENTINEL, .reg = reg, .file = file };
+}
+
+/*
+ * Set the register for a def (called by RA only). This drops the collect
+ * indices since we do not have space to encode both simultaneously.
+ */
+static inline void
+jay_set_reg(jay_def *d, unsigned r)
+{
+   if (d->collect) {
+      d->collect = false;
+      d->_payload = JAY_SENTINEL;
+   }
+
+   d->reg = r;
+}
+
+static inline uint32_t
+jay_base_index(jay_def d)
+{
+   assert(d.file != J_IMM && !d.collect);
+   return d._payload;
+}
+
+/**
+ * True if the value is null.
+ */
+static inline bool
+jay_is_null(jay_def d)
+{
+   return d._payload == 0 && d.file != J_IMM;
+}
+
+static inline bool
+jay_is_imm(jay_def d)
+{
+   return d.file == J_IMM;
+}
+
+/**
+ * True if the def is a 1-bit flag regardless of whether it is uniform.
+ */
+static inline bool
+jay_is_flag(jay_def d)
+{
+   return d.file == FLAG || d.file == UFLAG;
+}
+
+/**
+ * Return the number of SSA indices referenced by a jay_def.
+ */
+static inline unsigned
+jay_num_values(jay_def d)
+{
+   return jay_is_imm(d) || jay_is_null(d) ? 0 : (d.num_values_m1 + 1);
+}
+
+/**
+ * True if the def is an SSA def (and not, say, an arch register).
+ */
+static inline bool
+jay_is_ssa(jay_def d)
+{
+   return d.file < JAY_NUM_SSA_FILES;
+}
+
+#define jay_foreach_comp(def, c) \
+   for (unsigned c = 0; c < jay_num_values(def); ++c)
+
+#define jay_foreach_comp_rev(def, c) \
+   for (signed c = jay_num_values(def) - 1; c >= 0; --c)
+
+/*
+ * Alias for jay_base_index for use with scalar defs.
+ */
+static inline uint32_t
+jay_index(jay_def d)
+{
+   assert(jay_num_values(d) == 1);
+   return jay_base_index(d);
+}
+
+/**
+ * Return a reference to the array of indices of a collect source.
+ */
+static inline uint32_t *
+_jay_collect_indices(jay_def d)
+{
+   assert(d.collect);
+
+   /* reg has upper bits of the pointer */
+   uint64_t payload = (((uint64_t) d.reg) << 32) | d._payload;
+   return (uint32_t *) (uintptr_t) payload;
+}
+
+/**
+ * Return the n'th channel of an SSA def.
+ *
+ * Note: this is specifically read-only. To mutate, use jay_set_channel.
+ */
+static inline uint32_t
+jay_channel(jay_def d, unsigned c)
+{
+   assert(d.file != J_IMM);
+   assert(c <= d.num_values_m1);
+
+   if (likely(!d.collect)) {
+      return jay_base_index(d) + c;
+   } else {
+      return _jay_collect_indices(d)[c];
+   }
+}
+
+/**
+ * Build a contiguous jay_def.
+ */
+static inline jay_def
+jay_contiguous_def(enum jay_file file, uint32_t index, unsigned count)
+{
+   assert(count > 0 && count <= (1 << 7) && "max def width");
+
+   return (jay_def) {
+      ._payload = index,
+      .file = file,
+      .num_values_m1 = count - 1,
+   };
+}
+
+/*
+ * Replaces a source, preserving the negate/abs if present.
+ */
+static inline void
+jay_replace_src(jay_def *old, jay_def replacement)
+{
+   replacement.negate = old->negate;
+   replacement.abs = old->abs;
+   *old = replacement;
+}
+
+static inline jay_def
+jay_scalar(enum jay_file file, uint32_t index)
+{
+   return jay_contiguous_def(file, index, 1);
+}
+
+static inline jay_def
+jay_null(void)
+{
+   return jay_scalar(J_ARF, 0);
+}
+
+/**
+ * Return a contiguous subrange inside an SSA def.
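+ *
+ * For example, jay_extract_range(v, 2, 2) on a 4-value def yields a view of
+ * its third and fourth 32-bit words (the .zw pair of a vec4).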
+ */ +static inline jay_def +jay_extract_range(jay_def def, unsigned chan, unsigned count) +{ + assert(!jay_is_imm(def)); + assert((count == 1 || !def.collect) && "slicing collects unsupported"); + assert(chan + count <= jay_num_values(def)); + + uint32_t base = jay_channel(def, chan); + jay_replace_src(&def, jay_contiguous_def(def.file, base, count)); + return def; +} + +/** + * Return a scalar SSA def equal to a single channel from an SSA def. + */ +static inline jay_def +jay_extract(jay_def def, unsigned chan) +{ + return jay_extract_range(def, chan, 1); +} + +/** + * Like jay_extract but working on bare registers. This could be unified to + * preserve indices and such but meh. + */ +static inline jay_def +jay_extract_post_ra(jay_def def, unsigned chan) +{ + return jay_bare_reg(def.file, def.reg + chan); +} + +/** + * Construct an immediate source from a raw 32-bit data pattern. + */ +static inline jay_def +jay_imm(uint32_t imm) +{ + return (jay_def) { ._payload = imm, .file = J_IMM }; +} + +/** + * True if both jay_defs are equivalent up to source modifiers. + */ +static inline bool +jay_defs_equivalent(jay_def a, jay_def b) +{ + if (a.file != b.file || + a.num_values_m1 != b.num_values_m1 || + a.collect != b.collect) + return false; + + if (likely(!a.collect)) { + /* Contiguous or immediate */ + return a._payload == b._payload && a.reg == b.reg; + } else { + /* Collect. Component-wise compare. */ + return !memcmp(_jay_collect_indices(a), _jay_collect_indices(b), + sizeof(uint32_t) * jay_num_values(a)); + } +} + +/** + * True if both registers are equal (for use post-RA). + */ +static inline bool +jay_regs_equal(jay_def a, jay_def b) +{ + return a.file == b.file && + a.num_values_m1 == b.num_values_m1 && + a.reg == b.reg; +} + +/** + * Return a reference to the execution mask (mask0) architecture register. + */ +static inline jay_def +jay_exec_mask(void) +{ + return jay_scalar(J_ARF, JAY_ARF_MASK); +} + +/** + * Return a reference to the control (cr0) architecture register. + */ +static inline jay_def +jay_control(void) +{ + return jay_scalar(J_ARF, JAY_ARF_CONTROL); +} + +/** + * Construct an immediate from a floating point constant. + */ +static inline jay_def +jay_imm_f(float imm) +{ + return jay_imm(fui(imm)); +} + +/** + * Return the negation of a source. + */ +static inline jay_def +jay_negate(jay_def src) +{ + src.negate = !src.negate; + return src; +} + +/** + * Return the absolute value of a source. + */ +static inline jay_def +jay_abs(jay_def src) +{ + src.negate = false; + src.abs = true; + return src; +} + +/** + * Returns true if the given source reads the same value in all lanes. + */ +static inline bool +jay_is_uniform(jay_def d) +{ + return d.file == UGPR || + d.file == UFLAG || + d.file == UACCUM || + jay_is_imm(d); +} + +/** + * Returns true if the given definition represents a spilled variable. 
+ */
+static inline bool
+jay_is_mem(jay_def x)
+{
+   return x.file == MEM || x.file == UMEM;
+}
+
+static inline uint32_t
+jay_as_uint(jay_def src)
+{
+   assert(jay_is_imm(src));
+   return src._payload;
+}
+
+static inline bool
+jay_is_zero(jay_def src)
+{
+   return jay_is_imm(src) && jay_as_uint(src) == 0;
+}
+
+/* Chosen so that a sized type is the unsized type OR'd with the number of
+ * bits.
+ */
+#define JAY_TYPE_BASE_MASK (128 | 2 | 4)
+
+enum PACKED jay_type {
+   JAY_TYPE_UNTYPED = 0,
+   JAY_TYPE_U = 2,
+   JAY_TYPE_S = 4,
+   JAY_TYPE_F = 6,
+   JAY_TYPE_BF = 128,
+
+   /** Unsigned integers */
+   JAY_TYPE_U64 = JAY_TYPE_U | 64,
+   JAY_TYPE_U32 = JAY_TYPE_U | 32,
+   JAY_TYPE_U16 = JAY_TYPE_U | 16,
+   JAY_TYPE_U8 = JAY_TYPE_U | 8,
+   JAY_TYPE_U1 = JAY_TYPE_U | 1,
+
+   /** Signed integers */
+   JAY_TYPE_S64 = JAY_TYPE_S | 64,
+   JAY_TYPE_S32 = JAY_TYPE_S | 32,
+   JAY_TYPE_S16 = JAY_TYPE_S | 16,
+   JAY_TYPE_S8 = JAY_TYPE_S | 8,
+   JAY_TYPE_S1 = JAY_TYPE_S | 1,
+
+   /** IEEE floating point */
+   JAY_TYPE_F64 = JAY_TYPE_F | 64,
+   JAY_TYPE_F32 = JAY_TYPE_F | 32,
+   JAY_TYPE_F16 = JAY_TYPE_F | 16,
+
+   /** Other floating point variants */
+   JAY_TYPE_BF16 = JAY_TYPE_BF | 16,
+};
+static_assert(sizeof(enum jay_type) == 1);
+
+static inline enum jay_type
+jay_type(enum jay_type base, unsigned bits)
+{
+   /* Normalize booleans */
+   if (bits == 1) {
+      base = JAY_TYPE_U;
+   }
+
+   return (enum jay_type)(base | bits);
+}
+
+static inline enum jay_type
+jay_base_type(enum jay_type t)
+{
+   return (enum jay_type)(t & JAY_TYPE_BASE_MASK);
+}
+
+static inline unsigned
+jay_type_size_bits(enum jay_type t)
+{
+   return t & ~JAY_TYPE_BASE_MASK;
+}
+
+static inline enum jay_type
+jay_type_rebase(enum jay_type t, enum jay_type new_base)
+{
+   return jay_type(new_base, jay_type_size_bits(t));
+}
+
+static inline enum jay_type
+jay_type_resize(enum jay_type t, unsigned bits)
+{
+   return jay_type(jay_base_type(t), bits);
+}
+
+/**
+ * Returns the number of 32-bit values needed to hold a type t.
+ */
+static inline unsigned
+jay_type_vector_length(enum jay_type t)
+{
+   return jay_type_size_bits(t) == 64 ? 2 : 1;
+}
+
+static inline bool
+jay_type_is_any_float(enum jay_type t)
+{
+   return jay_base_type(t) == JAY_TYPE_F || jay_base_type(t) == JAY_TYPE_BF;
+}
+
+enum jay_predication : uint8_t {
+   /** No predication. */
+   JAY_NOT_PREDICATED = 0,
+
+   /**
+    * Predicated with no default value. Used post-RA and for instructions that
+    * do not write a destination.
+    */
+   JAY_PREDICATED = 1,
+
+   /** Predicated with 1 default value. Used pre-RA. */
+   JAY_PREDICATED_DEFAULT = 2,
+};
+
+/**
+ * Representation of a shader instruction in the Jay IR.
+ */
+typedef struct jay_inst {
+   struct list_head link;
+
+   /**
+    * Metadata calculated by liveness analysis: bit i is set if the i'th
+    * non-null SSA index read by the instruction is killed by that read.
+    */
+   BITSET_DECLARE(last_use, JAY_NUM_LAST_USE_BITS);
+
+   enum jay_opcode op;
+   enum jay_type type; /**< execution type of the instruction */
+
+   /** Software scoreboarding dependencies (for non-SYNC instructions) */
+   struct tgl_swsb dep;
+
+   /** Number of sources */
+   uint8_t num_srcs;
+
+   /**
+    * Indicates an instruction reading only uniform sources but writing a FLAG
+    * and no GPR/UGPR that expects the flag to replicate for all SIMD lanes.
+    * This is okay in our data model but cannot be inferred from the files, so
+    * we have a secondary bit to express this.
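+    *
+    * An illustrative (hypothetical) case: a compare of two UGPRs produces a
+    * single uniform result, yet a later predicated SIMD16 instruction needs
+    * that one bit replicated into every lane of the flag register.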
+ */ + bool broadcast_flag:1; + bool saturate :1; + + /** + * In a SIMD split instruction, whether the regdist dependency is replicated + * to each physical instruction. If false, only the first instruction waits. + * + * If decrement_dep is also set, the regdist is decremented by the macro + * length for each instruction (modelling cross-pipe dependencies). + */ + bool replicate_dep:1; + bool decrement_dep:1; + unsigned padding :12; + + enum jay_predication predication; + enum jay_conditional_mod conditional_mod; + + jay_def cond_flag; /**< conditional flag */ + jay_def dst; + + jay_def src[]; +} jay_inst; + +static_assert(sizeof(jay_inst) == 32 + (sizeof(uintptr_t) * 2), "packed"); + +/* + * Return the number of instruction set defined sources, ignoring implicit + * predication and accumulator sources. + */ +static inline unsigned +jay_num_isa_srcs(const jay_inst *I) +{ + return I->num_srcs - I->predication - (I->op == JAY_OPCODE_SEL); +} + +static inline bool +jay_uses_flag(const jay_inst *I) +{ + return I->predication || + !jay_is_null(I->cond_flag) || + I->op == JAY_OPCODE_SEL; +} + +static inline void +jay_remove_instruction(jay_inst *inst) +{ + list_del(&inst->link); +} + +static inline bool +jay_has_src_mods(jay_inst *I, unsigned s) +{ + return jay_opcode_infos[I->op].src_mods & BITFIELD_BIT(s); +} + +static inline bool +jay_inst_has_default(jay_inst *I) +{ + return I->predication >= JAY_PREDICATED_DEFAULT; +} + +static inline jay_def * +jay_inst_get_predicate(jay_inst *I) +{ + assert(I->predication); + return &I->src[I->num_srcs - I->predication]; +} + +static inline jay_def * +jay_inst_get_default(jay_inst *I) +{ + assert(jay_inst_has_default(I)); + return &I->src[I->num_srcs - 1]; +} + +/* Must be included late since it depends on jay_inst but the rest of this file + * depends on the inline functions it defines. + */ +#include "jay_extra_info.h" + +static inline enum jay_type +jay_src_type(const jay_inst *I, unsigned s) +{ + /* Predicates */ + if (s == (unsigned) (I->num_srcs - I->predication) || + (I->op == JAY_OPCODE_SEL && s == 2) || + (I->op == JAY_OPCODE_PHI_SRC && jay_is_flag(I->src[s]))) + return JAY_TYPE_U1; + + /* Conversions have an explicit source type, use that. */ + if (I->op == JAY_OPCODE_CVT) + return jay_cvt_src_type(I); + + /* 16-bit operand */ + if (I->op == JAY_OPCODE_MUL_32X16 && s == 1) + return jay_type_resize(I->type, jay_type_size_bits(I->type) / 2); + + if (I->op == JAY_OPCODE_SEND) { + if (s < 2) + return JAY_TYPE_U32; + else if (s < 4) + return s == 3 ? jay_send_type_1(I) : jay_send_type_0(I); + } + + if (I->op == JAY_OPCODE_CAST_CANONICAL_TO_FLAG) + return JAY_TYPE_U32; + + /* Shifts are always small even with 64-bit destinations */ + if ((I->op == JAY_OPCODE_SHL || + I->op == JAY_OPCODE_SHR || + I->op == JAY_OPCODE_ASR) && + s == 1) + return JAY_TYPE_U16; + + /* TODO: Do we want to allow zero-extension generally? */ + if (I->op == JAY_OPCODE_AND_U32_U16) + return JAY_TYPE_U16; + + /* Mixed-signedness integer dot product opcode */ + if (I->op == JAY_OPCODE_DP4A_SU && s == 2) + return JAY_TYPE_U32; + + /* Shuffle lane index distinct from data type */ + if (I->op == JAY_OPCODE_SHUFFLE && s == 1) + return JAY_TYPE_U32; + + /* Other instructions inherit the destination type. 
*/ + return I->type; +} + +enum jay_stride { + JAY_STRIDE_2 = 0, + JAY_STRIDE_4, + JAY_STRIDE_8, + JAY_NUM_STRIDES, +}; + +static inline unsigned +jay_stride_to_bits(enum jay_stride s) +{ + assert(s <= JAY_STRIDE_8); + return 16 << s; +} + +#define JAY_PARTITION_BLOCKS (3) + +struct jay_register_block { + uint16_t start, len; +}; + +struct jay_partition { + /** Consecutive ranges of GRFs in GPR/UGPRs. */ + struct jay_register_block blocks[JAY_NUM_GRF_FILES][JAY_PARTITION_BLOCKS]; + + /** Number of GPR/UGPRs per GRF, times 16. For example, 16 encodes SIMD16 + * 32-bit GPRs on Xe2 (1 GRF = 1 GPR). 256 encodes UGPRs (1 GRF = 16 UGPRs). + * 8 encodes SIMD32 32-bit GPRs on Xe2 (2 GRF = 1 GPR). + */ + unsigned units_x16[JAY_NUM_GRF_FILES]; + + /** Base GPR for each stride. The file is partitioned (4, 8, 2, 4=EOT). */ + unsigned base8, base2, base_eot; + + /** Region of the UGPR partition suitable for large UGPR vectors */ + struct jay_register_block large_ugpr_block; +}; + +static inline enum jay_stride +jay_gpr_to_stride(struct jay_partition *p, unsigned reg) +{ + return (reg < p->base8 || reg >= p->base_eot) ? JAY_STRIDE_4 : + reg >= p->base2 ? JAY_STRIDE_2 : + JAY_STRIDE_8; +} + +/** + * Representation of a shader in the Jay IR. + */ +typedef struct jay_shader { + mesa_shader_stage stage; + struct list_head functions; + const struct intel_device_info *devinfo; + union brw_any_prog_data *prog_data; + unsigned spills, fills; + unsigned scratch_size; + unsigned push_grfs; + + /** + * Ralloc linear context. Since we don't typically free as we go, + * most allocations should go through this context for efficiency. + */ + struct linear_ctx *lin_ctx; + + /* Dispatch width of the current compile: 8, 16, or 32. */ + unsigned dispatch_width; + + /** + * Number of GPR/UGPRs used across all functions in the shader. This is the + * limit that must be allocated for the shader. + */ + unsigned num_regs[JAY_NUM_RA_FILES]; + + /** + * Register file partition chosen for the whole shader. + */ + struct jay_partition partition; + + /** Current compilation phase (for printing & validation) */ + bool post_ra; +} jay_shader; + +static inline jay_shader * +jay_new_shader(void *memctx, mesa_shader_stage stage) +{ + jay_shader *s = rzalloc(NULL, jay_shader); + s->stage = stage; + s->lin_ctx = linear_context(s); + list_inithead(&s->functions); + return s; +} + +static inline unsigned +jay_ugpr_per_grf(jay_shader *s) +{ + unsigned B_per_unit = 32 /* see reg_unit */; + unsigned B_per_ugpr = 4; + + return reg_unit(s->devinfo) * (B_per_unit / B_per_ugpr); +} + +static inline unsigned +jay_grf_per_gpr(jay_shader *s) +{ + assert(reg_unit(s->devinfo) == 1 || reg_unit(s->devinfo) == 2); + return reg_unit(s->devinfo) == 2 ? (s->dispatch_width / 16) : + (s->dispatch_width / 8); +} + +static inline unsigned +jay_phys_flag_per_virt(jay_shader *s) +{ + /* TODO: Check if this holds on older platforms */ + return jay_grf_per_gpr(s); +} + +/* + * Returns whether an instruction will lower to a SEND post-RA: either a SEND or + * a spill/fill that has not yet been lowered. + */ +static inline bool +jay_is_send_like(const jay_inst *I) +{ + if (I->op == JAY_OPCODE_MOV) + return jay_is_mem(I->dst) || jay_is_mem(I->src[0]); + else + return I->op == JAY_OPCODE_SEND; +} + +/* + * Returns whether an instruction contains cross-lane access. 
+ */ +static inline bool +jay_is_shuffle_like(const jay_inst *I) +{ + return I->op == JAY_OPCODE_SHUFFLE || + I->op == JAY_OPCODE_QUAD_SWIZZLE || + I->op == JAY_OPCODE_BROADCAST_IMM; +} + +/* + * Return the required alignment for the register assigned to a given source. + */ +static inline unsigned +jay_src_alignment(jay_shader *shader, const jay_inst *I, unsigned s) +{ + /* SENDs operate on entire GRFs at a time, so align UGPRs to GRFs. This + * includes UGPR->UMEM moves which lower to SENDs. + */ + if ((I->op == JAY_OPCODE_SEND && I->src[s].file == UGPR) || + (I->dst.file == UMEM)) { + return jay_ugpr_per_grf(shader); + } + + /* If the destination is 64-bit, we need the sources to be aligned. Along + * with a suitable partitioning, this ensures only the aligned low half of + * a strided register is used, preventing invalid assembly like: + * + * mov.s64 g40, g42.1<2>:s32 + * + * ..which would violate the rule: + * + * Register Regioning patterns where register data bit location of the LSB + * of the channels are changed between source and destination are not + * supported except for broadcast of a scalar. + */ + return jay_type_vector_length(I->type); +} + +/* + * Return the required alignment for the register assigned to a destination. + */ +static inline unsigned +jay_dst_alignment(jay_shader *shader, const jay_inst *I) +{ + /* SENDs write entire GRFs, so align UGPRs to GRFs. Similarly for any + * instructions involving accumulators: + * + * Register Regioning patterns where register data bit locations are + * changed between source and destination are not supported when an + * accumulator is used as an implicit source or an explicit source in an + * instruction. (TODO) + */ + if (I->dst.file == UGPR && + (I->op == JAY_OPCODE_SEND || + (I->op == JAY_OPCODE_MOV && I->src[0].file == UMEM) || + I->op == JAY_OPCODE_MUL_32)) { + + return jay_ugpr_per_grf(shader); + } + + /* If any source is 64-bit, align the destination to 64-bit too. As above. */ + return jay_type_vector_length(jay_src_type(I, 0)); +} + +static inline bool +jay_inst_is_uniform(const jay_inst *I) +{ + if (I->op == JAY_OPCODE_SEND) + return jay_send_uniform(I); + + return jay_is_uniform(I->dst) || + (I->dst.file == J_ADDRESS && jay_is_uniform(I->src[0])) || + I->cond_flag.file == UFLAG || + I->op == JAY_OPCODE_SYNC || + I->dst.file == FLAG || + (I->dst.file == J_ARF && !jay_is_null(I->dst)); +} + +unsigned jay_simd_split(jay_shader *s, const jay_inst *I); + +static inline unsigned +jay_simd_width_logical(jay_shader *s, const jay_inst *I) +{ + unsigned base = jay_inst_is_uniform(I) ? 1 : s->dispatch_width; + + /* Handle vectors-of-UGPR operations with special care for 64-bit */ + unsigned vec_per_channel = jay_type_vector_length(I->type); + unsigned dst_size = jay_num_values(I->dst); + assert(util_is_aligned(dst_size, vec_per_channel)); + + if (base == 1 && dst_size > vec_per_channel && I->op != JAY_OPCODE_SEND) { + assert(util_is_power_of_two_nonzero(dst_size) && vec_per_channel == 1); + base = dst_size; + } + + return base; +} + +static inline unsigned +jay_simd_width_physical(jay_shader *s, const jay_inst *I) +{ + return jay_simd_width_logical(s, I) >> jay_simd_split(s, I); +} + +/* + * Returns the number of physical instructions emitted for each logical + * instruction not accounting for SIMD split. That is, the number of + * instructions that macros will expand to in jay_to_binary or 1 for non-macros. 
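+ *
+ * For example, MUL_32 is a 2-instruction macro, so with a SIMD split of 1
+ * a logical MUL_32 costs jay_macro_length(I) << jay_simd_split(s, I) =
+ * 2 << 1 = 4 physical instructions, as used when advancing IPs in the
+ * scoreboard pass.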
+ */ +static inline unsigned +jay_macro_length(const jay_inst *I) +{ + bool macro = (I->op == JAY_OPCODE_MUL_32 || + I->op == JAY_OPCODE_SHUFFLE || + I->op == JAY_OPCODE_LOOP_ONCE); + return macro ? 2 : 1; +} + +static inline bool +jay_is_no_mask(const jay_inst *I) +{ + return jay_inst_is_uniform(I) || + I->broadcast_flag || + I->op == JAY_OPCODE_QUAD_SWIZZLE || + I->op == JAY_OPCODE_DESWIZZLE_16 || + I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS || + I->op == JAY_OPCODE_LANE_ID_8 || + I->op == JAY_OPCODE_LANE_ID_EXPAND; +} + +/** + * Representation of an (implemented) function in the Jay IR. This corresponds + * to nir_function_impl in NIR. + */ +typedef struct jay_function { + struct list_head link; + + /* Parent pointer for convenience */ + struct jay_shader *shader; + + /* Set of SSA indices of defs that are dead immediately after being written + * (because they are never read but cannot be DCE'd). + */ + BITSET_WORD *dead_defs; + + /* Register demand metadata calculated & used in RA */ + unsigned demand[JAY_NUM_SSA_FILES]; + + unsigned num_blocks; + struct list_head blocks; + bool is_entrypoint; + + uint32_t ssa_alloc; +} jay_function; + +static inline jay_function * +jay_new_function(jay_shader *s) +{ + jay_function *f = rzalloc(s, jay_function); + list_inithead(&f->blocks); + + f->shader = s; + f->ssa_alloc = 1; /* skip null */ + + list_add(&f->link, &s->functions); + return f; +} + +static inline jay_function * +jay_shader_get_entrypoint(jay_shader *s) +{ + /* TODO: Multifunction shaders */ + assert(list_is_singular(&s->functions)); + return list_first_entry(&s->functions, jay_function, link); +} + +static inline unsigned +jay_num_regs(jay_shader *shader, enum jay_file file) +{ + assert(file < JAY_NUM_SSA_FILES); + + if (file < JAY_NUM_RA_FILES) + return shader->num_regs[file]; + else if (file == FLAG) + return shader->dispatch_width == 32 ? 4 : 8; + else if (file == UFLAG) + return 0; + else + return 1 /* TODO: We don't have address or accumulator RA yet */; +} + +static inline enum jay_stride +jay_def_stride(jay_shader *shader, jay_def x) +{ + assert(x.file == GPR); + return jay_gpr_to_stride(&shader->partition, x.reg); +} + +/* Represents an allocated register number with file in the top 3 bits. */ +typedef uint16_t jay_reg; + +/** Represents a set of registers that may be clobbered for lowering swaps */ +struct jay_temp_regs { + jay_reg gpr, gpr2, ugpr, ugpr2; +}; + +/** + * A basic block representation + */ +typedef struct jay_block { + struct list_head link; + struct list_head instructions; + + /** Control flow graph */ + struct jay_block *successors[2]; + struct util_dynarray predecessors; + + /** Index of the block in source order */ + unsigned index; + + /** Liveness analysis results */ + struct u_sparse_bitset live_in; + struct u_sparse_bitset live_out; + + /** + * After register allocation but before going out-of-SSA, registers that + * are free at the logical end of the block (before phi_src). These will + * be clobbered by the out-of-SSA pass. + */ + struct jay_temp_regs temps_out; + + /** + * Is this block a loop header? If not, all of its predecessors precede it + * in source order. 
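+ *
+ * Forward dataflow passes can therefore walk blocks in source order and
+ * see every predecessor of a non-header block before the block itself.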
+ */ + bool loop_header; + + /** True if all non-exited lanes execute this block together */ + bool uniform; + + /** Pretty printing based on original structured control flow */ + uint8_t indent; +} jay_block; + +static inline jay_block * +jay_new_block(jay_function *f) +{ + jay_block *block = rzalloc(f, jay_block); + + util_dynarray_init(&block->predecessors, block); + list_inithead(&block->instructions); + + block->index = f->num_blocks++; + return block; +} + +static inline bool +jay_op_is_control_flow(enum jay_opcode op) +{ + return op >= JAY_OPCODE_BRD && op <= JAY_OPCODE_LOOP_ONCE; +} + +/** + * Returns the control flow instruction at the end of a block or NULL. + */ +static inline jay_inst * +jay_block_ending_jump(jay_block *block) +{ + jay_inst *last = list_is_empty(&block->instructions) ? + NULL : + list_last_entry(&block->instructions, jay_inst, link); + return last && jay_op_is_control_flow(last->op) ? last : NULL; +} + +static inline unsigned +jay_num_predecessors(jay_block *block) +{ + return util_dynarray_num_elements(&block->predecessors, jay_block *); +} + +static inline unsigned +jay_num_successors(jay_block *block) +{ + static_assert(ARRAY_SIZE(block->successors) == 2); + return !!block->successors[0] + !!block->successors[1]; +} + +static inline jay_block * +jay_first_predecessor(jay_block *block) +{ + if (jay_num_predecessors(block) == 0) + return NULL; + + return *util_dynarray_element(&block->predecessors, struct jay_block *, 0); +} + +/* Block worklist helpers */ + +#define jay_worklist_push_head(w, block) u_worklist_push_head(w, block, index) +#define jay_worklist_push_tail(w, block) u_worklist_push_tail(w, block, index) +#define jay_worklist_peek_head(w) u_worklist_peek_head(w, jay_block, index) +#define jay_worklist_pop_head(w) u_worklist_pop_head(w, jay_block, index) +#define jay_worklist_peek_tail(w) u_worklist_peek_tail(w, jay_block, index) +#define jay_worklist_pop_tail(w) u_worklist_pop_tail(w, jay_block, index) + +/* Iterators */ + +#define jay_foreach_function(s, v) \ + list_for_each_entry(jay_function, v, &s->functions, link) + +#define jay_foreach_block(f, v) \ + list_for_each_entry(jay_block, v, &f->blocks, link) + +#define jay_foreach_block_safe(f, v) \ + list_for_each_entry_safe(jay_block, v, &f->blocks, link) + +#define jay_foreach_block_rev(f, v) \ + list_for_each_entry_rev(jay_block, v, &f->blocks, link) + +#define jay_foreach_block_from(f, from, v) \ + list_for_each_entry_from(jay_block, v, from, &f->blocks, link) + +#define jay_foreach_block_from_rev(f, from, v) \ + list_for_each_entry_from_rev(jay_block, v, from, &f->blocks, link) + +#define jay_foreach_inst_in_block(block, v) \ + list_for_each_entry(jay_inst, v, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_rev(block, v) \ + list_for_each_entry_rev(jay_inst, v, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_safe(block, v) \ + list_for_each_entry_safe(jay_inst, v, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_safe_rev(block, v) \ + list_for_each_entry_safe_rev(jay_inst, v, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_from(block, v, from) \ + list_for_each_entry_from(jay_inst, v, from, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_from_rev(block, v, from) \ + list_for_each_entry_from_rev(jay_inst, v, from, &(block)->instructions, link) + +#define jay_foreach_inst_in_func(func, block, v) \ + jay_foreach_block(func, block) \ + jay_foreach_inst_in_block(block, v) + +#define 
jay_foreach_inst_in_func_rev(func, block, v) \
+ jay_foreach_block_rev(func, block) \
+ jay_foreach_inst_in_block_rev(block, v)
+
+#define jay_foreach_inst_in_func_safe(func, block, v) \
+ jay_foreach_block(func, block) \
+ jay_foreach_inst_in_block_safe(block, v)
+
+#define jay_foreach_inst_in_func_safe_rev(func, block, v) \
+ jay_foreach_block_rev(func, block) \
+ jay_foreach_inst_in_block_safe_rev(block, v)
+
+#define jay_foreach_inst_in_shader(s, func, inst) \
+ jay_foreach_function(s, func) \
+ jay_foreach_inst_in_func(func, v_block, inst)
+
+#define jay_foreach_inst_in_shader_safe(s, func, inst) \
+ jay_foreach_function(s, func) \
+ jay_foreach_inst_in_func_safe(func, v_block, inst)
+
+#define jay_foreach_successor(blk, v) \
+ jay_block *v; \
+ jay_block **_v; \
+ for (_v = (jay_block **) &blk->successors[0]; \
+ _v < (jay_block **) &blk->successors[2] && (v = *_v) != NULL; _v++)
+
+#define jay_foreach_predecessor(blk, v) \
+ util_dynarray_foreach(&blk->predecessors, jay_block *, v)
+
+#define jay_foreach_src(inst, s) for (unsigned s = 0; s < inst->num_srcs; ++s)
+
+#define jay_foreach_src_rev(inst, s) \
+ for (signed s = inst->num_srcs - 1; s >= 0; --s)
+
+#define jay_foreach_ssa_src(I, s) \
+ jay_foreach_src(I, s) \
+ if (jay_is_ssa(I->src[s]) && !jay_is_null(I->src[s]))
+
+#define jay_foreach_ssa_src_rev(I, s) \
+ jay_foreach_src_rev(I, s) \
+ if (jay_is_ssa(I->src[s]) && !jay_is_null(I->src[s]))
+
+#define jay_foreach_index(def, c, idx) \
+ jay_foreach_comp(def, c) \
+ for (uint32_t idx = jay_channel(def, c); idx != 0; idx = 0)
+
+#define jay_foreach_index_rev(def, c, idx) \
+ jay_foreach_comp_rev(def, c) \
+ for (uint32_t idx = jay_channel(def, c); idx != 0; idx = 0)
+
+#define jay_foreach_src_index(I, s, c, i) \
+ jay_foreach_ssa_src(I, s) \
+ jay_foreach_index(I->src[s], c, i)
+
+#define jay_foreach_src_index_rev(I, s, c, i) \
+ jay_foreach_ssa_src_rev(I, s) \
+ jay_foreach_index_rev(I->src[s], c, i)
+
+#define jay_foreach_dst(I, d) \
+ for (unsigned _d = 0; _d < 2; ++_d) \
+ for (jay_def d = (_d ? I->cond_flag : I->dst); !jay_is_null(d); \
+ d = jay_null())
+
+#define jay_foreach_dst_index(I, d, i) \
+ jay_foreach_dst(I, d) \
+ jay_foreach_index(d, _c, i)
+
+/*
+ * Phi iterators take advantage of the known position of phis in the block.
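+ *
+ * PHI_DSTs sit at the start of a block; PHI_SRCs sit at the end, before
+ * any terminating control flow. Both iterators therefore stop at the
+ * first non-phi instruction they see.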
+ */
+#define jay_foreach_phi_src_in_block(block, phi) \
+ jay_foreach_inst_in_block_safe_rev(block, phi) \
+ if (jay_op_is_control_flow(phi->op)) \
+ continue; \
+ else if (phi->op != JAY_OPCODE_PHI_SRC) \
+ break; \
+ else
+
+#define jay_foreach_phi_dst_in_block(block, phi) \
+ jay_foreach_inst_in_block(block, phi) \
+ if (phi->op != JAY_OPCODE_PHI_DST) \
+ break; \
+ else
+
+#define jay_foreach_preload(func, preload) \
+ jay_foreach_inst_in_block_safe(jay_first_block(func), preload) \
+ if (preload->op != JAY_OPCODE_PRELOAD) \
+ break; \
+ else
+
+static inline jay_block *
+jay_first_block(jay_function *f)
+{
+ assert(!list_is_empty(&f->blocks));
+ jay_block *first_block = list_first_entry(&f->blocks, jay_block, link);
+ assert(first_block->index == 0);
+ return first_block;
+}
+
+static inline jay_inst *
+jay_first_inst(jay_block *block)
+{
+ if (list_is_empty(&block->instructions))
+ return NULL;
+ else
+ return list_first_entry(&block->instructions, jay_inst, link);
+}
+
+static inline jay_block *
+jay_last_block(jay_function *f)
+{
+ if (list_is_empty(&f->blocks))
+ return NULL;
+ else
+ return list_last_entry(&f->blocks, jay_block, link);
+}
+
+static inline jay_inst *
+jay_last_inst(jay_block *block)
+{
+ if (list_is_empty(&block->instructions))
+ return NULL;
+ else
+ return list_last_entry(&block->instructions, jay_inst, link);
+}
+
+static inline jay_block *
+jay_next_block(jay_block *block)
+{
+ return list_first_entry(&(block->link), jay_block, link);
+}
+
+static inline void
+jay_block_add_successor(jay_block *block, jay_block *succ)
+{
+ unsigned i = block->successors[0] ? 1 : 0;
+
+ assert(succ && block->successors[0] != succ && block->successors[1] != succ);
+ assert(block->successors[i] == NULL && "at most 2 successors");
+
+ block->successors[i] = succ;
+ util_dynarray_append(&(succ->predecessors), block);
+}
+
+static inline unsigned
+jay_source_last_use_bit(const jay_def *srcs, unsigned src_idx)
+{
+ assert(jay_is_ssa(srcs[src_idx]) && "precondition");
+ unsigned i = 0;
+
+ for (unsigned s = 0; s < src_idx; ++s) {
+ jay_foreach_index(srcs[s], c, idx) {
+ i++;
+ }
+ }
+
+ return i;
+}
+
+#define jay_foreach_killed(I, s, c) \
+ for (unsigned _kill_idx = 0; _kill_idx == 0; _kill_idx = 1) \
+ jay_foreach_src_index(I, s, c, idx) \
+ for (unsigned _k = _kill_idx++; _k != ~0; _k = ~0) \
+ if (BITSET_TEST(I->last_use, _k))
+
+/* Helper to run a pass */
+#define JAY_PASS(shader, pass, ...) \
+ do { \
+ pass(shader, ##__VA_ARGS__); \
+ jay_validate(shader, #pass); \
+ } while (0)
+
+#define JAY_DEFINE_FUNCTION_PASS(name, per_func) \
+ void name(jay_shader *s) \
+ { \
+ jay_foreach_function(s, f) { \
+ per_func(f); \
+ } \
+ }
diff --git a/src/intel/compiler/jay/jay_liveness.c b/src/intel/compiler/jay/jay_liveness.c
new file mode 100644
index 00000000000..ebe89f7504f
--- /dev/null
+++ b/src/intel/compiler/jay/jay_liveness.c
@@ -0,0 +1,203 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "util/bitset.h"
+#include "util/macros.h"
+#include "util/sparse_bitset.h"
+#include "util/u_math.h"
+#include "util/u_worklist.h"
+#include "jay_ir.h"
+#include "jay_opcodes.h"
+#include "jay_private.h"
+
+/* LiveIn = GEN + (LiveOut - KILL) */
+static void
+update_liveness_for_inst(BITSET_WORD *dead_defs,
+ struct u_sparse_bitset *live_in,
+ jay_inst *I)
+{
+ /* No destination is live-in before the instruction, but any destination not
+ * live-in after is immediately dead.
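+ *
+ * A worked example: visiting "r2 = add r0, r1" in reverse with
+ * live = {r2, r3}, r2 is cleared (definitions are not live-in) and r0/r1
+ * become live, giving {r0, r1, r3}. Had r2 not been live, it would be
+ * recorded in dead_defs instead.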
+ */ + jay_foreach_dst_index(I, _, def) { + if (u_sparse_bitset_test(live_in, def)) { + u_sparse_bitset_clear(live_in, def); + } else { + BITSET_SET(dead_defs, def); + } + } + + if (I->op == JAY_OPCODE_PHI_SRC) { + /* Phi sources do not require last-use bits. */ + jay_foreach_src_index(I, src_idx, comp, index) { + u_sparse_bitset_set(live_in, index); + } + } else { + BITSET_ZERO(I->last_use); + unsigned last_use_i = 0; + + jay_foreach_src_index(I, s, comp, index) { + /* If the source is not live after this instruction, but becomes + * live at this instruction, this is the last use. + */ + if (!u_sparse_bitset_test(live_in, index)) { + assert(last_use_i < JAY_NUM_LAST_USE_BITS); + BITSET_SET(I->last_use, last_use_i); + } + + u_sparse_bitset_set(live_in, index); + ++last_use_i; + } + } +} + +/** + * Calculate liveness information for SSA values. + * + * This populates the jay_block::live_in/live_out bitsets and last_use flags. + */ +void +jay_compute_liveness(jay_function *f) +{ + u_worklist worklist; + u_worklist_init(&worklist, f->num_blocks, NULL); + + ralloc_free(f->dead_defs); + f->dead_defs = BITSET_RZALLOC(f, f->ssa_alloc); + + jay_foreach_block(f, block) { + u_sparse_bitset_free(&block->live_in); + u_sparse_bitset_free(&block->live_out); + + u_sparse_bitset_init(&block->live_in, f->ssa_alloc, block); + u_sparse_bitset_init(&block->live_out, f->ssa_alloc, block); + + jay_worklist_push_head(&worklist, block); + } + + while (!u_worklist_is_empty(&worklist)) { + /* Pop in reverse order since liveness is a backwards pass */ + jay_block *block = jay_worklist_pop_head(&worklist); + + /* Update its liveness information: + * 1. Assume everything liveout from this block was live_in + * 2. Clear live_in for anything defined in this block + */ + u_sparse_bitset_dup(&block->live_in, &block->live_out); + + jay_foreach_inst_in_block_rev(block, inst) { + update_liveness_for_inst(f->dead_defs, &block->live_in, inst); + } + + /* Propagate block->live_in[] to the live_out[] of predecessors. Since + * phis are split, they are handled naturally without special cases. + */ + jay_foreach_predecessor(block, p) { + if (u_sparse_bitset_merge(&(*p)->live_out, &block->live_in)) { + jay_worklist_push_tail(&worklist, *p); + } + } + } + +#ifndef NDEBUG + jay_block *first_block = jay_first_block(f); + jay_block *last_block = list_last_entry(&f->blocks, jay_block, link); + + assert(u_sparse_bitset_count(&first_block->live_in) == 0 && "invariant"); + assert(u_sparse_bitset_count(&last_block->live_out) == 0 && "invariant"); +#endif + + u_worklist_fini(&worklist); +} + +/* + * Calculate the register demand for each SSA file using the previously + * calculated liveness analysis. SSA makes this exact in linear-time. + */ +void +jay_calculate_register_demands(jay_function *func) +{ + enum jay_file *files = calloc(func->ssa_alloc, sizeof(enum jay_file)); + BITSET_WORD *killed = BITSET_CALLOC(func->ssa_alloc); + unsigned *max_demand = func->demand; + memset(max_demand, 0, sizeof(func->demand)); + + jay_foreach_inst_in_func(func, block, I) { + jay_foreach_dst_index(I, def, index) { + files[index] = def.file; + } + } + + jay_foreach_block(func, block) { + unsigned demands[JAY_NUM_SSA_FILES] = {}; + + /* Everything live-in. 
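+ * Each SSA index is a single scalar channel, so every set bit contributes
+ * one register to its file's demand.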
*/ + U_SPARSE_BITSET_FOREACH_SET(&block->live_in, i) { + ++demands[files[i]]; + } + + jay_foreach_ssa_file(f) { + max_demand[f] = MAX2(demands[f], max_demand[f]); + } + + jay_foreach_inst_in_block(block, I) { + /* We must have enough register file space for the register payload */ + if (I->op == JAY_OPCODE_PRELOAD) { + uint32_t max = jay_preload_reg(I) + jay_num_values(I->dst); + max_demand[I->dst.file] = MAX2(max_demand[I->dst.file], max); + } + + /* Collect source values to kill */ + jay_foreach_killed(I, s, c) { + BITSET_SET(killed, jay_channel(I->src[s], c)); + } + + /* Make destinations live */ + jay_foreach_dst(I, d) { + demands[d.file] += util_next_power_of_two(jay_num_values(d)); + } + + /* Update maximum demands */ + jay_foreach_ssa_file(f) { + max_demand[f] = MAX2(demands[f], max_demand[f]); + } + + /* Dead destinations are those written by the instruction but killed + * immediately after the instruction finishes. + */ + jay_foreach_dst_index(I, d, index) { + if (BITSET_TEST(func->dead_defs, index)) { + assert(demands[d.file] > 0); + --demands[d.file]; + } + } + + jay_foreach_dst(I, d) { + unsigned n = jay_num_values(d); + demands[d.file] -= util_next_power_of_two(n) - n; + } + + /* Late-kill sources */ + jay_foreach_killed(I, s, c) { + uint32_t index = jay_channel(I->src[s], c); + + if (BITSET_TEST(killed, index)) { + BITSET_CLEAR(killed, index); + + assert(demands[I->src[s].file] > 0); + --demands[I->src[s].file]; + } + } + + if (jay_debug & JAY_DBG_PRINTDEMAND) { + printf("(LA) [G:%u\tU:%u] ", demands[GPR], demands[UGPR]); + jay_print_inst(stdout, I); + } + } + } + + free(files); + free(killed); +} diff --git a/src/intel/compiler/jay/jay_lower_post_ra.c b/src/intel/compiler/jay/jay_lower_post_ra.c new file mode 100644 index 00000000000..db8661b011d --- /dev/null +++ b/src/intel/compiler/jay/jay_lower_post_ra.c @@ -0,0 +1,153 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/macros.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* + * If default != dest, we need to lower. Predicated moves generalize as SEL, + * with default in src0 to allow for immediates. + * + * For anything else, we have to insert a copy. + */ +static void +lower_non_tied_default(jay_builder *b, jay_inst *I, jay_def default_) +{ + jay_def not_pred = jay_negate(*jay_inst_get_predicate(I)); + assert(default_.file != FLAG && "we don't support this"); + + if (I->op == JAY_OPCODE_MOV) { + jay_SEL(b, I->type, I->dst, default_, I->src[0], not_pred); + jay_remove_instruction(I); + } else { + jay_foreach_comp(I->dst, c) { + jay_def dst = jay_extract_post_ra(I->dst, c); + jay_def src = jay_extract_post_ra(default_, c); + + jay_add_predicate(b, jay_MOV(b, dst, src), not_pred); + } + } +} + +static inline jay_def +hi(jay_def x) +{ + x.hi = true; + return x; +} + +static bool +lower(jay_builder *b, jay_inst *I) +{ + switch (I->op) { + case JAY_OPCODE_PRELOAD: + case JAY_OPCODE_PHI_DST: + case JAY_OPCODE_INDETERMINATE: + /* Delete instructions that only exist for RA. Uninitialized register + * contents is a perfectly cromulent indeterminate value. 
+ */ + return true; + + case JAY_OPCODE_MOV: { + /* Delete trivial moves */ + if (jay_regs_equal(I->dst, I->src[0]) && !I->predication) + return true; + + if (I->dst.file == GPR && I->src[0].file == GPR) { + jay_def dst = I->dst, src = I->src[0], tmp4 = jay_bare_reg(GPR, 0); + enum jay_stride dst_stride = jay_def_stride(b->shader, dst); + enum jay_stride src_stride = jay_def_stride(b->shader, src); + assert(jay_def_stride(b->shader, tmp4) == JAY_STRIDE_4 && "ABI"); + + if (dst_stride == JAY_STRIDE_8 && src_stride == JAY_STRIDE_2) { + jay_MOV(b, dst, tmp4); + jay_MOV(b, tmp4, src)->type = JAY_TYPE_U16; + jay_MOV(b, hi(tmp4), hi(src))->type = JAY_TYPE_U16; + + jay_XOR(b, JAY_TYPE_U32, dst, dst, tmp4); + jay_XOR(b, JAY_TYPE_U32, tmp4, dst, tmp4); + jay_XOR(b, JAY_TYPE_U32, dst, dst, tmp4); + return true; + } else if (dst_stride == JAY_STRIDE_2 && src_stride == JAY_STRIDE_8) { + jay_MOV(b, dst, tmp4)->type = JAY_TYPE_U16; + jay_MOV(b, hi(dst), hi(tmp4))->type = JAY_TYPE_U16; + jay_MOV(b, tmp4, src); + + for (unsigned i = 0; i < 3; ++i) { + jay_XOR(b, JAY_TYPE_U16, i == 1 ? tmp4 : dst, dst, tmp4); + jay_XOR(b, JAY_TYPE_U16, i == 1 ? hi(tmp4) : hi(dst), hi(dst), + hi(tmp4)); + } + + return true; + } + + /* Lower 4B<-->2B copies. To pack the register file, RA + * sometimes inserts 32-bit copies involving 16-bit strided sources like + * "mov.u32 r4 <32-bit>, r50 <16-bit>". This cannot be implemented in a + * single hardware instruction, so we split into two 16-bit copies. + */ + enum jay_stride min_stride = MIN2(dst_stride, src_stride); + unsigned stride_sz = jay_stride_to_bits(min_stride); + unsigned type_sz = jay_type_size_bits(I->type); + + if (stride_sz < type_sz) { + assert(stride_sz == 16 && type_sz == 32 && "no other case hit"); + I->type = JAY_TYPE_U16; + jay_MOV(b, hi(dst), hi(src))->type = JAY_TYPE_U16; + } + } + + return false; + } + + case JAY_OPCODE_SWAP: { + jay_def x = I->src[0], y = I->src[1]; + /* TODO: Need stride-aware lowering here too like MOV. Same ideas. */ + if (jay_def_stride(b->shader, x) != jay_def_stride(b->shader, y)) + UNREACHABLE("todo"); + + jay_XOR(b, JAY_TYPE_U32, x, y, x); + jay_XOR(b, JAY_TYPE_U32, y, x, y); + jay_XOR(b, JAY_TYPE_U32, x, y, x); + return true; + } + + case JAY_OPCODE_ZERO_FLAG: { + jay_MOV(b, jay_bare_reg(FLAG, jay_zero_flag_reg(I)), 0)->type = + JAY_TYPE_U32; + return true; + } + + default: + return false; + } +} + +void +jay_lower_post_ra(jay_shader *s) +{ + jay_foreach_inst_in_shader_safe(s, func, I) { + jay_builder b = jay_init_builder(func, jay_before_inst(I)); + + if (jay_inst_has_default(I)) { + if (!jay_regs_equal(I->dst, *jay_inst_get_default(I))) { + lower_non_tied_default(&b, I, *jay_inst_get_default(I)); + } + + /* Now just drop the default source */ + jay_shrink_sources(I, I->num_srcs - 1); + I->predication = JAY_PREDICATED; + } + + if (lower(&b, I)) { + jay_remove_instruction(I); + } + } +} diff --git a/src/intel/compiler/jay/jay_lower_pre_ra.c b/src/intel/compiler/jay/jay_lower_pre_ra.c new file mode 100644 index 00000000000..d71ea7c3711 --- /dev/null +++ b/src/intel/compiler/jay/jay_lower_pre_ra.c @@ -0,0 +1,200 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/bitscan.h" +#include "util/hash_table.h" +#include "util/lut.h" +#include "util/macros.h" +#include "util/u_math.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* + * Register allocation operates only on power-of-two vectors. 
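+ * (For example, a 3-component def is collected into a 4-component one
+ * whose extra channel is null.)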
+ * Pad out non-power-of-two vectors with null values to simplify RA.
+ */
+static jay_def
+lower_npot_vector(jay_builder *b, jay_def x)
+{
+ unsigned n = jay_num_values(x);
+
+ if (!util_is_power_of_two_or_zero(n)) {
+ uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 };
+
+ for (unsigned i = 0; i < n; ++i) {
+ indices[i] = jay_channel(x, i);
+ }
+
+ x = jay_collect(b, x.file, indices, util_next_power_of_two(n));
+ }
+
+ assert(util_is_power_of_two_or_zero(jay_num_values(x)) && "post-cond");
+ return x;
+}
+
+/**
+ * Vectors need to be allocated to contiguous registers. Furthermore, we
+ * require power-of-two sizes in certain cases; that's handled here too.
+ *
+ * This means that a value cannot appear in multiple channels of an
+ * instruction, as register allocation would need to assign the same value
+ * to two different locations at once. Scalars don't have this restriction,
+ * except for SENDs because the hardware bans repeated sources.
+ *
+ * If a value appears in multiple positions, we emit copies so that each
+ * copy can be register allocated in the correct position.
+ */
+static void
+lower_contiguous_sources(jay_builder *b, jay_inst *I)
+{
+ b->cursor = jay_before_inst(I);
+ uint32_t seen[JAY_MAX_DEF_LENGTH], nr_seen = 0;
+
+ jay_foreach_src(I, s) {
+ if (jay_num_values(I->src[s]) > 1 || I->op == JAY_OPCODE_SEND) {
+ jay_foreach_index(I->src[s], c, index) {
+ /* Search for the index */
+ unsigned i;
+ for (i = 0; i < nr_seen && seen[i] != index; ++i) {
+ }
+
+ if (i == nr_seen) {
+ /* Record a new index */
+ assert(nr_seen < ARRAY_SIZE(seen));
+ seen[nr_seen++] = index;
+ } else {
+ /* Insert a copy to access a duplicated index */
+ jay_def copy = jay_alloc_def(b, I->src[s].file, 1);
+ jay_MOV(b, copy, jay_extract(I->src[s], c));
+ jay_insert_channel(b, &I->src[s], c, copy);
+ }
+ }
+
+ jay_replace_src(&I->src[s], lower_npot_vector(b, I->src[s]));
+ }
+ }
}
+
+static jay_def
+lower_imm_to_ugpr(jay_builder *b,
+ jay_inst *I,
+ unsigned s,
+ struct hash_table_u64 *constants)
+{
+ /* Although only 32-bit constants are supported, 64-bit constants are
+ * separate in the key since they must be zero-extended. We could optimize
+ * this but it doesn't really matter.
+ */
+ uint32_t imm = jay_as_uint(I->src[s]);
+ bool is_64bit = jay_type_size_bits(jay_src_type(I, s)) == 64;
+ uint64_t key = imm | (is_64bit ? BITFIELD64_BIT(32) : 0);
+
+ jay_inst *mov = _mesa_hash_table_u64_search(constants, key);
+ if (mov)
+ return mov->dst;
+
+ /* Try to use source modifiers to reuse a constant if we can */
+ if (jay_src_type(I, s) == JAY_TYPE_F32 && jay_has_src_mods(I, s)) {
+ mov = _mesa_hash_table_u64_search(constants, fui(-uif(imm)));
+ if (mov)
+ return jay_negate(mov->dst);
+ }
+
+ /* If this is a new constant, insert a move and cache it. Currently, we pool
+ * constants per-function. Inserting everything at the start guarantees that
+ * these moves dominate all their uses, although it hurts register pressure.
+ * The spiller should rematerialize constants where necessary to ensure we
+ * don't lose the wave, but we could still probably optimize this.
+ */
+ jay_def x = jay_alloc_def(b, UGPR, is_64bit ? 2 : 1);
+ b->cursor = jay_before_function(b->func);
+ _mesa_hash_table_u64_insert(constants, key, jay_MOV(b, x, imm));
+ return x;
+}
+
+static bool
+try_swap_src01(jay_inst *I)
+{
+ if (I->op == JAY_OPCODE_SEL) {
+ /* sel(a, b, p) = sel(b, a, !p) */
+ I->src[2].negate ^= true;
+ } else if (I->op == JAY_OPCODE_CMP) {
+ I->conditional_mod = jay_conditional_mod_swap_sources(I->conditional_mod);
+ } else if (I->op == JAY_OPCODE_BFN) {
+ jay_set_bfn_ctrl(I, util_lut3_swap_sources(jay_bfn_ctrl(I), 0, 1));
+ } else if (!jay_opcode_infos[I->op]._2src_commutative) {
+ /* Commutative ops can swap freely, but otherwise we give up */
+ return false;
+ }
+
+ SWAP(I->src[0], I->src[1]);
+ return true;
+}
+
+/*
+ * Instructions can only encode immediates in certain positions. Lower
+ * immediates to moves where necessary.
+ */
+static void
+lower_immediates(jay_builder *b, jay_inst *I, struct hash_table_u64 *constants)
+{
+ /* Canonicalize compare-with-zero to increase freedom */
+ if (I->op == JAY_OPCODE_CMP &&
+ jay_is_zero(I->src[1]) &&
+ jay_is_null(I->dst) &&
+ I->type == JAY_TYPE_U32) {
+
+ assert(!jay_is_null(I->cond_flag) && !I->predication);
+ I->op = JAY_OPCODE_MOV;
+ jay_shrink_sources(I, 1);
+ }
+
+ /* One source supports immediates but the other does not, so swap. */
+ unsigned other = I->op == JAY_OPCODE_BFN ? 1 : 0;
+ if (jay_is_imm(I->src[other]) &&
+ !_mesa_hash_table_u64_search(constants, jay_as_uint(I->src[other]))) {
+
+ try_swap_src01(I);
+ }
+
+ /* Immediates are allowed only in certain cases; lower the rest */
+ jay_foreach_src(I, s) {
+ if (jay_is_imm(I->src[s])) {
+ uint32_t imm = jay_as_uint(I->src[s]);
+
+ bool last = s == (jay_num_isa_srcs(I) - 1);
+ bool allowed = s < 2 && (last || I->op == JAY_OPCODE_SEND);
+ allowed |= (I->op == JAY_OPCODE_BFN && s == 0 && imm < UINT16_MAX);
+
+ if (!allowed) {
+ I->src[s] = lower_imm_to_ugpr(b, I, s, constants);
+ }
+ }
+ }
+}
+
+void
+jay_lower_pre_ra(jay_shader *s)
+{
+ struct hash_table_u64 *constants = _mesa_hash_table_u64_create(NULL);
+
+ jay_foreach_function(s, f) {
+ /* Pool constants per function. */
+ _mesa_hash_table_u64_clear(constants);
+
+ jay_foreach_inst_in_func(f, block, I) {
+ jay_builder b = { .shader = s, .func = f };
+
+ /* lower_immediates must be last since it consumes I */
+ lower_contiguous_sources(&b, I);
+ lower_immediates(&b, I, constants);
+ }
+ }
+
+ _mesa_hash_table_u64_destroy(constants);
+}
diff --git a/src/intel/compiler/jay/jay_lower_scoreboard.c b/src/intel/compiler/jay/jay_lower_scoreboard.c
new file mode 100644
index 00000000000..305dfff57ba
--- /dev/null
+++ b/src/intel/compiler/jay/jay_lower_scoreboard.c
@@ -0,0 +1,376 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include <stdlib.h>
+#include "compiler/brw/brw_eu_defines.h"
+#include "util/bitset.h"
+#include "util/macros.h"
+#include "jay_builder.h"
+#include "jay_ir.h"
+#include "jay_opcodes.h"
+#include "jay_private.h"
+
+/* TODO: Shrink */
+#define MAX_KEYS (2 * JAY_NUM_UGPR)
+#define NUM_TOKENS (16)
+
+/** SEND scoreboarding */
+struct gpr_range {
+ unsigned base, width;
+};
+
+static inline struct gpr_range
+def_to_gpr(jay_function *func, jay_inst *I, jay_def x)
+{
+ if (x.file == GPR || x.file == UGPR) {
+ unsigned base = x.file == UGPR ? func->shader->num_regs[GPR] : 0;
+ return (struct gpr_range) { base + x.reg, jay_num_values(x) };
+ } else {
+ return (struct gpr_range) { 0, 0 };
+ }
+}
+
+static inline void
+sync_sbid(jay_function *func, jay_inst *I, uint32_t *busy, unsigned sbid)
+{
+ jay_builder b = jay_init_builder(func, jay_before_inst(I));
+ jay_SYNC(&b, TGL_SYNC_NOP)->dep = tgl_swsb_sbid(TGL_SBID_DST, sbid);
+ *busy &= ~BITFIELD_BIT(sbid);
+}
+
+static void
+lower_send_local(jay_function *func, jay_block *block)
+{
+ struct {
+ BITSET_DECLARE(reading, MAX_KEYS);
+ BITSET_DECLARE(writing, MAX_KEYS);
+ } tokens[NUM_TOKENS];
+
+ uint32_t busy = 0;
+ unsigned roundrobin = 0;
+
+ jay_foreach_inst_in_block_safe(block, I) {
+ /* Read-after-write */
+ jay_foreach_src(I, s) {
+ struct gpr_range src = def_to_gpr(func, I, I->src[s]);
+
+ u_foreach_bit(sbid, busy) {
+ if (BITSET_TEST_COUNT(tokens[sbid].writing, src.base, src.width)) {
+ sync_sbid(func, I, &busy, sbid);
+ }
+ }
+ }
+
+ /* Write-after-write & write-after-read */
+ jay_foreach_dst(I, d) {
+ struct gpr_range dst = def_to_gpr(func, I, d);
+
+ u_foreach_bit(sbid, busy) {
+ if (BITSET_TEST_COUNT(tokens[sbid].reading, dst.base, dst.width) ||
+ BITSET_TEST_COUNT(tokens[sbid].writing, dst.base, dst.width)) {
+ sync_sbid(func, I, &busy, sbid);
+ }
+ }
+ }
+
+ if (I->op == JAY_OPCODE_SEND && !jay_send_eot(I)) {
+ unsigned sbid = (roundrobin++) % NUM_TOKENS;
+ jay_set_send_sbid(I, sbid);
+
+ if (!(busy & BITFIELD_BIT(sbid))) {
+ busy |= BITFIELD_BIT(sbid);
+ BITSET_ZERO(tokens[sbid].writing);
+ BITSET_ZERO(tokens[sbid].reading);
+ }
+
+ struct gpr_range dst = def_to_gpr(func, I, I->dst);
+ BITSET_SET_COUNT(tokens[sbid].writing, dst.base, dst.width);
+
+ jay_foreach_src(I, s) {
+ struct gpr_range src = def_to_gpr(func, I, I->src[s]);
+ BITSET_SET_COUNT(tokens[sbid].reading, src.base, src.width);
+ }
+ }
+ }
+
+ /* Sync on block boundaries. */
+ if (block != jay_last_block(func)) {
+ jay_builder b = jay_init_builder(func, jay_before_jump(block));
+
+ u_foreach_bit(sbid, busy) {
+ jay_SYNC(&b, TGL_SYNC_NOP)->dep = tgl_swsb_sbid(TGL_SBID_DST, sbid);
+ }
+ }
+}
+
+/**
+ * Regdist scoreboarding
+ *
+ * Register access is tracked per pipe; slot 0 (NONE) holds data on the last
+ * writer, packed into a u32 with the following macros.
+ */
+#define make_writer(pipe, ip) (((uint32_t) ip << 3) | (uint32_t) (pipe))
+#define writer_ip(writer) (writer >> 3)
+#define writer_pipe(writer) (enum tgl_pipe)(writer & BITFIELD_MASK(3))
+
+#define TGL_NUM_PIPES (TGL_PIPE_ALL)
+typedef uint32_t u32_per_pipe[TGL_NUM_PIPES];
+
+struct swsb_state {
+ unsigned ip[TGL_NUM_PIPES];
+ unsigned last_shape[TGL_NUM_PIPES];
+
+ /* finished_ip[X][Y] = ip means from the perspective of pipe X, ip on pipe Y
+ * has already been waited on.
+ */
+ unsigned finished_ip[TGL_NUM_PIPES][TGL_NUM_PIPES];
+ u32_per_pipe *access;
+};
+
+static enum tgl_pipe
+inst_exec_pipe(const struct intel_device_info *devinfo, jay_inst *I)
+{
+ if (I->op == JAY_OPCODE_SEND || jay_op_is_control_flow(I->op) /* XXX */) {
+ return TGL_PIPE_NONE;
+ } else if (I->op == JAY_OPCODE_MATH) {
+ return TGL_PIPE_MATH;
+ } else if (I->type == JAY_TYPE_F64) {
+ return TGL_PIPE_LONG;
+ } else if (jay_type_is_any_float(I->type)) {
+ return TGL_PIPE_FLOAT;
+ } else {
+ return TGL_PIPE_INT;
+ }
+}
+
+/**
+ * Return the RegDist pipeline the hardware will synchronize with if no
+ * pipeline information is provided in the SWSB annotation of an
+ * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb).
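+ *
+ * For example: SENDs are unordered and report NONE; an instruction with an
+ * F64 source reports LONG (or NONE on platforms where FP64 runs on the
+ * MATH pipe); any integer source reports INT; otherwise FLOAT.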
+ */ +static enum tgl_pipe +inferred_sync_pipe(const struct intel_device_info *devinfo, const jay_inst *I) +{ + bool has_int_src = false, has_long_src = false; + + if (devinfo->verx10 >= 125) { + jay_foreach_src(I, s) { + has_int_src |= !jay_type_is_any_float(jay_src_type(I, s)); + has_long_src |= jay_src_type(I, s) == JAY_TYPE_F64; + } + + /* Avoid emitting (RegDist, SWSB) annotations for long instructions on + * platforms where they are unordered as they may not be allowed. + */ + if (devinfo->has_64bit_float_via_math_pipe && has_long_src) + return TGL_PIPE_NONE; + } + + return I->op == JAY_OPCODE_SEND ? TGL_PIPE_NONE : + has_long_src ? TGL_PIPE_LONG : + has_int_src ? TGL_PIPE_INT : + TGL_PIPE_FLOAT; +} + +static void +depend_on_writer(struct swsb_state *state, struct gpr_range r, unsigned *dep) +{ + for (unsigned i = 0; i < r.width; ++i) { + uint32_t w = state->access[r.base + i][0]; + dep[writer_pipe(w)] = MAX2(dep[writer_pipe(w)], writer_ip(w)); + } +} + +#define jay_foreach_pipe(pipe) \ + for (unsigned pipe = 1; pipe < TGL_NUM_PIPES; ++pipe) + +static void +lower_regdist_local(jay_function *func, jay_block *block, u32_per_pipe *access) +{ + struct swsb_state state = { .access = access }; + jay_inst *last_sync = NULL; + bool need_deswizzle_wait = false; + + jay_foreach_inst_in_block_safe(block, I) { + enum tgl_pipe exec_pipe = inst_exec_pipe(func->shader->devinfo, I); + unsigned dep[TGL_NUM_PIPES] = { 0 }; + if (I->op == JAY_OPCODE_SYNC) { + last_sync = I; + continue; + } else if (I->op == JAY_OPCODE_DESWIZZLE_16) { + need_deswizzle_wait = true; + state.ip[TGL_PIPE_INT]++; + continue; + } + + /* Force a wait on the deswizzles at the start of the program. XXX: Is + * there a cleaner way to deal with this? + */ + if (need_deswizzle_wait) { + dep[TGL_PIPE_INT] = state.ip[TGL_PIPE_INT]; + need_deswizzle_wait = false; + } + + /* Write-after-{write, read} */ + jay_foreach_dst(I, def) { + struct gpr_range r = def_to_gpr(func, I, def); + depend_on_writer(&state, r, dep); + + for (unsigned i = 0; i < r.width; ++i) { + jay_foreach_pipe(p) { + dep[p] = MAX2(dep[p], state.access[r.base + i][p]); + } + } + } + + /* Read-after-write */ + jay_foreach_src(I, s) { + depend_on_writer(&state, def_to_gpr(func, I, I->src[s]), dep); + } + + unsigned nr_waits = 0; + unsigned last_pipe = TGL_PIPE_NONE; + + /* If dependency P implies dependency Q, drop dependency Q to avoid + * unnecessary annotations. + */ + jay_foreach_pipe(p) { + if (dep[p]) { + jay_foreach_pipe(q) { + if (dep[q] && state.finished_ip[p][q] >= dep[q]) { + dep[q] = 0; + } + } + } + } + + unsigned min_delta = 7; + jay_foreach_pipe(p) { + if (dep[p] && (exec_pipe == TGL_PIPE_NONE /* TODO: Sends */ || + dep[p] > state.finished_ip[exec_pipe][p])) { + unsigned delta = state.ip[p] - dep[p] + 1; + min_delta = MIN2(min_delta, delta); + state.finished_ip[exec_pipe][p] = dep[p]; + nr_waits++; + last_pipe = p; + } + } + + /* If we're SIMD split the same way as our dependency, we can relax the + * dependency to have each half wait in parallel. We could do even better + * with more tracking but this should be good enough for now. 
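+ *
+ * Concretely, for a 2-way split of macro length 1 instructions, min_delta
+ * grows from 1 to 1 + (2 - 1) * 1 = 2, so each physical half waits on the
+ * matching half of its producer instead of serializing on the last half.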
+ */ + unsigned simd_split = jay_simd_split(func->shader, I); + unsigned shape = ((simd_split << 2) | jay_macro_length(I)) + 1; + bool same_shape = state.last_shape[last_pipe] == shape; + + if (simd_split && same_shape && nr_waits == 1 && min_delta == 1) { + min_delta += ((1 << simd_split) - 1) * jay_macro_length(I); + I->replicate_dep = true; + I->decrement_dep = last_pipe != exec_pipe; + } + + bool has_sbid = I->op == JAY_OPCODE_SEND && !jay_send_eot(I); + I->dep = (struct tgl_swsb) { + .sbid = has_sbid ? jay_send_sbid(I) : 0, + .mode = has_sbid ? TGL_SBID_SET : TGL_SBID_NULL, + .regdist = nr_waits ? min_delta : 0, + .pipe = nr_waits == 1 && (!has_sbid || + last_pipe == TGL_PIPE_FLOAT || + last_pipe == TGL_PIPE_INT) ? + last_pipe : + TGL_PIPE_ALL, + }; + + /* Fold the immediate preceding SYNC.nop into this instruction, allowing + * us to wait on both ALU and a SEND in the same annotation. + */ + if (last_sync && + jay_sync_op(last_sync) == TGL_SYNC_NOP && + I->dep.mode == TGL_SBID_NULL && + (I->dep.regdist == 0 || + inferred_sync_pipe(func->shader->devinfo, I) == I->dep.pipe)) { + + assert(last_sync->dep.regdist == 0); + assert(last_sync->dep.pipe == TGL_PIPE_NONE); + + I->dep.mode = last_sync->dep.mode; + I->dep.sbid = last_sync->dep.sbid; + + jay_remove_instruction(last_sync); + } + + if (exec_pipe != TGL_PIPE_NONE) { + /* Advance the IP by the number of physical instructions emitted */ + state.ip[exec_pipe] += + jay_macro_length(I) << jay_simd_split(func->shader, I); + + struct gpr_range r = def_to_gpr(func, I, I->dst); + uint32_t now = make_writer(exec_pipe, state.ip[exec_pipe]); + + for (unsigned i = 0; i < r.width; ++i) { + state.access[r.base + i][0] = now; + } + + jay_foreach_src(I, s) { + struct gpr_range r = def_to_gpr(func, I, I->src[s]); + for (unsigned i = 0; i < r.width; ++i) { + state.access[r.base + i][exec_pipe] = state.ip[exec_pipe]; + } + } + + state.last_shape[exec_pipe] = shape; + } + + last_sync = NULL; + } + + /* Sync on block boundaries. */ + jay_inst *first = jay_first_inst(block); + if (block != jay_first_block(func) && first && first->op != JAY_OPCODE_SEND) { + first->dep = tgl_swsb_regdist(1); + } +} + +/* + * Trivial scoreboard lowering pass for debugging use. Stalls after every + * instruction and assigns SBID zero to all messages. 
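+ *
+ * Selected with the JAY_DBG_SYNC debug flag; useful for ruling the real
+ * scoreboard pass in or out when debugging corruption or hangs.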
+ */ +static void +lower_trivial(jay_function *func) +{ + jay_foreach_inst_in_func_safe(func, block, I) { + if (I->op == JAY_OPCODE_SEND && !jay_send_eot(I)) { + I->dep = tgl_swsb_dst_dep(tgl_swsb_sbid(TGL_SBID_SET, 0), 1); + + jay_builder b = jay_init_builder(func, jay_after_inst(I)); + jay_SYNC(&b, TGL_SYNC_NOP)->dep = tgl_swsb_sbid(TGL_SBID_DST, 0); + } else { + I->dep = tgl_swsb_regdist(1); + } + } +} + +void +jay_lower_scoreboard(jay_shader *s) +{ + uint32_t nr_keys = s->num_regs[GPR] + s->num_regs[UGPR]; + assert(nr_keys <= MAX_KEYS && "SENDs use uninitialized stack allocation"); + u32_per_pipe *access = malloc(sizeof(*access) * nr_keys); + + jay_foreach_function(s, func) { + if (jay_debug & JAY_DBG_SYNC) { + lower_trivial(func); + } else { + jay_foreach_block(func, block) { + memset(access, 0, sizeof(*access) * nr_keys); + lower_send_local(func, block); + lower_regdist_local(func, block, access); + } + } + } + + free(access); +} diff --git a/src/intel/compiler/jay/jay_lower_spill.c b/src/intel/compiler/jay/jay_lower_spill.c new file mode 100644 index 00000000000..21fbac1777e --- /dev/null +++ b/src/intel/compiler/jay/jay_lower_spill.c @@ -0,0 +1,156 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "compiler/brw/brw_eu_defines.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* We reserve an address register for spilling by ABI */ +#define ADDRESS_REG jay_bare_reg(J_ADDRESS, 2) + +static void +insert_spill_fill(jay_builder *b, + jay_def mem, + jay_def gpr, + jay_def sp, + bool load, + unsigned *sp_delta_B, + unsigned umem_base) +{ + assert(jay_is_mem(mem) && !jay_is_mem(gpr)); + + bool uniform = mem.file == UMEM; + unsigned offs_B = mem.reg * 4; + unsigned mem_reg_B = + uniform ? (umem_base + offs_B) : (offs_B * b->shader->dispatch_width); + + /* The stack pointer needs to be offset to the desired offset */ + signed sp_adjust_B = mem_reg_B - (*sp_delta_B); + if (sp_adjust_B) { + jay_ADD(b, JAY_TYPE_U32, sp, sp, sp_adjust_B); + *sp_delta_B = mem_reg_B; + } + + const struct intel_device_info *devinfo = b->shader->devinfo; + unsigned cache = load ? LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS) : + LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS); + uint32_t desc = lsc_msg_desc(devinfo, load ? LSC_OP_LOAD : LSC_OP_STORE, + LSC_ADDR_SURFTYPE_SS, LSC_ADDR_SIZE_A32, + LSC_DATA_SIZE_D32, 1, uniform, cache); + if (uniform) { + sp.num_values_m1 = 0; + } + + jay_def srcs[] = { sp, gpr }; + + jay_SEND(b, .sfid = BRW_SFID_UGM, .msg_desc = desc, .srcs = srcs, + .nr_srcs = load ? 1 : 2, .dst = load ? gpr : jay_null(), + .type = JAY_TYPE_U32, .uniform = uniform, .ex_desc = ADDRESS_REG); +} + +void +jay_lower_spill(jay_function *func) +{ + jay_builder b = jay_init_builder(func, jay_before_function(func)); + + /* We reserve the top UGPRs for spilling by ABI */ + unsigned ugpr_reservation = func->shader->num_regs[UGPR]; + assert(util_is_aligned(ugpr_reservation + 1, func->shader->dispatch_width)); + + jay_def surf = jay_bare_reg(UGPR, ugpr_reservation); + jay_def sp = jay_bare_reg(UGPR, ugpr_reservation + 1); + sp.num_values_m1 = func->shader->dispatch_width - 1; + + /* Calculate how much stack space we need */ + unsigned nr_mem = 0, nr_umem = 0; + jay_foreach_inst_in_func(func, block, I) { + if (I->op == JAY_OPCODE_MOV && jay_is_send_like(I)) { + jay_def mem = jay_is_mem(I->dst) ? I->dst : I->src[0]; + unsigned *nr = mem.file == UMEM ? 
&nr_umem : &nr_mem; + + *nr = MAX2(*nr, mem.reg + 1); + } + } + + assert((nr_umem > 0) || (nr_mem > 0)); + unsigned umem_base = (func->shader->dispatch_width * nr_mem * 4); + + /* We burn the address & stack pointer registers for all spills/fills in a + * shader. Preinitialize at the top using a scratch register. + * + * TODO: Need ABI for multi-function. + */ + assert(func->is_entrypoint); + jay_AND(&b, JAY_TYPE_U32, surf, jay_bare_reg(UGPR, 5), ~BITFIELD_MASK(10)); + jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, surf, 4); + + /* We use a 32-bit strided stack: SP = scratch + (lane ID * 4) */ + jay_def tmp2 = jay_bare_reg(GPR, func->shader->partition.base2); + jay_LANE_ID_8(&b, tmp2); + for (unsigned i = 8; i < b.shader->dispatch_width; i *= 2) { + jay_LANE_ID_EXPAND(&b, tmp2, tmp2, i); + } + + jay_SHL(&b, JAY_TYPE_U16, tmp2, tmp2, util_logbase2(4)); + jay_CVT(&b, JAY_TYPE_U32, sp, tmp2, JAY_TYPE_U16, JAY_ROUND, 0); + if (b.shader->scratch_size) { + jay_ADD(&b, JAY_TYPE_U32, sp, sp, b.shader->scratch_size); + } + + jay_foreach_block(func, block) { + /* We offset the stack pointer locally within a block to form offsets. By + * contract keep it in its canonical (unoffset) form at block boundaries. + */ + unsigned sp_delta_B = 0; + bool address_valid = true; + + jay_foreach_inst_in_block_safe(block, I) { + b.cursor = jay_before_inst(I); + + if (I->op == JAY_OPCODE_MOV && jay_is_send_like(I)) { + if (!address_valid) { + jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, surf, 4); + address_valid = true; + } + + if (jay_is_mem(I->dst)) { + insert_spill_fill(&b, I->dst, I->src[0], sp, false, &sp_delta_B, + umem_base); + func->shader->spills++; + } else { + insert_spill_fill(&b, I->src[0], I->dst, sp, true, &sp_delta_B, + umem_base); + func->shader->fills++; + } + + jay_remove_instruction(I); + } else if (I->op == JAY_OPCODE_SHUFFLE) { + /* Shuffles implicitly clobber the address register so we'll need to + * rematerialize the surface state (but be lazy). + */ + address_valid = false; + } + } + + /* Canonicalize our internal registers at block boundaries */ + if (jay_num_successors(block) > 0) { + if (!address_valid) { + jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, surf, 4); + } + + if (sp_delta_B > 0) { + jay_ADD(&b, JAY_TYPE_U32, sp, sp, -sp_delta_B); + } + } + } + + /* Note this is bogus with recursion, but recursion is not supported on any + * current graphics/compute API. + */ + func->shader->scratch_size += umem_base + (nr_umem * 4); +} diff --git a/src/intel/compiler/jay/jay_nir_algebraic.py b/src/intel/compiler/jay/jay_nir_algebraic.py new file mode 100644 index 00000000000..209f9585172 --- /dev/null +++ b/src/intel/compiler/jay/jay_nir_algebraic.py @@ -0,0 +1,95 @@ +# Copyright 2024 Intel Corporation +# SPDX-License-Identifier: MIT + +import argparse +import sys +from math import pi + +a = 'a' +b = 'b' +c = 'c' + +lower_fsign = [ + (('fsign', a), ('bcsel', ('!flt', 0, a), +1.0, + ('bcsel', ('!flt', a, 0), -1.0, 0.0))), + (('fceil', a), ('fneg', ('ffloor', ('fneg', a)))), + + # inot is free on and/or/xor sources but not dests. Apply De Morgan's. 
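+ # For example, inot(iand(inot(a), b)) becomes ior(a, inot(b)), leaving the
+ # one remaining inot on a source where it is free.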
+ (('inot', ('iand(is_used_once)', ('inot', a), b)), ('ior', a, ('inot', b))),
+ (('inot', ('ior(is_used_once)', ('inot', a), b)), ('iand', a, ('inot', b))),
+ (('inot', ('ixor(is_used_once)', ('inot', a), b)), ('ixor', a, b)),
+ (('inot', ('iand(is_used_once)', a, b)), ('ior', ('inot', a), ('inot', b))),
+ (('inot', ('ior(is_used_once)', a, b)), ('iand', ('inot', a), ('inot', b))),
+ (('inot', ('ixor(is_used_once)', a, b)), ('ixor', ('inot', a), b)),
+
+ # Remove the zeroing. Down-conversion is free but extracts are not.
+ (('u2f32', ('extract_u8', a, 0)), ('u2f32', ('u2u8', a))),
+ (('u2f32', ('extract_u16', a, 0)), ('u2f32', ('u2u16', a))),
+ (('i2f32', ('extract_i8', a, 0)), ('i2f32', ('i2i8', a))),
+ (('i2f32', ('extract_i16', a, 0)), ('i2f32', ('i2i16', a))),
+
+ (('pack_half_2x16_split', a, b),
+ ('pack_32_2x16_split', ('f2f16', a), ('f2f16', b))),
+
+ # Allows us to use more modifiers
+ (('bcsel', a, ('iadd(is_used_once)', b, c), b),
+ ('iadd', ('bcsel', a, c, 0), b)),
+]
+
+
+lower_bool = [
+ # Try to use conditional modifiers more
+ (('ieq', ('iand(is_used_once)', a, b), b),
+ ('ieq', ('iand', ('inot', a), b), 0)),
+ (('ine', ('iand(is_used_once)', a, b), b),
+ ('ine', ('iand', ('inot', a), b), 0)),
+]
+
+for T, sizes, one in [('f', [16, 32], 1.0),
+ ('i', [8, 16, 32], 1),
+ ('b', [8, 16, 32], -1)]:
+ for sz in sizes:
+ if T in ['f', 'i']:
+ lower_bool.extend([
+ ((f'{T}neg', (f'b2{T}{sz}', ('inot', 'a@1'))),
+ ('bcsel', a, 0, -one)),
+ ((f'{T}neg', (f'b2{T}{sz}', 'a@1')), ('bcsel', a, -one, 0)),
+ ])
+
+ lower_bool.extend([
+ ((f'b2{T}{sz}', ('inot', 'a@1')), ('bcsel', a, 0, one)),
+ ((f'b2{T}{sz}', 'a@1'), ('bcsel', a, one, 0)),
+ ])
+
+lower_bool.extend([
+ (('b2i64', 'a@1'), ('pack_64_2x32_split', ('bcsel', a, 1, 0), 0)),
+])
+
+opt_sel_zero = [
+ (('bcsel@32', a, 0, 1), ('iadd', ('bcsel', a, 0xffffffff, 0), 1)),
+ (('bcsel@32', a, 1, 0), ('ineg', ('bcsel', a, 0xffffffff, 0))),
+]
+
+
+def main() -> None:
+ parser = argparse.ArgumentParser()
+ parser.add_argument('-p', '--import-path', required=True)
+ parser.add_argument('output')
+ args = parser.parse_args()
+
+ sys.path.insert(0, args.import_path)
+ import nir_algebraic # pylint: disable=import-error
+
+ with open(args.output, 'w', encoding='utf-8') as f:
+ f.write('#include "jay_private.h"\n')
+
+ f.write(nir_algebraic.AlgebraicPass(
+ "jay_nir_lower_fsign", lower_fsign).render())
+ f.write(nir_algebraic.AlgebraicPass(
+ "jay_nir_lower_bool", lower_bool).render())
+ f.write(nir_algebraic.AlgebraicPass(
+ "jay_nir_opt_sel_zero", opt_sel_zero).render())
+
+
+if __name__ == '__main__':
+ main()
diff --git a/src/intel/compiler/jay/jay_opcodes.py b/src/intel/compiler/jay/jay_opcodes.py
new file mode 100644
index 00000000000..928d1e90b04
--- /dev/null
+++ b/src/intel/compiler/jay/jay_opcodes.py
@@ -0,0 +1,233 @@
+# Copyright 2026 Intel Corporation
+# SPDX-License-Identifier: MIT
+
+from typing import TYPE_CHECKING
+from dataclasses import dataclass
+import argparse
+import enum
+import sys
+
+from mako import exceptions
+from mako.template import Template
+
+if TYPE_CHECKING:
+ from collections.abc import Mapping
+
+
+@dataclass
+class Opcode:
+ name: str
+ has_dest: bool
+ num_srcs: int
+ types: list[str]
+ negate: int
+ sat: bool
+ cmod: bool
+ side_effects: bool
+ _2src_commutative: bool
+ extra_struct: list[tuple[str, str]]
+
+
+@enum.unique
+class Props(enum.IntEnum):
+ NEGATE0 = 1 << 0
+ NEGATE1 = 1 << 1
+ NEGATE2 = 1 << 2
+ NEGATE3 = 1 << 3
+ SAT = 1 << 4
+ CMOD = 1 << 5
+ SIDE_EFFECTS = 1 << 6
+ COMMUTATIVE = 1 << 7
+ NO_DEST_ = 1 << 8
+ NEGATE = NEGATE0 | NEGATE1 | NEGATE2 | NEGATE3
+ NO_DEST = SIDE_EFFECTS |
NO_DEST_ + + +_opcodes: dict[str, Opcode] = {} + + +def op(name: str, num_srcs: int, types: str | None = None, + props: int = 0, extra_struct: str | list[str] | None = None) -> None: + types_ = types.split(' ') if types else ['untyped'] + + # We can always negate the predicate. + negate_mask = (props & Props.NEGATE) | (1 << num_srcs) + + if extra_struct is not None: + extra_struct_ = [(' '.join(x.split(' ')[0:-1]), x.split(' ')[-1]) + for x in extra_struct] + else: + extra_struct_ = [] + + _opcodes[name] = Opcode(name, not bool(props & Props.NO_DEST_), + num_srcs, types_, negate_mask, + bool(props & Props.SAT), bool(props & Props.CMOD), + bool(props & Props.SIDE_EFFECTS), + bool(props & Props.COMMUTATIVE), + extra_struct_) + + +op('and', 2, 'u1 u16 u32', Props.NEGATE | Props.CMOD | Props.COMMUTATIVE) +op('or', 2, 'u1 u16 u32', Props.NEGATE | Props.CMOD | Props.COMMUTATIVE) +op('xor', 2, 'u1 u16 u32', Props.NEGATE | Props.CMOD | Props.COMMUTATIVE) + +op('add', 2, 'u32 s32 u64 s64 f32 f64 f16 bf16 u16 s16', + Props.SAT | Props.CMOD | Props.COMMUTATIVE | Props.NEGATE) +op('add3', 3, 'u32 s32 u64 s64 u16 s16', Props.SAT | + Props.CMOD | Props.COMMUTATIVE | Props.NEGATE) +op('asr', 2, 's32 s64 s16', Props.CMOD | Props.NEGATE0) +op('avg', 2, 's16 s32 u16 u32', Props.NEGATE | Props.CMOD) +op('bfe', 3, 'u32 s32', Props.NEGATE0) +op('bfi1', 2, 'u32') +op('bfi2', 3, 'u32') +op('bfn', 3, 'u32', Props.CMOD, ['uint8_t ctrl']) +op('bfrev', 1, 'u32', Props.NEGATE) +op('cbit', 1, 'u32', Props.NEGATE | Props.CMOD) +op('cmp', 2, 'u32', Props.NEGATE | Props.CMOD) + + +# With an 8/16-bit type, `index` specifies the element index of the source +# within the 32-bit word. For example, if src_type == U16 and index == 1, this +# converts the upper 16-bits of the input. +op('cvt', 1, 'u8 s8 u16 s16 u32 s32 u64 s64 f32 f64 f16 bf16', Props.NEGATE | Props.SAT, [ + 'enum jay_type src_type', + 'enum jay_rounding_mode rounding_mode', + 'uint8_t index', + 'uint8_t pad' +]) + +op('fbh', 1, 'u32 s32') +op('fbl', 1, 'u32') +op('lzd', 1, 'u32') +op('frc', 1, 'f32 f64', Props.NEGATE | Props.CMOD) +op('mad', 3, 'u32 s32 u16 s16 f32 f64 f16 bf16', + Props.NEGATE | Props.SAT | Props.CMOD | Props.COMMUTATIVE) +op('max', 2, 'u32 s32 u64 s64 u16 s16 f32 f64 f16 bf16', + Props.NEGATE | Props.SAT | Props.COMMUTATIVE) +op('min', 2, 'u32 s32 u64 s64 u16 s16 f32 f64 f16 bf16', + Props.NEGATE | Props.SAT | Props.COMMUTATIVE) +op('mov', 1, 'u1 u16 u32 u64', Props.NEGATE0 | Props.CMOD) +op('modifier', 1, 'f32 f64 f16 s16 s32 s64 u16 u32 u64 s8', + Props.NEGATE | Props.SAT | Props.CMOD) +op('mul', 2, 'u16 s16 f32 f64 f16 bf16', + Props.NEGATE | Props.SAT | Props.CMOD | Props.COMMUTATIVE) +op('mul_high', 2, 'u32 s32', Props.COMMUTATIVE) +op('mul_32x16', 2, 'u32 s32') +op('mul_32', 2, 'u32 s32', Props.COMMUTATIVE, ['bool high']) +op('sel', 3, 'u32 f32 u1 u16', Props.NEGATE) +op('csel', 3, 'u32 s32 f32', Props.NEGATE) +op('dp4a_uu', 3, 'u32', Props.SAT) +op('dp4a_ss', 3, 's32', Props.SAT) +op('dp4a_su', 3, 's32', Props.SAT) +op('rndd', 1, 'f16 f32 f64', Props.NEGATE | Props.SAT) +op('rndz', 1, 'f16 f32 f64', Props.NEGATE | Props.SAT) +op('rnde', 1, 'f16 f32 f64', Props.NEGATE | Props.SAT) +op('math', 1, 'f16 f32', Props.NEGATE | Props.SAT, ['enum jay_math op']) + +for n in ['rol', 'ror', 'shl', 'shr']: + op(n, 2, 'u32 u64 u16 s16 s32 s64', Props.CMOD | Props.NEGATE0) + +op('quad_swizzle', 1, 'u1 u32', 0, ['enum jay_quad_swizzle swizzle']) +op('sync', 0, None, Props.NO_DEST, ['enum tgl_sync_function op']) + +for n in ['brd', 'illegal', 'goto', 
'join', 'if', 'else',
+ 'endif', 'while', 'break', 'cont', 'call', 'calla', 'jmpi', 'ret',
+ 'loop_once']:
+ op(n, 0, None, Props.NO_DEST)
+
+op('send', 4, None, Props.SIDE_EFFECTS, [
+ 'enum brw_sfid sfid',
+ 'uint8_t sbid',
+ 'bool eot',
+ 'bool check_tdr',
+ 'bool uniform',
+ 'bool bindless',
+ 'enum jay_type type_0',
+ 'enum jay_type type_1',
+ 'uint8_t ex_mlen',
+ 'uint32_t ex_desc_imm',
+])
+
+op('reloc', 0, 'u32 u64', 0, ['unsigned param', 'unsigned base'])
+op('preload', 0, 'u32', 0, ['unsigned reg'])
+op('deswizzle_16', 0, 'u32', Props.NO_DEST, ['unsigned dst', 'unsigned src'])
+
+# Calculating the lane ID requires multiple power-of-two steps each involving
+# complex architectural features not modelled in the IR.
+op('lane_id_8', 0, 'u16')
+op('lane_id_expand', 1, 'u16', 0, ['unsigned width'])
+
+# Sample ID calculation
+op('extract_byte_per_8lanes', 2, 'u32')
+op('shr_odd_subspans_by_4', 1, 'u16')
+op('and_u32_u16', 2, 'u32')
+
+# Pixel coord calculations. expand_quad replicates out the per-2x2 values from
+# its source g0.[10...13] and - in the case of SIMD32 - g1.[10...13] into a
+# per-lane value. Then offset_packed_pixel_coords adds the appropriate packed
+# 2x16-bit offset within each quad, giving 2x16-bit per-lane coordinates.
+op('expand_quad', 2, 'u32')
+op('offset_packed_pixel_coords', 1, 'u32')
+op('extract_layer', 2, 'u32')
+
+# Generated by RA and lowered after. Valid only for GPR/UGPR.
+op('swap', 2, 'u32', Props.NO_DEST)
+
+# Phi function representations
+#
+# Unlike in NIR, we represent Phi functions as a pair of opcodes, purely
+# for convenience since it makes many things easier to work with.
+#
+# Phis logically exist along control flow edges between blocks. PHI_DST
+# lives where 𝜙 would traditionally be written, at the point where the new
+# value is defined. A PHI_DST will have a corresponding PHI_SRC in each of
+# its predecessor blocks, representing the value coming in along that edge.
+# This ensures that source modifiers, scalar to vector promotion, or other
+# source evaluation happens in the predecessor block.
+#
+# The PHI_SRC refers to the SSA index of the PHI_DST. For example, 'if (..) r3 =
+# r1 else r3 = r2 endif' might look like:
+#
+# (following block) | (then block) | (else block)
+# START B3 | START B1 | START B2
+# r3 = PHI_DST | PHI_SRC r3, r1 | PHI_SRC r3, r2
+op('phi_dst', 0, 'u1 u16 u32 u64')
+op('phi_src', 1, 'u1 u16 u32 u64', 0, ['uint32_t phi'])
+
+OPCODES: 'Mapping[str, Opcode]' = _opcodes
+
+HEADER_TEMPLATE = """/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#pragma once
+
+#include "util/macros.h"
+
+enum PACKED jay_opcode {
+% for opcode in opcodes:
+ JAY_OPCODE_${opcode.upper()},
+% endfor
+ JAY_NUM_OPCODES
+};
+static_assert(sizeof(enum jay_opcode) == 1);
+
+struct jay_opcode_info {
+ const char *name;
+ unsigned num_srcs;
+
+ /** Bitfield of sources which support negation/abs */
+ uint8_t src_mods;
+
+ /** Which modifiers are broadly supported by the opcode. Note there may be
+ * further restrictions (e.g. based on types) not encoded here.
+ */
+ bool sat;
+ bool cmod;
+
+ /** Whether the operation has side effects not expressed in the SSA IR */
+ bool side_effects;
+
+ /** op(a, b, c, ...) = op(b, a, c, ...)
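+ * For example, add and mul set this; lower_immediates relies on it (via
+ * try_swap_src01) to move an immediate into a source slot that can
+ * encode it.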
+    */
+   bool _2src_commutative;
+};
+
+extern const struct jay_opcode_info jay_opcode_infos[JAY_NUM_OPCODES];
+"""
+
+CODE_TEMPLATE = """/*
+ * Copyright 2026 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+#include "jay_opcodes.h"
+
+const struct jay_opcode_info jay_opcode_infos[JAY_NUM_OPCODES] = {
+% for opcode, op in opcodes.items():
+   [JAY_OPCODE_${opcode.upper()}] = {
+      .name = "${opcode}",
+      .num_srcs = ${op.num_srcs},
+      .src_mods = ${bin(op.negate)},
+% for mod in ["sat", "cmod", "side_effects", "_2src_commutative"]:
+% if getattr(op, mod):
+      .${mod} = true,
+% endif
+% endfor
+   },
+% endfor
+};
+"""
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--code', action='store', default=None)
+    parser.add_argument('--header', action='store', default=None)
+    args = parser.parse_args()
+
+    if not (args.header or args.code):
+        parser.error('At least one of --code or --header is required')
+
+    try:
+        if args.code is not None:
+            with open(args.code, 'w', encoding='utf-8') as f:
+                f.write(Template(CODE_TEMPLATE).render(opcodes=OPCODES))
+        if args.header is not None:
+            with open(args.header, 'w', encoding='utf-8') as f:
+                f.write(Template(HEADER_TEMPLATE).render(opcodes=OPCODES))
+    except Exception:
+        print(exceptions.text_error_template().render())
+        return 1
+
+    return 0
+
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/src/intel/compiler/jay/jay_opt_control_flow.c b/src/intel/compiler/jay/jay_opt_control_flow.c
new file mode 100644
index 00000000000..1f337f37296
--- /dev/null
+++ b/src/intel/compiler/jay/jay_opt_control_flow.c
@@ -0,0 +1,137 @@
+/*
+ * Copyright 2026 Intel Corporation
+ * Copyright 2023 Valve Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "util/list.h"
+#include "jay_builder.h"
+#include "jay_ir.h"
+#include "jay_opcodes.h"
+#include "jay_private.h"
+
+/*
+ * Detect the block "else; endif" and remove the no-op else, effectively
+ * removing empty else blocks. Logically, that introduces critical edges, so
+ * this pass must run late (post-RA).
+ */
+static void
+opt_empty_else(jay_block *blk)
+{
+   unsigned i = 0;
+   enum jay_opcode ops[] = { JAY_OPCODE_ELSE, JAY_OPCODE_ENDIF };
+
+   jay_foreach_inst_in_block(blk, I) {
+      if (i >= ARRAY_SIZE(ops) || ops[i++] != I->op)
+         return;
+   }
+
+   if (i == ARRAY_SIZE(ops)) {
+      jay_remove_instruction(jay_first_inst(blk));
+   }
+}
+
+/*
+ * Replace short if-statements with predication. Assumes opt_empty_else already
+ * ran. TODO: Generalize.
+ */
+static void
+opt_predicate(jay_function *f, jay_block *block)
+{
+   jay_inst *if_ = jay_last_inst(block);
+   if (!if_ || if_->op != JAY_OPCODE_IF)
+      return;
+
+   /* The if falls through to the then block */
+   jay_block *then_block = jay_next_block(block);
+   assert(block->successors[0] == then_block && "successors for if");
+
+   /* We're looking for a then consisting of a single block, so the next
+    * block is the else */
+   jay_block *else_block = jay_next_block(then_block);
+   if (block->successors[1] != else_block ||
+       list_length(&then_block->instructions) > 3 ||
+       !list_is_singular(&else_block->instructions))
+      return;
+
+   /* We can only access one flag per instruction, so do not predicate anything
+    * accessing flags. This also ensures the if-condition flag is kept live.
+    *
+    * MIN/MAX turn into SEL which cannot be predicated despite not using flags.
+    *
+    * Predicating NoMask instructions doesn't work if we are electing a nonzero
+    * lane but the NoMask forces lane 0. This should be optimized later.
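+    *
+    * As a hypothetical sketch (not actual compiler output), the rewrite
+    * turns
+    *
+    *    (f1) if
+    *    r4 = add.u32 r5, r6
+    *    endif
+    *
+    * into
+    *
+    *    r4 = (f1) add.u32 r5, r6
+    *
+    * copying the if's predicate onto each then-block instruction and
+    * deleting the if/endif jumps.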
+ */ + jay_foreach_inst_in_block(then_block, I) { + if (jay_uses_flag(I) || + I->op == JAY_OPCODE_MIN || + I->op == JAY_OPCODE_MAX || + I->op == JAY_OPCODE_CSEL || + jay_is_no_mask(I)) + return; + } + + jay_inst *endif = jay_last_inst(else_block); + if (endif->op != JAY_OPCODE_ENDIF) + return; + + /* Rewrite with predication */ + jay_builder b = jay_init_builder(f, jay_after_block(block)); + assert(if_->predication == JAY_PREDICATED && "if's are always predicated"); + + jay_foreach_inst_in_block_safe(then_block, I) { + jay_add_predicate(&b, I, *jay_inst_get_predicate(if_)); + } + + /* Remove the jumps */ + jay_remove_instruction(if_); + jay_remove_instruction(endif); +} + +/* + * Optimize "(f0) break; while" to "(!f0) while". As break/while appear in + * different blocks, we optimize the entire function at a time. + */ +static void +opt_predicate_while(jay_function *func) +{ + jay_inst *prev_break = NULL; + + jay_foreach_block(func, block) { + if (list_is_empty(&block->instructions)) { + /* Ignore empty blocks */ + } else if (jay_last_inst(block)->op == JAY_OPCODE_BREAK) { + prev_break = jay_last_inst(block); + } else if (jay_first_inst(block)->op == JAY_OPCODE_WHILE && + prev_break && + prev_break->predication) { + assert(!jay_first_inst(block)->predication); + jay_inst_get_predicate(prev_break)->negate ^= true; + + jay_remove_instruction(jay_first_inst(block)); + jay_remove_instruction(prev_break); + + jay_builder b = jay_init_builder(func, jay_before_block(block)); + jay_builder_insert(&b, prev_break); + + prev_break->op = JAY_OPCODE_WHILE; + prev_break = NULL; + } else { + prev_break = NULL; + } + } +} + +void +jay_opt_control_flow(jay_shader *s) +{ + jay_foreach_function(s, f) { + /* Iterating blocks in reverse lets both opts converge in 1 pass */ + jay_foreach_block_rev(f, block) { + opt_empty_else(block); + opt_predicate(f, block); + } + + /* Do last: opt_predicate_while depends on both previous optimizations */ + opt_predicate_while(f); + } +} diff --git a/src/intel/compiler/jay/jay_opt_dead_code.c b/src/intel/compiler/jay/jay_opt_dead_code.c new file mode 100644 index 00000000000..da9d7299d57 --- /dev/null +++ b/src/intel/compiler/jay/jay_opt_dead_code.c @@ -0,0 +1,58 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/bitset.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +static void +pass(jay_function *f) +{ + BITSET_WORD *live_set = BITSET_CALLOC(f->ssa_alloc); + + jay_foreach_inst_in_func_safe_rev(f, block, I) { + /* TODO: Allow for atomics? */ + if (!BITSET_TEST_COUNT(live_set, jay_base_index(I->dst), + jay_num_values(I->dst)) && + I->op != JAY_OPCODE_SEND) { + I->dst = jay_null(); + } + + if (!jay_is_null(I->cond_flag) && + !BITSET_TEST(live_set, jay_index(I->cond_flag)) && + (I->op != JAY_OPCODE_CMP || jay_is_null(I->dst))) { + + I->cond_flag = jay_null(); + I->conditional_mod = 0; + } + + bool no_dest = jay_is_null(I->dst) && jay_is_null(I->cond_flag); + bool side_effects = jay_opcode_infos[I->op].side_effects; + + if (no_dest && !side_effects) { + jay_remove_instruction(I); + } else { + jay_foreach_src_index(I, s, _, index) { + BITSET_SET(live_set, index); + } + } + } + + /* Eliminate phis. This step may leave dead code but it's good enough in + * practice since NIR already eliminated dead phis. 
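+    * A PHI_SRC survives only if the PHI_DST it feeds (looked up via
+    * jay_phi_src_index) was itself marked live by the walk above.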
+ */ + jay_foreach_block(f, block) { + jay_foreach_phi_src_in_block(block, I) { + if (!BITSET_TEST(live_set, jay_phi_src_index(I))) { + jay_remove_instruction(I); + } + } + } + + free(live_set); +} + +JAY_DEFINE_FUNCTION_PASS(jay_opt_dead_code, pass) diff --git a/src/intel/compiler/jay/jay_opt_propagate.c b/src/intel/compiler/jay/jay_opt_propagate.c new file mode 100644 index 00000000000..25a58253d93 --- /dev/null +++ b/src/intel/compiler/jay/jay_opt_propagate.c @@ -0,0 +1,282 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/lut.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +static enum jay_type +canonicalize_for_bit_compare(enum jay_type type) +{ + enum jay_type base = jay_base_type(type); + return (base == JAY_TYPE_S) ? jay_type_rebase(type, JAY_TYPE_U) : type; +} + +static bool +propagate_cmod(jay_function *func, jay_inst *I, jay_inst **defs) +{ + enum jay_type cmp_type = I->type; + enum jay_conditional_mod cmod = I->conditional_mod; + jay_inst *def = NULL; + + /* TODO: Generalize cmod propagation */ + if (jay_type_size_bits(cmp_type) != 32) + return false; + + /* Pattern match `cmp ssa, 0` or `cmp 0, ssa`. */ + jay_foreach_ssa_src(I, s) { + if (jay_is_zero(I->src[1 - s])) { + def = defs[jay_base_index(I->src[s])]; + + /* Canonicalize the cmod to have the zero second */ + cmod = s == 1 ? jay_conditional_mod_swap_sources(cmod) : cmod; + break; + } + } + + /* Check if we can fold into the def */ + if (!def || !jay_is_null(def->cond_flag) || !jay_opcode_infos[def->op].cmod) + return false; + + /* "Neither Saturate nor conditional modifier allowed with DW integer + * multiply." + * + * Could be refined. + */ + if (def->op == JAY_OPCODE_MUL && !jay_type_is_any_float(def->type)) + return false; + + enum jay_type instr_type = def->type; + + if (cmod == JAY_CONDITIONAL_NE || cmod == JAY_CONDITIONAL_EQ) { + cmp_type = canonicalize_for_bit_compare(cmp_type); + instr_type = canonicalize_for_bit_compare(instr_type); + } + + if (instr_type != cmp_type) + return false; + + jay_builder b = jay_init_builder(func, jay_before_inst(I)); + jay_set_conditional_mod(&b, def, I->cond_flag, cmod); + return true; +} + +static jay_def +jay_compose_src(jay_def to, jay_def from) +{ + if (to.abs) { + from.negate = false; + from.abs = true; + } + + from.negate ^= to.negate; + return from; +} + +static bool +uses_modifiers(const jay_inst *I) +{ + jay_foreach_src(I, s) { + if (I->src[s].abs || I->src[s].negate) + return true; + } + + return I->saturate; +} + +static void +propagate_modifier(jay_inst *I, unsigned s, jay_inst *mod) +{ + /* Check if we can propagate abs/neg here in general */ + if (!jay_has_src_mods(I, s) || mod->saturate) + return; + + /* Try to make the types compatible. */ + if (jay_src_type(I, s) != mod->type) { + if (I->op == JAY_OPCODE_SEL && !uses_modifiers(I)) { + I->type = mod->type; + } else { + return; + } + } + + jay_replace_src(&I->src[s], mod->src[0]); + I->src[s] = jay_compose_src(I->src[s], mod->src[0]); +} + +static void +propagate_not(jay_inst *I, unsigned s, jay_inst *mod) +{ + /* Handle inot specially for predicates, and logic operations per bspec text: + * + * When used with logic instructions (and, not, or, xor), [the + * negate] field indicates whether the source bits are + * inverted... regardless of the source type. 
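+    *
+    * Hypothetical sketch: 'r2 = not.u32 r1' feeding 'r3 = and.u32 r2, r4'
+    * becomes 'r3 = and.u32 -r1, r4', where the negate on a logic op means
+    * bitwise inversion per the quote above.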
+    */
+   if ((s == I->num_srcs - I->predication) ||
+       I->op == JAY_OPCODE_AND ||
+       I->op == JAY_OPCODE_OR ||
+       I->op == JAY_OPCODE_XOR) {
+      jay_replace_src(&I->src[s], mod->src[0]);
+      I->src[s].negate ^= true;
+   } else if (I->op == JAY_OPCODE_BFN) {
+      jay_replace_src(&I->src[s], mod->src[0]);
+      jay_set_bfn_ctrl(I, util_lut3_invert_source(jay_bfn_ctrl(I), s));
+   }
+}
+
+static void
+propagate_forwards(jay_function *f)
+{
+   jay_inst **defs = calloc(f->ssa_alloc, sizeof(defs[0]));
+
+   jay_foreach_inst_in_func_safe(f, block, I) {
+      jay_builder b = jay_init_builder(f, jay_before_inst(I));
+
+      jay_foreach_dst_index(I, _, d) {
+         defs[d] = I;
+      }
+
+      /* Copy propagate individual components into vectors */
+      jay_foreach_src_index(I, s, c, idx) {
+         jay_inst *def = defs[idx];
+         assert(def != NULL && "SSA");
+
+         if (def->op == JAY_OPCODE_MOV &&
+             !def->predication &&
+             jay_num_values(def->dst) == 1 &&
+             jay_num_values(def->src[0]) == 1 &&
+             I->src[s].file == def->src[0].file) {
+
+            jay_insert_channel(&b, &I->src[s], c, def->src[0]);
+         }
+      }
+
+      /* Don't propagate into phis yet - TODO: File awareness */
+      if (I->op == JAY_OPCODE_PHI_SRC || I->op == JAY_OPCODE_SEND)
+         continue;
+
+      jay_foreach_ssa_src(I, s) {
+         /* Copy propagate whole vectors */
+         jay_def src = I->src[s];
+         if (src.collect)
+            continue;
+
+         jay_inst *def = defs[jay_base_index(src)];
+         assert(def != NULL && "SSA");
+
+         if (!jay_defs_equivalent(def->dst, src) || def->predication)
+            continue;
+
+         if (def->op == JAY_OPCODE_MOV) {
+            /* Default values must have the same file as their dest, so do
+             * not propagate anything invalid there. Also don't propagate
+             * inverse-ballots. Also only source 0 can read ARF (i.e.
+             * ballotted flags).
+             */
+            if ((I->src[s].file == def->src[0].file) ||
+                ((!jay_inst_has_default(I) ||
+                  &I->src[s] != jay_inst_get_default(I)) &&
+                 !(I->src[s].file == UFLAG && !jay_is_imm(def->src[0])) &&
+                 !(I->src[s].file == FLAG) &&
+                 (s == 0 || !jay_is_flag(def->src[0])) &&
+                 !(jay_is_imm(def->src[0]) && I->src[s].negate))) {
+
+               jay_replace_src(&I->src[s], def->src[0]);
+            }
+         } else if (def->op == JAY_OPCODE_MODIFIER && !jay_uses_flag(def)) {
+            propagate_modifier(I, s, def);
+         } else if (def->op == JAY_OPCODE_NOT && !jay_uses_flag(def)) {
+            propagate_not(I, s, def);
+         }
+      }
+
+      if (I->op == JAY_OPCODE_CMP && propagate_cmod(f, I, defs)) {
+         /* Even if we propagate the predicate write, there might be uses of
+          * the register value (TODO: Maybe check for this and skip
+          * propagating in that case?). So we cannot remove the compare, just
+          * strip the cond flag. Furthermore, the CMP always clobbers some
+          * predicate, so give it an immediately-dead one instead.
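+          *
+          * Hypothetical sketch: given 'r2 = add.u32 r0, r1' and
+          * 'r3, f0 = cmp.u32.ne r2, 0x0', the f0 write folds in as
+          * 'r2, f0 = add.u32.ne r0, r1' while the cmp lives on, now writing
+          * a fresh dead flag, to produce its r3 result.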
+ */ + I->cond_flag = jay_alloc_def(&b, I->cond_flag.file, 1); + continue; + } + } + + free(defs); +} + +static bool +propagate_fsat(jay_inst *I, jay_inst *fsat) +{ + if (fsat->op != JAY_OPCODE_MODIFIER || + fsat->predication || + fsat->src[0].negate || + fsat->src[0].abs || + (fsat->conditional_mod && !jay_opcode_infos[I->op].cmod) || + I->conditional_mod || + I->type != fsat->type || + !jay_type_is_any_float(fsat->type)) + return false; + + /* saturate(saturate(x)) = saturate(x) */ + I->saturate |= fsat->saturate; + I->dst = fsat->dst; + I->cond_flag = fsat->cond_flag; + I->conditional_mod = fsat->conditional_mod; + return true; +} + +static void +propagate_backwards(jay_function *f) +{ + jay_inst **uses = calloc(f->ssa_alloc, sizeof(uses[0])); + BITSET_WORD *multiple = BITSET_CALLOC(f->ssa_alloc); + + jay_foreach_inst_in_func_rev(f, block, I) { + /* Record uses */ + jay_foreach_src_index(I, s, c, ssa_index) { + if (uses[ssa_index]) + BITSET_SET(multiple, ssa_index); + else + uses[ssa_index] = I; + } + + /* TODO: f64 sat propagation */ + if (jay_num_values(I->dst) != 1) + continue; + + assert(jay_is_ssa(I->dst)); + + jay_inst *use = uses[jay_base_index(I->dst)]; + if (!use || BITSET_TEST(multiple, jay_base_index(I->dst))) + continue; + + if (jay_opcode_infos[I->op].sat && + jay_type_is_any_float(I->type) && + propagate_fsat(I, use)) { + + jay_remove_instruction(use); + continue; + } + + /* Fold UGPR->{GPR, FLAG} copies coming out of NIR */ + if (I->type == use->type && + I->op != JAY_OPCODE_PHI_DST && + use->op == JAY_OPCODE_MOV) { + + I->dst = use->dst; + jay_remove_instruction(use); + continue; + } + } + + free(multiple); + free(uses); +} + +JAY_DEFINE_FUNCTION_PASS(jay_opt_propagate_forwards, propagate_forwards) +JAY_DEFINE_FUNCTION_PASS(jay_opt_propagate_backwards, propagate_backwards) diff --git a/src/intel/compiler/jay/jay_print.c b/src/intel/compiler/jay/jay_print.c new file mode 100644 index 00000000000..3b8c3781d20 --- /dev/null +++ b/src/intel/compiler/jay/jay_print.c @@ -0,0 +1,309 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "compiler/brw/brw_eu_defines.h" +#include "util/lut.h" +#include "util/macros.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +#define ENUM_TO_STR(x, arr) \ + ({ \ + assert(x < ARRAY_SIZE(arr)); \ + arr[x]; \ + }) + +static const char *jay_conditional_mod_str[] = { + [JAY_CONDITIONAL_EQ] = ".eq", [JAY_CONDITIONAL_NE] = ".ne", + [JAY_CONDITIONAL_GT] = ".gt", [JAY_CONDITIONAL_LT] = ".lt", + [JAY_CONDITIONAL_GE] = ".ge", [JAY_CONDITIONAL_LE] = ".le", + [JAY_CONDITIONAL_OV] = ".ov", [JAY_CONDITIONAL_NAN] = ".nan", +}; + +static const char *jay_arf_str[] = { + [JAY_ARF_NULL] = "_", + [JAY_ARF_MASK] = "mask", + [JAY_ARF_CONTROL] = "ctrl", + [JAY_ARF_TIMESTAMP] = "timestamp", +}; + +static const char *jay_file_str[JAY_FILE_LAST + 1] = { + [GPR] = "r", [UGPR] = "u", [FLAG] = "f", [UFLAG] = "uf", + [J_ADDRESS] = "a", [ACCUM] = "acc", [UACCUM] = "uacc", [J_ARF] = "arf", + [MEM] = "m", [UMEM] = "um", [TEST_FILE] = "t", +}; + +static const char *jay_base_types[] = { + [JAY_TYPE_U] = "u", [JAY_TYPE_S] = "s", [JAY_TYPE_F] = "f", [JAY_TYPE_BF] = "bf" +}; + +void +jay_print_type(FILE *fp, enum jay_type t) +{ + fprintf(fp, ".%s%u", ENUM_TO_STR(jay_base_type(t), jay_base_types), + jay_type_size_bits(t)); +} + +static void +jay_print_def(FILE *fp, const jay_inst *I, int src) +{ + jay_def def = src == -2 ? I->cond_flag : src == -1 ? 
I->dst : I->src[src]; + unsigned len = jay_num_values(def); + const char *file = ENUM_TO_STR(def.file, jay_file_str); + bool has_lu = jay_is_ssa(def) && !jay_is_null(def) && src >= 0; + unsigned lu_bit = has_lu ? jay_source_last_use_bit(I->src, src) : 0; + + bool has_index = jay_channel(def, 0) != JAY_SENTINEL; + bool has_reg = !def.collect && def.reg && def.file != J_ARF; + + if (jay_is_null(def)) { + has_reg = false; + fprintf(fp, "_"); + } else if (def.file == J_ARF) { + fputs(ENUM_TO_STR(jay_base_index(def), jay_arf_str), fp); + } else if (def.collect) { + assert(has_index && "else would be contiguous"); + fprintf(fp, "("); + for (unsigned i = 0; i < len; ++i) { + if (i) + fprintf(fp, ", "); + + if (jay_channel(def, i)) { + if (has_lu && BITSET_TEST(I->last_use, lu_bit)) + fprintf(fp, "*"); + + fprintf(fp, "%s%u", file, jay_channel(def, i)); + ++lu_bit; + } else { + fprintf(fp, "_"); + } + } + fprintf(fp, ")"); + } else if (has_index) { + fprintf(fp, "%s%s%u", + has_lu && BITSET_TEST(I->last_use, lu_bit) ? "*" : "", file, + jay_channel(def, 0)); + if (len > 1) { + fprintf(fp, ":%s%u", file, jay_channel(def, len - 1)); + } + } + + if (has_reg) { + if (has_index) + fprintf(fp, "("); + + fprintf(fp, "%s%u%s", file, def.reg, def.hi ? "h" : ""); + if (len > 1) { + fprintf(fp, ":%s%u", file, def.reg + len - 1); + } + + if (has_index) + fprintf(fp, ")"); + } +} + +static void +jay_print_src(FILE *fp, jay_inst *I, unsigned s) +{ + jay_def src = I->src[s]; + fprintf(fp, "%s%s", src.negate ? "-" : "", src.abs ? "(abs)" : ""); + + if (jay_is_imm(src)) { + fprintf(fp, "0x%X", jay_as_uint(src)); + if (util_is_probably_float(jay_as_uint(src))) { + float f = uif(jay_as_uint(src)); + fprintf(fp, fabs(f) >= 1000000.0 ? " (%e)" : " (%f)", f); + } + } else { + jay_print_def(fp, I, s); + } +} + +/* XXX: copypaste of brw_print_swsb */ +static void +jay_print_swsb(FILE *f, const struct tgl_swsb swsb) +{ + if (swsb.regdist) { + fprintf(f, "%s@%d", + (swsb.pipe == TGL_PIPE_FLOAT ? "F" : + swsb.pipe == TGL_PIPE_INT ? "I" : + swsb.pipe == TGL_PIPE_LONG ? "L" : + swsb.pipe == TGL_PIPE_ALL ? "A" : + swsb.pipe == TGL_PIPE_MATH ? "M" : + swsb.pipe == TGL_PIPE_SCALAR ? "S" : + ""), + swsb.regdist); + } + + if (swsb.mode) { + if (swsb.regdist) + fprintf(f, " "); + + fprintf(f, "$%d%s", swsb.sbid, + (swsb.mode & TGL_SBID_SET ? "" : + swsb.mode & TGL_SBID_DST ? ".dst" : + ".src")); + } +} + +void +jay_print_inst(FILE *fp, jay_inst *I) +{ + const char *sep = ""; + + if (!jay_is_null(I->dst)) { + jay_print_def(fp, I, -1); + sep = ", "; + } + + if (!jay_is_null(I->cond_flag)) { + fprintf(fp, "%s", sep); + jay_print_def(fp, I, -2); + } + + if (!jay_is_null(I->dst) || !jay_is_null(I->cond_flag)) { + fprintf(fp, " = "); + } + + if (I->predication) { + fprintf(fp, "("); + jay_print_src(fp, I, jay_inst_get_predicate(I) - I->src); + + if (jay_inst_has_default(I)) { + fprintf(fp, "/"); + jay_print_src(fp, I, jay_inst_get_default(I) - I->src); + } + + fprintf(fp, ")"); + } + + if (I->op == JAY_OPCODE_MATH) { + jay_print_inst_info(fp, I, ""); + } else { + fprintf(fp, "%s", jay_opcode_infos[I->op].name); + } + + if (I->type != JAY_TYPE_UNTYPED) { + jay_print_type(fp, I->type); + } + + if (I->op == JAY_OPCODE_BFN) { + fprintf(fp, ".(%s)", util_lut3_to_str[jay_bfn_ctrl(I)]); + } + + const char *cmod = ENUM_TO_STR(I->conditional_mod, jay_conditional_mod_str); + fprintf(fp, "%s%s ", I->saturate ? ".sat" : "", cmod ? 
cmod : ""); + sep = ""; + + for (unsigned i = 0; i < I->num_srcs - I->predication; i++) { + fprintf(fp, "%s", sep); + jay_print_src(fp, I, i); + + enum jay_type T = jay_src_type(I, i); + if (T != I->type && !(T == JAY_TYPE_U1 && jay_is_flag(I->src[i]))) { + jay_print_type(fp, T); + } + + sep = ", "; + } + + if (I->op != JAY_OPCODE_MATH) { + sep = jay_print_inst_info(fp, I, sep); + } + + /* Software scoreboard dependency info */ + if (I->dep.regdist || I->dep.mode) { + fprintf(fp, "%s%s%s", strlen(sep) ? " {" : "{", + I->replicate_dep ? "*" : "", I->decrement_dep ? "+" : ""); + jay_print_swsb(fp, I->dep); + fprintf(fp, "}"); + } + + fprintf(fp, "\n"); +} + +static inline void +indent(FILE *fp, jay_block *block, bool interior) +{ + for (unsigned i = 0; i < block->indent + interior; i++) + fprintf(fp, " "); +} + +static void +comma_separate(FILE *fp, jay_block *block, bool *first) +{ + if (*first) { + indent(fp, block, true); + *first = false; + } else { + fprintf(fp, ", "); + } +} + +void +jay_print_block(FILE *fp, jay_block *block) +{ + indent(fp, block, false); + fprintf(fp, "B%d%s%s", block->index, block->uniform ? " [uniform]" : "", + block->loop_header ? " [loop header]" : ""); + bool first = true; + jay_foreach_predecessor(block, p) { + fprintf(fp, "%s B%d", first ? " <-" : "", (*p)->index); + first = false; + } + fprintf(fp, " {\n"); + + /* We group phi destinations/sources for legibility */ + first = true; + jay_foreach_phi_dst_in_block(block, phi) { + comma_separate(fp, block, &first); + jay_print_def(fp, phi, -1); + } + fprintf(fp, "%s", first ? "" : " = 𝜙\n"); + + jay_foreach_inst_in_block(block, inst) { + if (inst->op != JAY_OPCODE_PHI_DST && inst->op != JAY_OPCODE_PHI_SRC) { + indent(fp, block, true); + jay_print_inst(fp, inst); + } + } + + first = true; + jay_foreach_phi_src_in_block(block, phi) { + comma_separate(fp, block, &first); + fprintf(fp, "𝜙%u = ", jay_phi_src_index(phi)); + jay_print_def(fp, phi, 0); + } + fprintf(fp, "%s", first ? "" : "\n"); + + indent(fp, block, false); + fprintf(fp, "}"); + first = true; + jay_foreach_successor(block, succ) { + if (succ) { + fprintf(fp, "%s B%d", first ? 
" ->" : "", succ->index); + first = false; + } + } + fprintf(fp, "\n\n"); +} + +void +jay_print_func(FILE *fp, jay_function *f) +{ + fprintf(fp, "Jay function: \n\n"); + jay_foreach_block(f, block) { + jay_print_block(fp, block); + } +} + +void +jay_print(FILE *fp, jay_shader *s) +{ + jay_foreach_function(s, f) { + jay_print_func(fp, f); + } +} diff --git a/src/intel/compiler/jay/jay_private.h b/src/intel/compiler/jay/jay_private.h new file mode 100644 index 00000000000..2799eaa7b7b --- /dev/null +++ b/src/intel/compiler/jay/jay_private.h @@ -0,0 +1,72 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "jay_ir.h" +#include "nir.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define JAY_DBG_NOOPT BITFIELD_BIT(0) +#define JAY_DBG_PRINTDEMAND BITFIELD_BIT(1) +#define JAY_DBG_SPILL BITFIELD_BIT(2) +#define JAY_DBG_SYNC BITFIELD_BIT(3) +extern int jay_debug; + +bool jay_nir_lower_bool(nir_shader *nir); +bool jay_nir_opt_sel_zero(nir_shader *nir); +bool jay_nir_lower_fsign(nir_shader *nir); + +void jay_compute_liveness(jay_function *f); +void jay_calculate_register_demands(jay_function *f); + +void jay_spill(jay_function *func, enum jay_file file, unsigned limit); +void jay_partition_grf(jay_shader *shader); +void jay_register_allocate(jay_shader *s); +void jay_assign_flags(jay_shader *s); +void jay_repair_ssa(jay_function *func); + +const char *jay_file_to_string(enum jay_file file); +void jay_print_type(FILE *f, enum jay_type t); +void jay_print_inst(FILE *f, jay_inst *I); +void jay_print_block(FILE *f, jay_block *block); +void jay_print_func(FILE *fp, jay_function *func); +void jay_print(FILE *f, jay_shader *s); + +#ifndef NDEBUG +void jay_validate(jay_shader *s, const char *when); +void jay_validate_ra(jay_function *func); +#else +static inline void +jay_validate(jay_shader *s, const char *when) +{ +} + +static inline void +jay_validate_ra(jay_function *func) +{ +} +#endif + +void jay_opt_propagate_forwards(jay_shader *s); +void jay_opt_propagate_backwards(jay_shader *s); +void jay_opt_dead_code(jay_shader *s); +void jay_opt_control_flow(jay_shader *s); + +void jay_lower_pre_ra(jay_shader *s); +void jay_lower_post_ra(jay_shader *s); +void jay_lower_spill(jay_function *func); +void jay_lower_simd_width(jay_shader *s); +void jay_lower_scoreboard(jay_shader *s); + +struct jay_shader_bin * +jay_to_binary(jay_shader *s, void *const_data, size_t const_data_size); + +#ifdef __cplusplus +} /* extern C */ +#endif diff --git a/src/intel/compiler/jay/jay_register_allocate.c b/src/intel/compiler/jay/jay_register_allocate.c new file mode 100644 index 00000000000..65cbf05c080 --- /dev/null +++ b/src/intel/compiler/jay/jay_register_allocate.c @@ -0,0 +1,1659 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include +#include "util/bitscan.h" +#include "util/bitset.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "util/sparse_bitset.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" +#include "shader_enums.h" + +/** + * Register allocation for Jay shaders. + * + * We use a decoupled register allocation approach. First, we spill values + * until the register demand fits within the size of each register file. + * + * Secondly, we assign registers using a tree-scan algorithm similar to the + * one described in Colombet et al 2011: + * + * Q. Colombet, B. Boissinot, P. Brisk, S. Hack and F. 
Rastello,
+ *   "Graph-coloring and treescan register allocation using repairing,"
+ *   2011 Proceedings of the 14th International Conference on Compilers,
+ *   Architectures and Synthesis for Embedded Systems (CASES), Taipei,
+ *   Taiwan, 2011, pp. 45-54, doi: 10.1145/2038698.2038708.
+ *
+ * We also use a union-find set to construct equivalence classes for phi webs,
+ * and attempt to use the same registers for all values in a class, similar to
+ * the "Aggressive Pre-Coalescing" step described in that paper.
+ *
+ * Finally, we deconstruct SSA.
+ */
+
+static inline bool
+is_ra_src(jay_def d)
+{
+   return d.file < JAY_NUM_RA_FILES;
+}
+
+#define jay_foreach_ra_file(file) \
+   for (enum jay_file file = 0; file < JAY_NUM_RA_FILES; ++file)
+
+#define jay_foreach_ra_src(I, s) \
+   jay_foreach_src(I, s) \
+      if (is_ra_src(I->src[s]) && !jay_is_null(I->src[s]))
+
+static enum jay_stride
+jay_min_stride_for_type(enum jay_type T)
+{
+   unsigned bits = jay_type_size_bits(T);
+
+   /* We need at least enough contiguous bits per-lane to store a scalar */
+   if (bits == 64)
+      return JAY_STRIDE_8;
+   else if (bits == 32)
+      return JAY_STRIDE_4;
+   else
+      return JAY_STRIDE_2;
+}
+
+static enum jay_stride
+jay_max_stride_for_type(enum jay_type T)
+{
+   /* Horizontal stride can be at most 4 */
+   return (jay_type_size_bits(T) >= 16) ? JAY_STRIDE_8 : JAY_STRIDE_4;
+}
+
+static bool
+jay_restrict_mixed_strides(jay_inst *I, unsigned s)
+{
+   /* From the hardware spec section "Register Region Restrictions":
+    *
+    * "In case of all floating point data types used in destination:" and
+    *
+    * "In case where source or destination datatype is 64b or operation is
+    *  integer DWord multiply:" and
+    *
+    * "Src2 Restrictions"
+    *
+    *    Register Regioning patterns where register data bit location
+    *    of the LSB of the channels are changed between source and
+    *    destination are not supported on Src0 and Src1 except for
+    *    broadcast of a scalar.
+    *
+    * Therefore, ban mixed strides in these cases.
+    *
+    * Similarly, SENDs cannot do any regioning, so restrict that too.
+    */
+   return jay_type_is_any_float(I->type) ||
+          jay_type_size_bits(I->type) == 64 ||
+          jay_is_send_like(I) ||
+          I->op == JAY_OPCODE_MUL_32X16 ||
+          I->op == JAY_OPCODE_MUL_32 ||
+          s == 2;
+}
+
+static enum jay_stride
+jay_dst_stride_minmax(jay_inst *I, bool do_max)
+{
+   enum jay_stride min = jay_min_stride_for_type(I->type);
+   enum jay_stride max = jay_max_stride_for_type(I->type);
+
+   /* Destination stride must be equal to the ratio of the sizes of the
+    * execution data type to the destination type
+    */
+   if (I->op == JAY_OPCODE_CVT) {
+      min = MAX2(min, jay_min_stride_for_type(jay_src_type(I, 0)));
+   }
+
+   /* V/UV types are restricted */
+   if (I->op == JAY_OPCODE_SHR_ODD_SUBSPANS_BY_4) {
+      return JAY_STRIDE_2;
+   }
+
+   /* The src2 restriction quoted above effectively implies we should not
+    * stride destinations of 3-source instructions either.
+    */
+   if (jay_num_isa_srcs(I) >= 3) {
+      return min;
+   }
+
+   return (do_max && !jay_restrict_mixed_strides(I, 0)) ? max : min;
+}
+
+static enum jay_stride
+jay_src_stride_minmax(jay_inst *I, unsigned s, bool do_max)
+{
+   enum jay_stride min = jay_min_stride_for_type(jay_src_type(I, s));
+   enum jay_stride max = jay_max_stride_for_type(jay_src_type(I, s));
+
+   /* SENDs cannot do any regioning, so force exactly the types of the sources
+    * regardless of the type of the destination.
+    *
+    * Shuffles could theoretically support regioning, but it would be
+    * nontrivial and probably pointless most of the time.
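+    * Returning the minimum stride pins such sources to their natural packed
+    * layout.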
+ */ + if (jay_is_send_like(I) || jay_is_shuffle_like(I)) { + return min; + } + + /* While "add.u16 r0<2>, r1<4>" is legal, "add.u16 r0, r1<4>" is not. + * Conservatively assume the destination is packed and restrict the source + * stride accordingly. This satisfies the special restrictions. + */ + if (jay_type_size_bits(I->type) <= 16) { + max = JAY_STRIDE_4; + } + + /* "add.u16 r0.8, g1<2>" is not legal. We don't generate this normally yet + * (preferring to burn the upper bits) but it is used internally. + */ + if (I->op == JAY_OPCODE_LANE_ID_EXPAND) { + max = JAY_STRIDE_2; + } + + if (jay_restrict_mixed_strides(I, s) && + jay_type_size_bits(jay_src_type(I, s)) < jay_type_size_bits(I->type)) { + + return jay_dst_stride_minmax(I, do_max); + } + + return (do_max && !jay_restrict_mixed_strides(I, s)) ? max : min; +} + +struct affinity { + /** + * If there is a vector affinity defined for this SSA def, it is relative to + * some representative SSA index. Else 0 if there is no affinity. + */ + uint32_t repr; + + /** If the representative: offset in registers from the base. + * + * If not the representative: offset in registers from the representative. */ + signed offset:4; + + /** + * If true, this value is used in an end-of-thread SEND and requires high + * registers. + */ + bool eot:1; + + /** If true, this UGPR needs full GRF alignment */ + bool grf_align :1; + unsigned align_offs:4; + unsigned padding :22; +}; +static_assert(sizeof(struct affinity) == 8, "packed"); + +struct phi_web_node { + /* Parent index, or circular for root */ + uint32_t parent; + + /* If root, assigned register, or ~0 if no register assigned. */ + uint16_t reg; + + /* Rank, at most log2(n) so need ~5-bits */ + uint16_t rank; + + /* If root, affinity for the whole web */ + struct affinity affinity; +}; +static_assert(sizeof(struct phi_web_node) == 16, "packed"); + +static unsigned +phi_web_find(struct phi_web_node *web, unsigned x) +{ + if (web[x].parent == x) { + /* Root */ + return x; + } else { + /* Search up the tree */ + unsigned root = x; + while (web[root].parent != root) + root = web[root].parent; + + /* Compress path. Second pass ensures O(1) memory usage. */ + while (web[x].parent != x) { + unsigned temp = web[x].parent; + web[x].parent = root; + x = temp; + } + + return root; + } +} + +static void +phi_web_union(struct phi_web_node *web, unsigned x, unsigned y) +{ + x = phi_web_find(web, x); + y = phi_web_find(web, y); + + if (x == y) + return; + + /* Union-by-rank: ensure x.rank >= y.rank */ + if (web[x].rank < web[y].rank) { + SWAP(x, y); + } + + web[y].parent = x; + + /* Increment rank if necessary */ + if (web[x].rank == web[y].rank) { + web[x].rank++; + } +} + +#define NO_REG 0xFFFF + +static inline jay_reg +make_reg(enum jay_file file, uint16_t reg) +{ + return (((uint16_t) file) << 13) | reg; +} + +static inline unsigned +r_reg(jay_reg r) +{ + assert(r != NO_REG); + return r & BITFIELD_MASK(13); +} + +static inline enum jay_file +r_file(jay_reg r) +{ + assert(r != NO_REG); + assert((r >> 13) < JAY_NUM_RA_FILES); + return r >> 13; +} + +static jay_def +def_from_reg(jay_reg r) +{ + return jay_bare_reg(r_file(r), r_reg(r)); +} + +typedef struct jay_ra_state { + /** Size of each register file */ + unsigned num_regs[JAY_NUM_RA_FILES]; + + /** First GPR that may be used for EOT sends */ + unsigned eot_offs; + + /** Phi coalescing data structure */ + struct phi_web_node *phi_web; + + /** + * Global SSA index -> jay_reg map. Unlike reg_for_index, once a register + * is picked it will not be shuffled. 
+ */ + jay_reg *global_reg_for_index; + + /** + * Block currently being processed. ra_state is allocated once per + * function but the following fields are updated as we go through the + * program. This keeps RA linearish time. + */ + jay_block *block; + + /** Builder for inserting shuffle code */ + jay_builder bld; + + /** Local SSA index -> jay_reg map. Only defined for live indices. */ + jay_reg *reg_for_index; + + /** + * Value occupying a register (register -> uint32_t reverse maps) for + * registers that are not available. Undefined for available registers. + */ + uint32_t *index_for_reg[JAY_NUM_RA_FILES]; + + /** Set of registers that are available */ + BITSET_WORD *available_regs[JAY_NUM_RA_FILES]; + + /** + * Within assign_regs_for_inst, the set of registers that have been + * assigned and are therefore pinned. + * + * Invariant: zeroed on entry to assign_regs_for_inst. + */ + BITSET_WORD *pinned[JAY_NUM_RA_FILES]; + + /** Vector affinities for each def. */ + struct affinity *affinities; +} jay_ra_state; + +static inline jay_reg +current_reg(const jay_ra_state *ra, uint32_t index) +{ + assert(index > 0 && index < ra->bld.func->ssa_alloc); + jay_reg reg = ra->reg_for_index[index]; + + assert(reg != NO_REG); + assert(ra->index_for_reg[r_file(reg)][r_reg(reg)] == index); + return reg; +} + +/** (dst, src) pairs for use in parallel copies */ +struct jay_parallel_copy { + jay_reg dst, src; +}; + +static void +add_copy(struct util_dynarray *copies, jay_reg dst, jay_reg src) +{ + if (dst != src) { + assert(r_file(dst) == r_file(src)); + util_dynarray_append(copies, ((struct jay_parallel_copy) { dst, src })); + } +} + +static jay_def +push_temp(jay_builder *b, jay_reg reg, bool stride4) +{ + jay_def tmp = def_from_reg(reg); + + if (stride4 && jay_def_stride(b->shader, tmp) != JAY_STRIDE_4) { + jay_def new = def_from_reg(0); + jay_MOV(b, tmp, new); + tmp = new; + } + + return tmp; +} + +static void +pop_temp(jay_builder *b, struct jay_temp_regs t, jay_def temp) +{ + if (temp.file == GPR && temp.reg != t.gpr) { + jay_MOV(b, temp, def_from_reg(t.gpr)); + } +} + +/* + * Insert a single logical copy. Like jay_MOV but expands to multiple moves + * involving a temporary register in some cases. + */ +static void +mov(jay_builder *b, jay_def dst, jay_def src, struct jay_temp_regs temps) +{ + if (dst.file == MEM && src.file == MEM) { + assert(temps.gpr != NO_REG && "ensured by the spill limit"); + jay_def temp = push_temp(b, temps.gpr, true /* stride4 */); + jay_MOV(b, temp, src); + jay_MOV(b, dst, temp); + pop_temp(b, temps, temp); + } else if (dst.file == UMEM && src.file == UMEM) { + assert(temps.ugpr != NO_REG && "ensured by the spill limit"); + jay_MOV(b, def_from_reg(temps.ugpr), src); + jay_MOV(b, dst, def_from_reg(temps.ugpr)); + } else { + jay_MOV(b, dst, src); + } +} + +/* + * Sequentialize a parallel copy. temps are registers free *before* the + * parallel copy. A temporary might be the destination of a copy, but it + * cannot be the source of any copy (since copying a free register is + * undefined). Therefore it cannot be a part of a cycle, so it is free for use + * (only) when handling cycles, which must happen before sequential copies. + */ +static void +jay_emit_parallel_copies(jay_builder *b, + struct jay_parallel_copy *pcopies, + unsigned num_copies, + struct jay_temp_regs temps) +{ + /* Compact away trivial copies upfront to reduce runtime. 
+    */
+   unsigned new_num_copies = 0;
+   for (unsigned i = 0; i < num_copies; ++i) {
+      assert(r_file(pcopies[i].dst) == r_file(pcopies[i].src));
+
+      if (pcopies[i].dst != pcopies[i].src) {
+         pcopies[new_num_copies++] = pcopies[i];
+      }
+   }
+
+   num_copies = new_num_copies;
+   if (num_copies == 0)
+      return;
+
+   assert(num_copies < UINT16_MAX);
+   BITSET_WORD *done = BITSET_CALLOC(num_copies);
+   uint16_t *reg_use_count[JAY_NUM_RA_FILES];
+   jay_foreach_ra_file(f) {
+      reg_use_count[f] = calloc(b->shader->num_regs[f], sizeof(uint16_t));
+   }
+
+   struct jay_parallel_copy *simple = malloc(num_copies * sizeof(*simple));
+   unsigned num_simple = 0;
+
+#ifndef NDEBUG
+   BITSET_WORD *packed = BITSET_CALLOC(UINT16_MAX);
+
+   if (0) {
+      const char *files = "ruMm";
+      printf("[[\n");
+
+      for (unsigned i = 0; i < num_copies; i++) {
+         printf(" %c%u = %c%u\n", files[r_file(pcopies[i].dst)],
+                r_reg(pcopies[i].dst), files[r_file(pcopies[i].src)],
+                r_reg(pcopies[i].src));
+      }
+
+      printf("]]\n");
+   }
+
+   /**
+    * Assert that each parallel copy destination is unique: no reg can appear
+    * as the destination of two parallel copies.
+    */
+   for (unsigned i = 0; i < num_copies; i++) {
+      assert(!BITSET_TEST(packed, pcopies[i].dst));
+      BITSET_SET(packed, pcopies[i].dst);
+   }
+
+   free(packed);
+#endif
+
+   for (unsigned i = 0; i < num_copies; i++) {
+      ++reg_use_count[r_file(pcopies[i].src)][r_reg(pcopies[i].src)];
+   }
+
+   bool progress;
+   do {
+      progress = false;
+
+      /* Step 1: resolve paths in the transfer graph. This means finding
+       * copies whose destinations aren't blocked by something else and then
+       * emitting them, continuing this process until every copy is blocked
+       * and there are only cycles left.
+       *
+       * TODO: We should note that src is also available in dest to unblock
+       * cycles that src is involved in.
+       */
+      for (unsigned i = 0; i < num_copies; i++) {
+         struct jay_parallel_copy *copy = &pcopies[i];
+
+         if (!BITSET_TEST(done, i) &&
+             reg_use_count[r_file(copy->dst)][r_reg(copy->dst)] == 0) {
+
+            simple[num_simple++] = *copy;
+            BITSET_SET(done, i);
+            --reg_use_count[r_file(copy->src)][r_reg(copy->src)];
+            progress = true;
+         }
+      }
+   } while (progress);
+
+   /* Step 2: resolve cycles through swapping.
+    *
+    * At this point, the transfer graph should consist of only cycles.
+    * The reason is that, given any reg n_1 that's the source of a
+    * remaining entry, it has a destination n_2, which (because every
+    * copy is blocked) is the source of some other copy whose destination
+    * is n_3, and so we can follow the chain until we get a cycle. If we
+    * reached some other node than n_1:
+    *
+    *    n_1 -> n_2 -> ... -> n_i
+    *            ^             |
+    *            |-------------|
+    *
+    * then n_2 would be the destination of 2 copies, which is illegal
+    * (checked above in an assert). So n_1 must be part of a cycle:
+    *
+    *    n_1 -> n_2 -> ... -> n_i
+    *    ^                     |
+    *    |---------------------|
+    *
+    * and this must be the only cycle n_1 is involved in, because any other
+    * path starting from n_1 would also have to end in n_1, resulting in
+    * a node somewhere along the way being the destination of 2 copies
+    * when the 2 paths merge.
+    *
+    * The way we resolve the cycle is through picking a copy (n_1, n_2)
+    * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken
+    * out of the cycle:
+    *
+    *    n_1 -> ... -> n_i
+    *    ^              |
+    *    |--------------|
+    *
+    * and we can keep repeating this until the cycle is empty. After each
+    * swap, we update sources of blocking copies. At that point, every
+    * blocking copy's source should be contained within our destination.
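+    *
+    * Concretely (hypothetical registers): in the cycle r1 -> r2 -> r3 -> r1,
+    * swapping r1 and r2 puts r1's old value in its final home r2; rewriting
+    * the blocked sources leaves the smaller cycle r1 -> r3 -> r1, which one
+    * more swap resolves.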
+ */ + for (unsigned i = 0; i < num_copies; i++) { + struct jay_parallel_copy *copy = &pcopies[i]; + + if (!BITSET_TEST(done, i) && copy->dst != copy->src) { + jay_def dst = def_from_reg(copy->dst), src = def_from_reg(copy->src); + assert(dst.file == src.file); + enum jay_file file = dst.file; + jay_reg tmp = (file == GPR || file == MEM) ? temps.gpr : temps.ugpr; + + if (tmp != NO_REG) { + struct jay_temp_regs t = { .gpr = temps.gpr2, .ugpr = temps.ugpr2 }; + jay_def temp = push_temp(b, tmp, file == MEM /* stride4 */); + { + mov(b, temp, dst, t); + mov(b, dst, src, t); + mov(b, src, temp, t); + } + pop_temp(b, temps, temp); + } else { + jay_SWAP(b, dst, src); + } + + for (unsigned j = 0; j < num_copies; j++) { + if (pcopies[j].src == copy->dst) + pcopies[j].src = copy->src; + } + + /* Simple copies are deferred. Their destinations do not conflict with + * our swaps, but we need to swap their sources to sink. + */ + for (unsigned j = 0; j < num_simple; j++) { + assert(simple[j].dst != copy->src && simple[j].dst != copy->dst); + + if (simple[j].src == copy->src) + simple[j].src = copy->dst; + else if (simple[j].src == copy->dst) + simple[j].src = copy->src; + } + } + + BITSET_SET(done, i); + } + + /* Emit moves after swaps because they fan out and thus increase demand. + * This gives us more freedom around temporaries. The rewrite of simple + * copies above ensures correctness. + * + * Simiarly, we first emit memory-memory copies since those require + * temporaries but only register copies can clobber the temporaries. + */ + for (unsigned i = 0; i < num_simple; i++) { + jay_def dst = def_from_reg(simple[i].dst); + jay_def src = def_from_reg(simple[i].src); + + if (jay_is_mem(dst) && jay_is_mem(src)) { + mov(b, dst, src, temps); + } + } + + for (unsigned i = 0; i < num_simple; i++) { + jay_def dst = def_from_reg(simple[i].dst); + jay_def src = def_from_reg(simple[i].src); + + if (!(jay_is_mem(dst) && jay_is_mem(src))) { + mov(b, dst, src, temps); + + if (temps.gpr == simple[i].dst || temps.gpr == simple[i].src) { + temps.gpr = NO_REG; + } + + if (temps.ugpr == simple[i].dst || temps.ugpr == simple[i].src) { + temps.ugpr = NO_REG; + } + } + } + + jay_foreach_ra_file(f) { + free(reg_use_count[f]); + } + + free(simple); + free(done); +} + +static bool +reg_is_available(jay_ra_state *ra, jay_reg reg) +{ + assert(reg != NO_REG); + return BITSET_TEST(ra->available_regs[r_file(reg)], r_reg(reg)); +} + +static void +assign_reg_for_index(jay_ra_state *ra, uint32_t index, jay_reg reg) +{ + /* Update our data structures */ + ra->reg_for_index[index] = reg; + ra->index_for_reg[r_file(reg)][r_reg(reg)] = index; + BITSET_CLEAR(ra->available_regs[r_file(reg)], r_reg(reg)); + + /* Update the web to the most recent register. Heuristic from Colombet. */ + ra->phi_web[phi_web_find(ra->phi_web, index)].reg = reg; + + /* Post-conditions */ + assert(!reg_is_available(ra, reg)); + assert(current_reg(ra, index) == reg); +} + +static void +release_reg(jay_ra_state *ra, jay_reg reg) +{ + /* Update available_regs only - the reg<-->index maps are invalidated. 
*/ + BITSET_SET(ra->available_regs[r_file(reg)], r_reg(reg)); +} + +static unsigned +register_demand(jay_ra_state *ra, enum jay_file f) +{ + unsigned n = ra->num_regs[f]; + return n - __bitset_prefix_sum(ra->available_regs[f], n, BITSET_WORDS(n)); +} + +static jay_reg +try_find_free_reg(jay_ra_state *ra, enum jay_file file, unsigned except) +{ + unsigned i; + + /* Prefer stride 4 temporaries, since they are more compatible and thus + * should reduce swapping on average. + */ + if (file == GPR) { + BITSET_FOREACH_SET(i, ra->available_regs[file], ra->num_regs[file]) { + if (i != except && + jay_gpr_to_stride(&ra->bld.shader->partition, i) == JAY_STRIDE_4) { + return make_reg(file, i); + } + } + } + + BITSET_FOREACH_SET(i, ra->available_regs[file], ra->num_regs[file]) { + if (i != except) { + return make_reg(file, i); + } + } + + return NO_REG; +} + +static jay_reg +find_free_reg(jay_ra_state *ra, enum jay_file file, unsigned except) +{ + jay_reg reg = try_find_free_reg(ra, file, except); + + if (reg == NO_REG) { + fprintf(stderr, "file %u, current demand %u, target %u\n", file, + register_demand(ra, file), ra->num_regs[file]); + UNREACHABLE("there should have been a free register"); + } + + return reg; +} + +static inline struct jay_temp_regs +find_temp_regs(jay_ra_state *ra) +{ + jay_reg gpr = try_find_free_reg(ra, GPR, ~0); + jay_reg ugpr = try_find_free_reg(ra, UGPR, ~0); + + return (struct jay_temp_regs) { + .gpr = gpr, + .ugpr = ugpr, + .gpr2 = try_find_free_reg(ra, GPR, gpr), + .ugpr2 = try_find_free_reg(ra, UGPR, ugpr), + }; +} + +static unsigned +pick_regs(jay_ra_state *ra, + enum jay_file file, + unsigned size, + unsigned alignment, + enum jay_stride min_stride, + enum jay_stride max_stride, + jay_inst *I, + jay_def var, + jay_def *last_killed, + bool is_src) +{ + struct jay_partition *partition = &ra->bld.shader->partition; + unsigned first = 0, end = ra->num_regs[file]; + unsigned ugpr_per_grf = jay_ugpr_per_grf(ra->bld.shader); + bool must_tie = I->op == JAY_OPCODE_LANE_ID_EXPAND; + must_tie &= !is_src; + + /* Cross-lane access cannot be SIMD split if the source/destination registers + * overlap, but as long as we don't tie those destinations, we're ok. + */ + bool may_tie = !jay_is_shuffle_like(I); + + /* Ensure we do not cross partitions */ + if (file == UGPR && size > 16) { + first = partition->large_ugpr_block.start; + end = partition->large_ugpr_block.start + partition->large_ugpr_block.len; + } + + /* Sources used by end-of-thread sends must be at the end of the file */ + if (I->op == JAY_OPCODE_SEND && jay_send_eot(I)) { + first = ra->eot_offs; + } + + /* If possible, keep sources in place to avoid shuffles. */ + if (is_src && jay_channel(var, 0) != 0) { + unsigned cur = r_reg(ra->reg_for_index[jay_channel(var, 0)]); + enum jay_stride stride = jay_gpr_to_stride(partition, cur); + + if (!BITSET_TEST_COUNT(ra->pinned[file], cur, size) && + util_is_aligned(cur, alignment) && + cur >= first && + cur + size <= end && + (file != GPR || (min_stride <= stride && stride <= max_stride))) { + return cur; + } + } + + unsigned best_cost = UINT32_MAX; + unsigned best_reg = 0; + struct affinity affinity = + ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, 0))].affinity; + + assert(alignment >= size && "alignment must be a multiple of size"); + + for (unsigned r = first; r + size <= end; r += alignment) { + unsigned cost = 0; + bool tied = last_killed && last_killed->reg == r; + enum jay_stride stride = + file == GPR ? jay_gpr_to_stride(partition, r) : min_stride; + + if ((tied ? 
!may_tie : + (must_tie || BITSET_TEST_COUNT(ra->pinned[file], r, size))) || + !(min_stride <= stride && stride <= max_stride)) + continue; + + /* Assigning a stride that is too big may result in SIMDness splitting. + * Model that cost so we prefer packed registers. + */ + cost += stride - min_stride; + + /* If we are used for end-of-thread and it is not in the appropriate + * register, we will need to insert 1 copy per channel at the end. + */ + if (affinity.eot && r < ra->eot_offs) + cost += size; + + /* If there are stricter alignment requirements later, model the cost of + * inserting copies for that. + */ + if (affinity.grf_align && + !util_is_aligned(r - affinity.align_offs, ugpr_per_grf)) + cost += size; + + if (affinity.repr == jay_channel(var, 0)) { + /* If we are the collect representative but the final collect won't + * actually be usable, the whole vector will need to be copied. + */ + if (!util_is_aligned(r - affinity.offset, 8) || + (affinity.eot && r - affinity.offset < ra->eot_offs)) { + cost += 8; + } + } else if (affinity.repr) { + /* If we are used for a collect but not in the right place, we will + * similarly insert copies. + */ + if (ra->reg_for_index[affinity.repr] != NO_REG && + r_reg(ra->reg_for_index[affinity.repr]) != r - affinity.offset) { + + cost += size; + } + } + + for (unsigned c = 0; c < size; ++c) { + unsigned i = r + c; + + /* If the register is unavailable, account for the cost of shuffling */ + if (!BITSET_TEST(ra->available_regs[file], i) && !tied) { + cost++; + + /* ..plus the cost of shuffling back. */ + if (u_sparse_bitset_test(&ra->block->live_out, + ra->index_for_reg[file][i])) + cost++; + } + + /* Model the cost of shuffling for phis */ + if (c < jay_num_values(var)) { + struct phi_web_node *phi_web = + &ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, c))]; + if (phi_web->reg != NO_REG && r_reg(phi_web->reg) != i) { + cost += 2; + } + } + + /* Choosing this register will pin it, leaving it unavailable to later + * smaller sources which will need to be shuffled. Account for those + * moves. + * + * TODO: Faster algorithm. + */ + jay_foreach_src_index(I, s, c, index) { + if (jay_num_values(I->src[s]) < size && + ra->reg_for_index[index] == make_reg(file, i)) { + cost++; + } + } + } + + if (cost < best_cost) { + best_cost = cost; + best_reg = r; + + /* If we find something with 0 cost, we are guaranteed to pick this + * register, so terminate early. This speeds up the search. + */ + if (cost == 0) { + break; + } + } + } + + assert(best_cost != UINT32_MAX && "we always find something"); + assert(best_reg + size <= ra->num_regs[file]); + return best_reg; +} + +struct window { + jay_reg base; + uint16_t length; +}; +static_assert(sizeof(struct window) == 4, "packed"); + +static void +assign_regs_for_inst(jay_ra_state *ra, jay_inst *I) +{ + jay_shader *shader = ra->bld.shader; + jay_def *vars[JAY_MAX_OPERANDS]; + jay_def *last_killed[JAY_NUM_RA_FILES] = { 0 }; + jay_def saved_srcs[JAY_MAX_SRCS]; + struct jay_parallel_copy copies[JAY_MAX_DEF_LENGTH * JAY_MAX_OPERANDS]; + uint32_t eviction_indices[JAY_MAX_DEF_LENGTH * JAY_MAX_OPERANDS]; + unsigned nr_vars = 0, nr_copies = 0; + + /* Gather temporary registers that are free /before/ any shuffling */ + struct jay_temp_regs temp_regs = find_temp_regs(ra); + + /* Save sources so we can get at last-use info even after munging */ + typed_memcpy(saved_srcs, I->src, I->num_srcs); + + /* Gather sources (in order) then destinations. 
This order (with a stable + * sort) ensures we see killed sources before same-size destinations, + * naturally tying the last source to the destination. Predicated default + * values rely on this invariant for correctness. + */ + jay_foreach_ra_src(I, s) { + /* Filter out duplicate scalar sources - they should only be assigned + * once. Duplicated vector sources are lowered away as a precondition. + */ + bool duplicate = false; + if (jay_num_values(I->src[s]) == 1) { + uint32_t index = jay_index(I->src[s]); + + for (unsigned i = 0; i < nr_vars; ++i) { + jay_def var = *(vars[i]); + duplicate |= (jay_num_values(var) == 1 && jay_index(var) == index); + } + } + + if (!duplicate) { + vars[nr_vars++] = &I->src[s]; + + /* Record the old registers as parallel copies to be filled in later. + * Then release the old registers to be reassigned. + */ + jay_foreach_index(I->src[s], _, index) { + jay_reg reg = current_reg(ra, index); + assert(reg != NO_REG); + + eviction_indices[nr_copies] = index; + copies[nr_copies++] = (struct jay_parallel_copy) { .src = reg }; + release_reg(ra, reg); + } + } + } + + if (!jay_is_null(I->dst) && I->dst.file < JAY_NUM_RA_FILES) { + vars[nr_vars++] = &I->dst; + } + + /* Sort variables by size in descending order. We use insertion sort + * because it is stable, adaptive, and faster than mergesort for small n. + * + * Algorithm from CLRS. + */ + for (unsigned i = 1; i < nr_vars; ++i) { + jay_def *pivot = vars[i]; + unsigned j, key = pivot->num_values_m1; + + for (j = i; j > 0 && key > vars[j - 1]->num_values_m1; --j) { + vars[j] = vars[j - 1]; + } + + vars[j] = pivot; + } + + /* Partition `copies` into "source shuffles" and "livethrough shuffles" */ + uint32_t first_eviction_copy = nr_copies; + + /* Choose registers for sources/destinations in order */ + for (unsigned i = 0; i < nr_vars; ++i) { + bool is_src = vars[i] >= I->src; + bool killed = false; + jay_def var = *(vars[i]); + unsigned size = jay_num_values(var); + if (is_src) { + assert(util_is_power_of_two_nonzero(size) && "NPOT sources lowered"); + } else { + size = util_next_power_of_two(size); + } + + unsigned alignment = I->op == JAY_OPCODE_EXPAND_QUAD ? 1 : size; + enum jay_file file = var.file; + enum jay_stride min_stride = JAY_STRIDE_2, max_stride = JAY_STRIDE_8; + + assert(size > 0 && file < JAY_NUM_RA_FILES && "filtered above"); + + if (is_src) { + /* If a source is duplicated, we need to take the most constrained + * version. This matters for 3-src restrictions. + */ + jay_foreach_src(I, s) { + if (jay_defs_equivalent(var, I->src[s])) { + alignment = MAX2(alignment, jay_src_alignment(shader, I, s)); + min_stride = + MAX2(jay_src_stride_minmax(I, s, false), min_stride); + max_stride = MIN2(jay_src_stride_minmax(I, s, true), max_stride); + } + } + + unsigned s = vars[i] - I->src; + + /* Sources are considered killed only if completely killed */ + unsigned lu = jay_source_last_use_bit(saved_srcs, s); + + killed = true; + for (unsigned i = 0; i < size; ++i) { + if (jay_channel(I->src[s], i) == 0 || + !BITSET_TEST(I->last_use, lu + i)) { + killed = false; + break; + } + } + } else { + alignment = MAX2(alignment, jay_dst_alignment(shader, I)); + min_stride = jay_dst_stride_minmax(I, false); + max_stride = jay_dst_stride_minmax(I, true); + } + + /* Choose registers satisfying the constraints and minimizing shuffles */ + unsigned base = + pick_regs(ra, file, size, alignment, min_stride, max_stride, I, var, + is_src ? 
NULL : last_killed[file], is_src); + jay_reg reg = make_reg(file, base); + + /* If we decided to tie, process that */ + if (!is_src && last_killed[file] && last_killed[file]->reg == base) { + /* Fully killed source so we can zero a contiguous range. Note we need + * to use the unpadded size to avoid leaking a register for vec3 + * destinations tied to vec4 sources. + */ + unsigned offs = + jay_source_last_use_bit(saved_srcs, last_killed[file] - I->src); + BITSET_CLEAR_COUNT(I->last_use, offs, jay_num_values(var)); + last_killed[file] = NULL; + } else { + /* Otherwise pin our choice */ + BITSET_SET_COUNT(ra->pinned[file], base, size); + + for (unsigned c = 0; c < size; ++c) { + /* Evict any livethrough value interfering with our choice */ + if (!(is_src && jay_channel(var, c) == 0) && + !reg_is_available(ra, reg + c)) { + uint32_t index = ra->index_for_reg[file][base + c]; + struct jay_parallel_copy copy = { .src = reg + c }; + eviction_indices[nr_copies] = index; + copies[nr_copies++] = copy; + release_reg(ra, reg + c); + } + } + } + + jay_set_reg(vars[i], base); + + jay_foreach_index(var, c, index) { + assign_reg_for_index(ra, index, reg + c); + } + + if (killed) { + last_killed[file] = vars[i]; + } + } + + /* Set .reg late so duplicated scalar sources are handled properly */ + jay_foreach_ra_src(I, s) { + if (I->src[s]._payload != JAY_SENTINEL) { + jay_set_reg(&I->src[s], + r_reg(ra->reg_for_index[jay_channel(I->src[s], 0)])); + } + } + + /* Look up where shuffled sources ended up */ + for (unsigned i = 0; i < first_eviction_copy; ++i) { + copies[i].dst = ra->reg_for_index[eviction_indices[i]]; + } + + /* Assign new registers for evicted values */ + for (unsigned i = first_eviction_copy; i < nr_copies; ++i) { + copies[i].dst = find_free_reg(ra, r_file(copies[i].src), ~0); + assign_reg_for_index(ra, eviction_indices[i], copies[i].dst); + } + + /* Shuffle everything */ + ra->bld.cursor = jay_before_inst(I); + jay_emit_parallel_copies(&ra->bld, copies, nr_copies, temp_regs); + + /* Reset data structures */ + for (unsigned i = 0; i < nr_vars; ++i) { + jay_def var = *(vars[i]); + BITSET_CLEAR_COUNT(ra->pinned[var.file], var.reg, + util_next_power_of_two(jay_num_values(var))); + } + + /* Sources selected for early-kill have had their last_use fields cleared. + * Anything else is late-killed. Release those registers. 
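+    * (An early kill is a fully-killed source that was tied to a destination
+    * above; the tie already cleared its last_use bits, so the loop below
+    * releases only the remaining late kills.)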
+    */
+   unsigned kill_idx = 0;
+   jay_foreach_ssa_src(I, s) {
+      jay_foreach_index(saved_srcs[s], c, idx) {
+         if (is_ra_src(I->src[s]) && BITSET_TEST(I->last_use, kill_idx)) {
+            release_reg(ra, make_reg(I->src[s].file, I->src[s].reg + c));
+         }
+
+         kill_idx++;
+      }
+   }
+}
+
+static void
+local_ra(jay_ra_state *ra, jay_block *block)
+{
+   ra->block = block;
+
+   /* Initialize local data structures based on global state */
+   jay_foreach_ra_file(file) {
+      BITSET_SET_COUNT(ra->available_regs[file], 0, ra->num_regs[file]);
+   }
+
+   U_SPARSE_BITSET_FOREACH_SET(&block->live_in, i) {
+      if (ra->global_reg_for_index[i] != NO_REG) {
+         assign_reg_for_index(ra, i, ra->global_reg_for_index[i]);
+      }
+   }
+
+   /* Assign registers locally */
+   jay_foreach_inst_in_block(block, I) {
+      if (I->op == JAY_OPCODE_PHI_SRC) {
+         break;
+      } else if (I->op == JAY_OPCODE_PHI_DST) {
+         /* Phis are special as we never shuffle them */
+         unsigned index = jay_index(I->dst);
+         jay_reg reg = ra->phi_web[phi_web_find(ra->phi_web, index)].reg;
+
+         if (reg == NO_REG || !reg_is_available(ra, reg)) {
+            reg = find_free_reg(ra, I->dst.file, ~0);
+         }
+
+         assign_reg_for_index(ra, jay_index(I->dst), reg);
+         I->dst.reg = r_reg(reg);
+      } else if (I->op == JAY_OPCODE_PRELOAD) {
+         /* Preloads always get what they want */
+         I->dst.reg = jay_preload_reg(I);
+         jay_reg base = make_reg(I->dst.file, I->dst.reg);
+
+         jay_foreach_comp(I->dst, c) {
+            assert(reg_is_available(ra, base + c) && "preloads always work");
+            assign_reg_for_index(ra, jay_channel(I->dst, c), base + c);
+         }
+      } else {
+         /* For normal instructions, assign registers. */
+         assign_regs_for_inst(ra, I);
+      }
+
+      /* Release registers for destinations that are immediately killed */
+      jay_foreach_index(I->dst, _, index) {
+         if (BITSET_TEST(ra->bld.func->dead_defs, index)) {
+            release_reg(ra, current_reg(ra, index));
+         }
+      }
+
+      if (jay_debug & JAY_DBG_PRINTDEMAND) {
+         printf("(RA) [G:%u\tU:%u] ", register_demand(ra, GPR),
+                register_demand(ra, UGPR));
+         jay_print_inst(stdout, I);
+      }
+   }
+
+   /* Gather temporary registers that are free /before/ any shuffling */
+   struct jay_temp_regs temp_regs = find_temp_regs(ra);
+
+   /* Reconcile local state with the global structures */
+   jay_foreach_ra_file(file) {
+      BITSET_SET_COUNT(ra->available_regs[file], 0, ra->num_regs[file]);
+   }
+
+   /* Extend live ranges for correctness. There might be a better solution,
+    * though.
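+    *
+    * Specifically, the loop below adds the sources of trailing PHI_SRC and
+    * control-flow instructions to live_out, so the end-of-block shuffles
+    * emitted later cannot reuse their registers.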
*/ + jay_foreach_inst_in_block_rev(block, I) { + if (I->op != JAY_OPCODE_PHI_SRC && !jay_op_is_control_flow(I->op)) { + break; + } + + jay_foreach_ra_src(I, s) { + u_sparse_bitset_set(&block->live_out, jay_index(I->src[s])); + } + } + + /* Already assigned global registers need to be shuffled back */ + struct util_dynarray copies = UTIL_DYNARRAY_INIT; + + U_SPARSE_BITSET_FOREACH_SET(&block->live_out, i) { + jay_reg lreg = ra->reg_for_index[i], greg = ra->global_reg_for_index[i]; + + if (lreg != NO_REG && greg != NO_REG) { + add_copy(&copies, greg, lreg); + assign_reg_for_index(ra, i, greg); + } + } + + /* Live-out variables defined in this block need global registers assigned */ + U_SPARSE_BITSET_FOREACH_SET(&block->live_out, i) { + jay_reg reg = ra->reg_for_index[i]; + + if (ra->global_reg_for_index[i] == NO_REG && reg != NO_REG) { + if (!reg_is_available(ra, reg)) { + jay_reg old = reg; + reg = find_free_reg(ra, r_file(reg), ~0); + add_copy(&copies, reg, old); + } + + assign_reg_for_index(ra, i, reg); + ra->global_reg_for_index[i] = reg; + } + } + + /* Gather temporary registers free after shuffling (before phis) */ + block->temps_out = find_temp_regs(ra); + + /* Handle the end of the block */ + ra->bld.cursor = jay_before_block(block); + + jay_foreach_inst_in_block_rev(block, I) { + if (I->op != JAY_OPCODE_PHI_SRC && !jay_op_is_control_flow(I->op)) { + ra->bld.cursor = jay_after_inst(I); + break; + } + + jay_foreach_ra_src(I, s) { + jay_set_reg(&I->src[s], + r_reg(ra->global_reg_for_index[jay_index(I->src[s])])); + } + } + + const unsigned num_pcopies = + util_dynarray_num_elements(&copies, struct jay_parallel_copy); + + jay_emit_parallel_copies(&ra->bld, copies.data, num_pcopies, temp_regs); + util_dynarray_fini(&copies); +} + +/* + * Record all phi webs. First initialize the union-find data structure + * with all SSA defs in their own singletons, then union together anything + * related by a phi. The resulting union-find structure will be the webs. + */ +static void +construct_phi_webs(struct phi_web_node *web, jay_function *f) +{ + for (unsigned i = 0; i < f->ssa_alloc; ++i) { + web[i] = (struct phi_web_node) { .parent = i, .reg = NO_REG }; + } + + jay_foreach_block(f, block) { + jay_foreach_phi_src_in_block(block, phi) { + phi_web_union(web, jay_index(phi->src[0]), jay_phi_src_index(phi)); + } + } +} + +static void +insert_parallel_copies_for_phis(jay_function *f) +{ + jay_reg *phi_dsts = calloc(f->ssa_alloc, sizeof(jay_reg)); + struct util_dynarray copies = UTIL_DYNARRAY_INIT; + memset(phi_dsts, 0xFF, sizeof(jay_reg) * f->ssa_alloc); + + jay_foreach_block(f, block) { + jay_foreach_phi_dst_in_block(block, I) { + phi_dsts[jay_index(I->dst)] = make_reg(I->dst.file, I->dst.reg); + } + } + + jay_foreach_block(f, block) { + jay_builder b = jay_init_builder(f, jay_before_jump(block)); + + /* Copy phi source to phi destination along the edge. 
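+ * Both sides already carry physical registers here (local RA has run), so + * each phi lowers to a plain register move; emitting the moves as parallel + * copies with the block's saved temps resolves any swap cycles.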
*/ + jay_foreach_phi_src_in_block(block, phi) { + jay_reg src = make_reg(phi->src[0].file, phi->src[0].reg); + add_copy(&copies, phi_dsts[jay_phi_src_index(phi)], src); + jay_remove_instruction(phi); + } + + const unsigned nr = + util_dynarray_num_elements(&copies, struct jay_parallel_copy); + + jay_emit_parallel_copies(&b, copies.data, nr, block->temps_out); + util_dynarray_clear(&copies); + } + + util_dynarray_fini(&copies); + free(phi_dsts); +} + +static struct jay_register_block +block_gpr_to_grf(struct jay_partition *p, enum jay_file file, unsigned block) +{ + assert(file == GPR || file == UGPR); + assert(((p->blocks[file][block].start * 16) % p->units_x16[file]) == 0); + assert(((p->blocks[file][block].len * 16) % p->units_x16[file]) == 0); + + return (struct jay_register_block) { + .start = (p->blocks[file][block].start * 16) / p->units_x16[file], + .len = (p->blocks[file][block].len * 16) / p->units_x16[file], + }; +} + +static void +print_partition(struct jay_partition *p) +{ + for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) { + for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) { + struct jay_register_block B = block_gpr_to_grf(p, f, b); + const char *file = f ? "UGPR" : "GPR"; + + if (B.len > 1) { + fprintf(stderr, "%s: %u-%u\n", file, B.start, B.start + B.len - 1); + } else if (B.len == 1) { + fprintf(stderr, "%s: %u\n", file, B.start); + } + } + } + + fprintf(stderr, "\n"); +} + +/* + * Verify that a register partition is a bijective mapping of the GRF file. + */ +static void +validate_partition(struct jay_partition *p, + unsigned stride4_header_size, + unsigned nonuniform_gprs) +{ + BITSET_DECLARE(regs, JAY_NUM_PHYS_GRF) = { 0 }; + + for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) { + for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) { + struct jay_register_block B = block_gpr_to_grf(p, f, b); + if (B.len) { + assert(B.start + B.len <= JAY_NUM_PHYS_GRF && "GRF file size"); + assert(!BITSET_TEST_COUNT(regs, B.start, B.len) && "uniqueness"); + + BITSET_SET_COUNT(regs, B.start, B.len); + } + } + } + + for (unsigned i = 0; i < JAY_NUM_PHYS_GRF; ++i) { + assert(BITSET_TEST(regs, i) && "all GRFs mapped"); + } + + assert(p->large_ugpr_block.len && "partition must have a large UGPR block"); + assert(p->base2 >= p->base8 && p->base_eot >= p->base2 && "monotonic"); + assert(p->base8 >= stride4_header_size && "header is big enough"); + assert(p->base_eot + p->units_x16[GPR] <= nonuniform_gprs && "EOT fits"); + assert(util_is_aligned(p->base8, 8) && "so vectors don't cross"); + assert(util_is_aligned(p->base2, 8) && "so vectors don't cross"); + assert(util_is_aligned(p->base_eot, 8) && "so vectors don't cross"); +} + +static void +build_partition(jay_shader *shader, unsigned *blocks, unsigned n) +{ + unsigned base = 0; + unsigned ugpr_base = 0; + struct jay_partition *p = &shader->partition; + + *p = (struct jay_partition) { + .units_x16[UGPR] = jay_ugpr_per_grf(shader) * 16, + .units_x16[GPR] = 16 / jay_grf_per_gpr(shader), + }; + + for (unsigned i = 0; i < n; ++i) { + enum jay_file file = (i & 1) ? 
GPR : UGPR; + unsigned file_i = i >> 1; + + p->blocks[file][file_i].start = (base * p->units_x16[file]) / 16; + p->blocks[file][file_i].len = (blocks[i] * p->units_x16[file]) / 16; + + if (file == UGPR && blocks[i] >= 8) { + p->large_ugpr_block = (struct jay_register_block) { + .start = (ugpr_base * p->units_x16[file]) / 16, + .len = p->blocks[file][file_i].len, + }; + } + + base += blocks[i]; + if (file == UGPR) { + ugpr_base += blocks[i]; + } + + /* GPR partition blocks must be vector size aligned to avoid crossing */ + if (file == GPR && i != (n - 1)) { + unsigned max_vec = 8; + assert(util_is_aligned(blocks[i], max_vec * jay_grf_per_gpr(shader))); + } + } +} + +/* + * Partition the register file for the entire shader. All functions must + * share the same partition for correctness with non-uniform function calls. + * For unlinked library functions, we must use the ABI partition (TODO). + */ +void +jay_partition_grf(jay_shader *shader) +{ + /* Calculate the maximum register demand across all functions in the shader. + * We will use this to choose a good partition. + */ + struct jay_partition *p = &shader->partition; + unsigned demand[JAY_NUM_GRF_FILES] = { 0 }; + + jay_foreach_function(shader, f) { + jay_compute_liveness(f); + jay_calculate_register_demands(f); + + demand[GPR] = MAX2(demand[GPR], f->demand[GPR]); + demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]); + } + + /* We must have enough register file space for the register payload, plus the + * reserved UGPRs in the case we spill. That UGPR interferes with everything + * we preload so it needs to be reserved specially here for the worst case. + */ + jay_foreach_preload(jay_shader_get_entrypoint(shader), I) { + unsigned end = jay_preload_reg(I) + jay_num_values(I->dst); + unsigned extra = I->dst.file == UGPR ? shader->dispatch_width + 1 : 0; + assert(I->dst.file < JAY_NUM_GRF_FILES); + demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra); + } + + /* Determine a good GPR/UGPR split informed by the demand calculation */ + unsigned ugpr_per_grf = jay_ugpr_per_grf(shader); + unsigned uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf); + + /* We must have enough for SIMD1 images (TODO: Check if this actually + * applies. Or if we could eliminate this with smarter partitioning even.) 
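+ * (Hence min_ugprs below; the MAX2 up to 256 reads like a conservative + * placeholder until that TODO is settled.)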
+ */ + unsigned min_ugprs = 16; + min_ugprs = MAX2(min_ugprs, 256); + + unsigned grf_block_alignment = 8 * jay_grf_per_gpr(shader); /* max_vec */ + + /* TODO: We could partition more cleverly */ + uniform_grfs = CLAMP(align(uniform_grfs, grf_block_alignment), + DIV_ROUND_UP(min_ugprs, ugpr_per_grf), + 128 - (32 * jay_grf_per_gpr(shader))); + unsigned nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs; + + /* Check the split */ + assert((uniform_grfs * ugpr_per_grf) >= min_ugprs); + assert(nonuniform_grfs >= 32 * jay_grf_per_gpr(shader)); + assert((uniform_grfs + nonuniform_grfs) == JAY_NUM_PHYS_GRF); + + /* Partition GRFs between GPR & UGPR */ + unsigned dispatch_grf = 0; + unsigned stride4_header_size = 0; + + if (shader->stage == MESA_SHADER_VERTEX) { + unsigned attrib_grfs = shader->prog_data->vue.urb_read_length * 8; + unsigned blocks[] = { + 1, /* UGPR: g0 */ + 8, /* GPR: URB output handle */ + shader->push_grfs, /* UGPR: Push constants */ + attrib_grfs, /* GPR: Vertex inputs */ + uniform_grfs - (blocks[0] + blocks[2]), /* UGPR: * */ + nonuniform_grfs - (blocks[1] + blocks[3]), /* GPR: * and EOT */ + }; + + build_partition(shader, blocks, ARRAY_SIZE(blocks)); + dispatch_grf = blocks[0] + blocks[1]; + stride4_header_size = blocks[1] + blocks[3]; + } else if (shader->stage == MESA_SHADER_FRAGMENT) { + unsigned len0 = jay_grf_per_gpr(shader); + unsigned blocks[] = { + len0, /* UGPR: g0 (and maybe g1) */ + len0 * 8, /* GPR: Barycentrics */ + uniform_grfs - len0, /* UGPR: Dispatch (eg push constants) & general */ + nonuniform_grfs - (len0 * 8), /* GPR: General & end-of-thread */ + }; + build_partition(shader, blocks, ARRAY_SIZE(blocks)); + dispatch_grf = blocks[0] + blocks[1]; + stride4_header_size = blocks[1]; + } else { + unsigned blocks[] = { uniform_grfs - 4, nonuniform_grfs, 4 }; + build_partition(shader, blocks, ARRAY_SIZE(blocks)); + } + + /* TODO: Make the stride partition smarter */ + unsigned nonuniform_gprs = nonuniform_grfs / jay_grf_per_gpr(shader); + unsigned eot_gprs = 16 / jay_grf_per_gpr(shader); + p->base8 = ROUND_DOWN_TO(nonuniform_gprs - (16 + eot_gprs), 8) + 0; + p->base2 = 8 + p->base8; + p->base_eot = 8 + p->base2; + + // print_partition(p); + validate_partition(p, stride4_header_size, nonuniform_gprs); + + if (shader->stage == MESA_SHADER_FRAGMENT && shader->dispatch_width == 32) { + shader->prog_data->fs.dispatch_grf_start_reg_32 = dispatch_grf; + } else if (shader->stage == MESA_SHADER_FRAGMENT && + shader->dispatch_width == 16) { + shader->prog_data->fs.dispatch_grf_start_reg_16 = dispatch_grf; + } else { + shader->prog_data->base.dispatch_grf_start_reg = dispatch_grf; + } + + /* By construction of our partition, the entire GRF is used. */ + shader->prog_data->base.grf_used = JAY_NUM_PHYS_GRF; + + /* Set the targets for the virtual register file accordingly */ + for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) { + for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) { + shader->num_regs[f] += p->blocks[f][b].len; + } + } + + /* TODO: These are arbitrary. Need to rework somehow, we have options. */ + shader->num_regs[MEM] = 512; + shader->num_regs[UMEM] = 2048; +} + +static void +spill_file(jay_function *f, enum jay_file file, bool *spilled) +{ + unsigned limit = f->shader->num_regs[file]; + + /* If testing spilling, set limit tightly. 
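+ * (13 is an arbitrary stress value; vertex shaders are presumably skipped + * because their preloaded attributes alone would overflow such a tight + * limit.)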
*/ + if ((jay_debug & JAY_DBG_SPILL) && + file == GPR && + f->shader->stage != MESA_SHADER_VERTEX) { + limit = 13; + } + + /* Ensures we don't XOR swap, XXX: TODO: FIXME */ + limit--; + + if (f->demand[file] > limit) { + /* In the worst case, we + * require 2 temporary registers to lower a memory-memory swap produced by + * parallel copy lowering, so adjust the limit to be num_regs - 2. + */ + limit--; + + /* If we spill, we need to reserve UGPRs for spilling */ + if (!(*spilled)) { + unsigned reservation = f->shader->dispatch_width + 1; + f->shader->num_regs[UGPR] -= reservation; + f->shader->partition.large_ugpr_block.len -= reservation; + } + + jay_spill(f, file, limit); + jay_validate(f->shader, "spilling"); + jay_compute_liveness(f); + jay_calculate_register_demands(f); + + if (f->demand[file] > limit) { + fprintf(stderr, "limit %u but demand %u\n", limit, f->demand[file]); + UNREACHABLE("spiller bug"); + } + + *spilled = true; + } +} + +static void +jay_register_allocate_function(jay_function *f) +{ + jay_shader *shader = f->shader; + jay_ra_state ra = { .bld.shader = shader, .bld.func = f }; + + /* Spill as needed to fit within the limits. We spill GPR before UGPR since + * spilling GPRs requires reserving a UGPR. + */ + bool spilled = false; + spill_file(f, GPR, &spilled); + spill_file(f, UGPR, &spilled); + + typed_memcpy(ra.num_regs, shader->num_regs, JAY_NUM_RA_FILES); + + /* The end of the register file is allowed for end-of-thread messages. + * Calculate the offset in GPRs. Compute shaders have this as UGPRs while + * fragment shaders have this as GPRs. + */ + if (mesa_shader_stage_is_compute(shader->stage)) { + ra.eot_offs = ROUND_DOWN_TO(ra.num_regs[UGPR], jay_ugpr_per_grf(shader)) - + jay_ugpr_per_grf(shader); + } else { + ra.eot_offs = ra.num_regs[GPR] - (16 / jay_grf_per_gpr(shader)); + } + + linear_ctx *lin_ctx = linear_context(shader); + + ra.reg_for_index = linear_alloc_array(lin_ctx, jay_reg, f->ssa_alloc); + ra.global_reg_for_index = linear_alloc_array(lin_ctx, jay_reg, f->ssa_alloc); + ra.affinities = linear_zalloc_array(lin_ctx, struct affinity, f->ssa_alloc); + + memset(ra.reg_for_index, 0xFF, sizeof(jay_reg) * f->ssa_alloc); + memset(ra.global_reg_for_index, 0xFF, sizeof(jay_reg) * f->ssa_alloc); + + jay_foreach_ra_file(file) { + const unsigned num_regs = ra.num_regs[file]; + ra.index_for_reg[file] = linear_zalloc_array(lin_ctx, uint32_t, num_regs); + ra.available_regs[file] = BITSET_LINEAR_ZALLOC(lin_ctx, num_regs); + ra.pinned[file] = BITSET_LINEAR_ZALLOC(lin_ctx, num_regs); + } + + ra.phi_web = linear_zalloc_array(lin_ctx, struct phi_web_node, f->ssa_alloc); + + /* Construct the phi equivalence classes using the union-find data + * structure. This associates all SSA values related to the same phi, + * and selects one of them as a canonical/representative value. + */ + construct_phi_webs(ra.phi_web, f); + + jay_foreach_inst_in_func(f, block, I) { + jay_foreach_src_index(I, s, c, index) { + if (jay_num_values(I->src[s]) > 1) { + uint32_t repr = UINT_MAX, repr_c = 0; + + /* Pick the representative with the smallest index, as it most + * likely dominates the other components. + */ + jay_foreach_comp(I->src[s], j) { + if (jay_channel(I->src[s], j) < repr) { + repr = jay_channel(I->src[s], j); + repr_c = j; + } + } + + ra.affinities[index].repr = repr; + ra.affinities[index].offset = repr == index ? 
c : c - repr_c; + } + + if (I->op == JAY_OPCODE_SEND && jay_send_eot(I)) { + ra.affinities[index].eot = true; + } + + if (jay_src_alignment(shader, I, s) >= jay_ugpr_per_grf(shader)) { + ra.affinities[index].grf_align = true; + ra.affinities[index].align_offs = c; + } + + ra.phi_web[phi_web_find(ra.phi_web, index)].affinity = + ra.affinities[index]; + } + } + + jay_foreach_block(f, block) { + local_ra(&ra, block); + } + + linear_free_context(lin_ctx); + + /* Validate the registers we picked before going out of SSA */ + jay_validate_ra(f); + + insert_parallel_copies_for_phis(f); + + /* Lower spills using the UGPRs we stole above. We need to update num_regs + * for correct scoreboarding calculations. + */ + if (spilled) { + jay_lower_spill(f); + f->shader->num_regs[UGPR] += f->shader->dispatch_width + 1; + } +} + +void +jay_register_allocate(jay_shader *s) +{ + jay_foreach_function(s, f) { + jay_register_allocate_function(f); + } + + s->post_ra = true; +} diff --git a/src/intel/compiler/jay/jay_repair_ssa.c b/src/intel/compiler/jay/jay_repair_ssa.c new file mode 100644 index 00000000000..794f3977cdf --- /dev/null +++ b/src/intel/compiler/jay/jay_repair_ssa.c @@ -0,0 +1,247 @@ +/* + * Copyright 2026 Intel Corporation + * Copyright 2023 Alyssa Rosenzweig + * Copyright 2023 Valve Corporation + * Copyright 2022 Collabora Ltd. + * SPDX-License-Identifier: MIT + */ + +/* + * Implementation of "Simple and Efficient + * Construction of Static Single Assignment Form", also by Braun et al. + * https://link.springer.com/content/pdf/10.1007/978-3-642-37051-9_6.pdf + */ + +#include "util/bitset.h" +#include "util/hash_table.h" +#include "util/ralloc.h" +#include "util/u_dynarray.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +struct incomplete_phi { + jay_def old; + unsigned new; +}; + +struct phi { + jay_block *block; + unsigned *src; + jay_def old; + unsigned dst; +}; + +struct ctx { + /* Array of index->index maps with the remapped definition at block end */ + struct hash_table_u64 **defs; + struct hash_table_u64 *remap; + struct util_dynarray phis, indices, *incomplete_phis; + BITSET_WORD *sealed; + void *linctx; + unsigned alloc, idx_i; +}; + +#define jay_repair_foreach_phi(ctx, phi) \ + util_dynarray_foreach(&(ctx)->phis, struct phi, phi) \ + if (phi->block != NULL) + +static unsigned lookup(struct ctx *ctx, jay_block *block, jay_def def); + +static unsigned +remap_idx(struct ctx *ctx, unsigned idx) +{ + /* TODO: Switch to union-find */ + void *remapped; + while ((remapped = _mesa_hash_table_u64_search(ctx->remap, idx))) { + idx = (uintptr_t) remapped; + } + + return idx; +} + +static bool +try_remove_trivial_phi(struct ctx *ctx, struct phi *phi) +{ + unsigned same = 0; + for (unsigned i = 0; i < jay_num_predecessors(phi->block); ++i) { + unsigned src = remap_idx(ctx, phi->src[i]); + if (same && src != same && src != phi->dst) { + /* Nontrivial */ + return false; + } + + if (src != phi->dst) { + same = src; + } + } + + _mesa_hash_table_u64_insert(ctx->remap, phi->dst, (void *) (uintptr_t) same); + phi->block = NULL; + return true; +} + +static void +add_phi(struct ctx *ctx, jay_block *block, jay_def src, unsigned dst) +{ + unsigned i = 0, n = jay_num_predecessors(block); + unsigned *srcs = linear_alloc_array(ctx->linctx, unsigned, n); + jay_foreach_predecessor(block, pred) { + assert(i < n); + srcs[i++] = lookup(ctx, *pred, src); + } + + struct phi tmpl = { .block = block, .old = src, .dst = dst, .src = srcs 
}; + if (!try_remove_trivial_phi(ctx, &tmpl)) { + util_dynarray_append(&ctx->phis, tmpl); + } +} + +static unsigned +lookup(struct ctx *ctx, jay_block *block, jay_def def) +{ + /* Lookup within a block */ + struct hash_table_u64 *ht = ctx->defs[block->index]; + void *local = _mesa_hash_table_u64_search(ht, jay_index(def)); + if (local) { + return (uintptr_t) local; + } + + /* For a single predecessor, we can recurse without adding a phi. */ + bool insert_phi = jay_num_predecessors(block) > 1; + unsigned val = insert_phi ? ctx->alloc++ : + lookup(ctx, jay_first_predecessor(block), def); + + _mesa_hash_table_u64_insert(ctx->defs[block->index], jay_index(def), + (void *) (uintptr_t) val); + + if (block->loop_header && !BITSET_TEST(ctx->sealed, block->index)) { + struct incomplete_phi tmpl = { .old = def, .new = val }; + util_dynarray_append(&ctx->incomplete_phis[block->index], tmpl); + } else if (insert_phi) { + add_phi(ctx, block, def, val); + } + + return val; +} + +static void +remap(struct ctx *ctx, jay_builder *b, jay_def *inout) +{ + jay_def def = *inout; + unsigned reg = def.reg; + jay_foreach_index(def, c, index) { + unsigned el = ctx->idx_i++; + assert(el < util_dynarray_num_elements(&ctx->indices, unsigned)); + unsigned idx = *util_dynarray_element(&ctx->indices, unsigned, el); + idx = remap_idx(ctx, idx); + jay_insert_channel(b, inout, c, jay_scalar(def.file, idx)); + } + + /* We run after flag RA, so preserve flag registers */ + if (jay_is_flag(def)) { + inout->reg = reg; + } +} + +void +jay_repair_ssa(jay_function *func) +{ + jay_builder b = jay_init_builder(func, jay_before_function(func)); + void *memctx = ralloc_context(NULL); + void *linctx = linear_context(memctx); + BITSET_WORD *sealed = BITSET_LINEAR_ZALLOC(linctx, func->num_blocks); + struct ctx ctx = { .sealed = sealed, .alloc = 1, .linctx = linctx }; + unsigned *phi_remap = linear_zalloc_array(linctx, unsigned, func->ssa_alloc); + + ctx.remap = _mesa_hash_table_u64_create(memctx); + ctx.defs = + linear_alloc_array(linctx, struct hash_table_u64 *, func->num_blocks); + ctx.incomplete_phis = + linear_alloc_array(linctx, struct util_dynarray, func->num_blocks); + + jay_foreach_block(func, block) { + ctx.defs[block->index] = _mesa_hash_table_u64_create(memctx); + util_dynarray_init(&ctx.incomplete_phis[block->index], memctx); + } + + util_dynarray_init(&ctx.phis, memctx); + util_dynarray_init(&ctx.indices, memctx); + + jay_foreach_block(func, block) { + jay_foreach_inst_in_block(block, I) { + jay_foreach_src_index(I, s, c, index) { + unsigned val = lookup(&ctx, block, jay_extract(I->src[s], c)); + util_dynarray_append(&ctx.indices, val); + } + + jay_foreach_dst_index(I, d, index) { + unsigned val = ctx.alloc++; + util_dynarray_append(&ctx.indices, val); + if (I->op == JAY_OPCODE_PHI_DST) { + phi_remap[index] = val; + } + + _mesa_hash_table_u64_insert(ctx.defs[block->index], index, + (void *) (uintptr_t) val); + } + } + + /* Seal loop headers after processing the back edge */ + jay_foreach_successor(block, succ) { + if (succ->loop_header && succ->index <= block->index) { + util_dynarray_foreach(&ctx.incomplete_phis[succ->index], + struct incomplete_phi, el) { + add_phi(&ctx, succ, el->old, el->new); + } + + assert(!BITSET_TEST(sealed, succ->index) && "unique backedge"); + BITSET_SET(sealed, succ->index); + } + } + } + + /* Optimize trivial phis resulting from backedges. Use-lists would avoid the + * fixed point algorithm but this should be good enough for now. 
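+ * + * For example, a value defined before a loop and never redefined inside it + * yields phi(x, x) once the backedge is sealed; folding that back to x can + * make another phi trivial in turn, hence the fixed point.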
+ */ + bool progress; + do { + progress = false; + jay_repair_foreach_phi(&ctx, phi) { + progress |= try_remove_trivial_phi(&ctx, phi); + } + } while (progress); + + /* Now apply everything */ + jay_foreach_block(func, block) { + jay_foreach_phi_src_in_block(block, I) { + jay_set_phi_src_index(I, phi_remap[jay_phi_src_index(I)]); + } + + jay_foreach_inst_in_block(block, I) { + jay_foreach_ssa_src(I, s) { + remap(&ctx, &b, &I->src[s]); + } + + remap(&ctx, &b, &I->dst); + remap(&ctx, &b, &I->cond_flag); + } + } + + jay_repair_foreach_phi(&ctx, phi) { + b.cursor = jay_before_block(phi->block); + jay_PHI_DST(&b, jay_scalar(phi->old.file, phi->dst)); + + unsigned i = 0; + jay_foreach_predecessor(phi->block, pred) { + b.cursor = jay_before_jump(*pred); + unsigned idx = remap_idx(&ctx, phi->src[i++]); + jay_PHI_SRC_u32(&b, jay_scalar(phi->old.file, idx), phi->dst); + } + } + + func->ssa_alloc = ctx.alloc; + ralloc_free(memctx); +} diff --git a/src/intel/compiler/jay/jay_simd_width.c b/src/intel/compiler/jay/jay_simd_width.c new file mode 100644 index 00000000000..86a48ba320d --- /dev/null +++ b/src/intel/compiler/jay/jay_simd_width.c @@ -0,0 +1,63 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "jay_ir.h" +#include "jay_opcodes.h" + +static unsigned +max_simd_width(jay_shader *shader, const jay_inst *I) +{ + /* Only certain "complex" quad swizzles require splitting down to SIMD4 */ + if (I->op == JAY_OPCODE_QUAD_SWIZZLE && + (jay_quad_swizzle_swizzle(I) == JAY_QUAD_SWIZZLE_XYXY || + jay_quad_swizzle_swizzle(I) == JAY_QUAD_SWIZZLE_ZWZW)) { + return 4; + } + + /* These special instructions need to be split for various reasons. */ + if (I->op == JAY_OPCODE_EXPAND_QUAD || + I->op == JAY_OPCODE_EXTRACT_LAYER || + I->op == JAY_OPCODE_EXTRACT_BYTE_PER_8LANES || + I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS || + I->op == JAY_OPCODE_MUL_32 || + I->op == JAY_OPCODE_SHUFFLE) { + return 16; + } + + if (I->op != JAY_OPCODE_SEND) { + /* If any source/destination is 64-bit strided, we must split to avoid + * crossing more than 2 GRFs. Note that SENDs don't have this restriction, + * we don't have to split A64 load/store. + */ + if (I->dst.file == GPR && + jay_def_stride(shader, I->dst) == JAY_STRIDE_8) { + return 16; + } + + jay_foreach_src(I, s) { + if (I->src[s].file == GPR && + jay_def_stride(shader, I->src[s]) == JAY_STRIDE_8) { + return 16; + } + } + } else { + /* TODO: Do we ever split SENDs? ..Can we even split SENDs given we don't + * have stride control? How is this supposed to work? + * + * XXX + */ + } + + return 32; +} + +unsigned +jay_simd_split(jay_shader *s, const jay_inst *I) +{ + unsigned actual = jay_simd_width_logical(s, I); + unsigned max = max_simd_width(s, I); + + return (actual > max) ? (util_logbase2(actual) - util_logbase2(max)) : 0; +} diff --git a/src/intel/compiler/jay/jay_spill.c b/src/intel/compiler/jay/jay_spill.c new file mode 100644 index 00000000000..f4c3b85789c --- /dev/null +++ b/src/intel/compiler/jay/jay_spill.c @@ -0,0 +1,849 @@ +/* + * Copyright 2026 Intel Corporation + * Copyright 2023-2024 Alyssa Rosenzweig + * Copyright 2023-2024 Valve Corporation + * Copyright 2022 Collabora Ltd. 
+ * SPDX-License-Identifier: MIT + */ + +#include "util/bitset.h" +#include "util/ralloc.h" +#include "util/sparse_bitset.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "util/u_qsort.h" +#include "util/u_worklist.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* + * An implementation of "Register Spilling and Live-Range Splitting for SSA-Form + * Programs" by Braun and Hack. + * + * Next-use distances are logically in ℤ ∪ {∞}, modelled as saturating uint32 + * and referred to as dist_t. Within a block, next-use data is dense. At block + * boundaries, next-use maps are stored as key-value pairs, where only variables + * with later uses (finite distance) are stored. That sparse representation + * ensures linear-time even for shaders with many blocks. + */ +#define DIST_INFINITY (UINT32_MAX) +typedef uint32_t dist_t; + +struct next_use { + uint32_t index; + dist_t dist; +}; + +static void +add_next_use(struct util_dynarray *nu, unsigned node, dist_t dist) +{ + struct next_use use = { .index = node, .dist = dist }; + util_dynarray_append(nu, use); +} + +#define foreach_next_use(nu, it) util_dynarray_foreach(nu, struct next_use, it) + +static dist_t +add_dist(dist_t A, dist_t B) +{ + return (A + B < A) ? DIST_INFINITY : (A + B); +} + +/* + * Calculate the minimum of two next-use sets. Values absent from one of the + * underlying sets are infinity so do not contribute to the minimum, instead + * acting like a set union. + */ +static bool +minimum_next_uses(struct util_dynarray *nu, + const struct util_dynarray *from, + dist_t *tmp_dist, + struct u_sparse_bitset *tmp_set) +{ + /* Convert "from" to be dense */ + u_sparse_bitset_clear_all(tmp_set); + + foreach_next_use(from, it) { + u_sparse_bitset_set(tmp_set, it->index); + tmp_dist[it->index] = it->dist; + } + + bool progress = false; + + /* Take the minimum of common elements */ + foreach_next_use(nu, it) { + if (u_sparse_bitset_test(tmp_set, it->index)) { + if (tmp_dist[it->index] < it->dist) { + it->dist = tmp_dist[it->index]; + progress = true; + } + + u_sparse_bitset_clear(tmp_set, it->index); + } + } + + /* Add elements that are only in "from" */ + U_SPARSE_BITSET_FOREACH_SET(tmp_set, index) { + add_next_use(nu, index, tmp_dist[index]); + progress = true; + } + + return progress; +} + +static uint32_t +inst_cycles(const jay_inst *I) +{ + return 1; +} + +struct spill_block { + /* W/S sets at the start/end of the block, see spill_ctx::{W,S} */ + struct u_sparse_bitset W_in, W_out, S_in, S_out; + + /* Next-use maps at the start/end of the block */ + struct util_dynarray next_use_in, next_use_out; + + /* Estimated cycle count of the block */ + uint32_t cycles; +}; + +struct spill_ctx { + jay_function *func; + + /* Register file being spilled */ + enum jay_file file; + + /* Set of values whose file equals `file` */ + BITSET_WORD *in_file; + + /* Set of values currently available in the register file */ + struct u_sparse_bitset W; + + /* For W-entry calculation, phis with a spilled source. For + * coupling calculation, phis defined along the given edge. + */ + struct u_sparse_bitset phi_set; + + /* |W| = Current register pressure */ + unsigned nW; + + /* For each variable in N, local IPs of next-use. Else, infinite. */ + struct u_sparse_bitset N; + dist_t *next_uses; + + /* Current local IP relative to the start of the block */ + uint32_t ip; + + /* Set of live values that have been spilled. 
Contrary to the paper, this + * is not a subset of W: the definition in the paper is bogus. + */ + struct u_sparse_bitset S; + + /* If a value is rematerializable or a phi, its definition. Else, NULL */ + jay_inst **defs; + + /* Maximum register pressure allowed */ + unsigned k; + + /* Number of variables */ + unsigned n; + + /* Information on blocks indexed in source order */ + struct spill_block *blocks; + + /* Preallocated array of candidates for calculating W entry */ + struct next_use *candidates; + struct util_dynarray next_ip; +}; + +static inline jay_def +jay_def_as_mem(struct spill_ctx *ctx, jay_def idx) +{ + assert(idx.file == GPR || idx.file == UGPR); + idx.file = idx.file == UGPR ? UMEM : MEM; + idx._payload = jay_base_index(idx) + ctx->n; + return idx; +} + +static bool +can_remat(jay_inst *I) +{ + /* TODO */ + return false; +} + +static bool +can_remat_node(struct spill_ctx *ctx, unsigned node) +{ + return ctx->defs[node] && ctx->defs[node]->op != JAY_OPCODE_PHI_DST; +} + +static jay_inst * +remat_to(jay_builder *b, jay_def dst, struct spill_ctx *ctx, unsigned node) +{ + jay_inst *I = ctx->defs[node]; + assert(can_remat(I)); + + UNREACHABLE("invalid remat"); +} + +static void +insert_spill(jay_builder *b, struct spill_ctx *ctx, unsigned node) +{ + if (!can_remat_node(ctx, node)) { + jay_def idx = jay_scalar(ctx->file, node); + jay_MOV(b, jay_def_as_mem(ctx, idx), idx); + } +} + +static void +insert_reload(struct spill_ctx *ctx, + jay_block *block, + jay_cursor cursor, + unsigned node) +{ + jay_builder b = jay_init_builder(ctx->func, cursor); + jay_def idx = jay_scalar(ctx->file, node); + + /* Reloading breaks SSA, but jay_repair_ssa will repair */ + if (can_remat_node(ctx, node)) { + remat_to(&b, idx, ctx, node); + } else { + jay_MOV(&b, idx, jay_def_as_mem(ctx, idx)); + } +} + +/* Insert into the register file */ +static void +insert_W(struct spill_ctx *ctx, unsigned v) +{ + assert(!u_sparse_bitset_test(&ctx->W, v)); + assert(BITSET_TEST(ctx->in_file, v)); + + u_sparse_bitset_set(&ctx->W, v); + ctx->nW++; +} + +/* Remove from the register file */ +static void +remove_W(struct spill_ctx *ctx, unsigned v) +{ + assert(u_sparse_bitset_test(&ctx->W, v)); + assert(BITSET_TEST(ctx->in_file, v)); + + u_sparse_bitset_clear(&ctx->W, v); + ctx->nW--; +} + +static int +nu_score(struct spill_ctx *ctx, struct next_use nu) +{ + /* We assume that rematerializing - even before every instruction - is + * cheaper than spilling. As long as one of the nodes is rematerializable + * (with distance > 0), we choose it over spilling. Within a class of nodes + * (rematerializable or not), compare by next-use-distance. + */ + bool remat = can_remat_node(ctx, nu.index) && nu.dist > 0; + return (remat ? 0 : 100000) + nu.dist; +} + +static int +cmp_dist(const void *left_, const void *right_, void *ctx) +{ + const struct next_use *left = left_; + const struct next_use *right = right_; + int l = nu_score(ctx, *left), r = nu_score(ctx, *right); + + return (l > r) - (l < r); +} + +/* + * Limit the register file W to maximum size m by evicting registers. + */ +static ATTRIBUTE_NOINLINE void +limit(struct spill_ctx *ctx, jay_inst *I, unsigned m) +{ + /* Nothing to do if we're already below the limit */ + if (ctx->nW <= m) { + return; + } + + /* Gather candidates for eviction. Note that next_uses gives IPs whereas + * cmp_dist expects relative distances. This requires us to subtract ctx->ip + * to ensure that cmp_dist works properly.
Even though logically it shouldn't + * affect the sorted order, practically this matters for correctness with + * rematerialization. See the dist=0 test in cmp_dist. + */ + struct next_use vars[JAY_NUM_UGPR]; + unsigned j = 0; + + U_SPARSE_BITSET_FOREACH_SET(&ctx->W, i) { + assert(ctx->next_uses[i] != DIST_INFINITY && "live in W"); + dist_t dist = ctx->next_uses[i] - ctx->ip; + + assert(j < ARRAY_SIZE(vars)); + vars[j++] = (struct next_use) { .index = i, .dist = dist }; + } + + /* Sort by next-use distance */ + util_qsort_r(vars, j, sizeof(struct next_use), cmp_dist, ctx); + + /* Evict what doesn't fit, inserting a spill for evicted values that we + * haven't spilled before with a future use. + */ + for (unsigned i = m; i < j; ++i) { + if (!u_sparse_bitset_test(&ctx->S, vars[i].index)) { + jay_builder b = jay_init_builder(ctx->func, jay_before_inst(I)); + insert_spill(&b, ctx, vars[i].index); + u_sparse_bitset_set(&ctx->S, vars[i].index); + } + + remove_W(ctx, vars[i].index); + } +} + +/* + * Insert coupling code on block boundaries. This must ensure: + * + * - anything live-in we expect to have spilled is spilled + * - anything live-in we expect to have filled is filled + * - phi sources are spilled if the destination is spilled + * - phi sources are filled if the destination is not spilled + * + * The latter two requirements ensure correct pressure calculations for phis. + */ +static ATTRIBUTE_NOINLINE void +insert_coupling_code(struct spill_ctx *ctx, jay_block *pred, jay_block *succ) +{ + jay_builder b = jay_init_builder(ctx->func, jay_before_function(ctx->func)); + struct spill_block *sp = &ctx->blocks[pred->index]; + struct spill_block *ss = &ctx->blocks[succ->index]; + + /* Insert spill/fill at phi sources to match their destination */ + jay_foreach_phi_src_in_block(pred, phi_src) { + jay_inst *phi_dst = ctx->defs[jay_phi_src_index(phi_src)]; + unsigned src = jay_index(phi_src->src[0]); + + if (phi_src->src[0].file == ctx->file) { + if (jay_is_mem(phi_dst->dst)) { + if (!u_sparse_bitset_test(&sp->S_out, src)) { + /* Spill the phi source. TODO: avoid redundant spills here */ + b.cursor = jay_after_block_logical(pred); + insert_spill(&b, ctx, src); + } + + if (can_remat_node(ctx, jay_index(phi_src->src[0]))) { + jay_def idx = jay_scalar(ctx->file, src); + jay_def tmp = jay_alloc_def(&b, ctx->file, 1); + + b.cursor = jay_before_function(ctx->func); + remat_to(&b, tmp, ctx, src); + jay_MOV(&b, jay_def_as_mem(ctx, idx), tmp); + } + + /* Use the spilled version */ + phi_src->src[0] = jay_def_as_mem(ctx, phi_src->src[0]); + jay_set_phi_src_index(phi_src, jay_index(phi_dst->dst)); + } else if (!u_sparse_bitset_test(&sp->W_out, src)) { + /* Fill the phi source in the predecessor */ + jay_block *reload_block = jay_edge_to_block(pred, succ); + insert_reload(ctx, reload_block, jay_along_edge(pred, succ), src); + } + } + } + + /* Anything assumed to be spilled in succ must be spilled along all edges. */ + U_SPARSE_BITSET_FOREACH_SET(&ss->S_in, v) { + if (!u_sparse_bitset_test(&sp->S_out, v)) { + b.cursor = jay_along_edge(pred, succ); + insert_spill(&b, ctx, v); + } + } + + jay_foreach_phi_dst_in_block(succ, phi) { + u_sparse_bitset_set(&ctx->phi_set, jay_index(phi->dst)); + } + + /* Variables in W at the start of succ must be defined along the edge. + * If not live at the end of the predecessor (and it's not a phi defined in + * the successor), insert a reload. 
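+ * ("Live" above means present in the predecessor's W_out, i.e. resident in + * a register: a value that is live in the dataflow sense but spilled still + * gets reloaded here.)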
+ */ + U_SPARSE_BITSET_FOREACH_SET(&ss->W_in, v) { + if (!u_sparse_bitset_test(&sp->W_out, v) && + !u_sparse_bitset_test(&ctx->phi_set, v)) { + + jay_block *reload_block = jay_edge_to_block(pred, succ); + insert_reload(ctx, reload_block, jay_along_edge(pred, succ), v); + } + } +} + +static dist_t +lookup_next_use(struct spill_ctx *ctx, unsigned v) +{ + return u_sparse_bitset_test(&ctx->N, v) ? ctx->next_uses[v] : DIST_INFINITY; +} + +/* + * Produce an array of next-use IPs relative to the start of the block. This is + * an array of dist_t scalars, representing the next-use IP of each SSA dest + * (right-to-left) and SSA source (left-to-right) of each instruction in the + * block (bottom-to-top). Its size equals the # of SSA sources in the block. + */ +static ATTRIBUTE_NOINLINE void +populate_local_next_use(struct spill_ctx *ctx, jay_block *block) +{ + struct spill_block *sb = &ctx->blocks[block->index]; + unsigned ip = sb->cycles; + + foreach_next_use(&sb->next_use_out, it) { + dist_t d = add_dist(it->dist, ip); + + if (d != DIST_INFINITY) { + u_sparse_bitset_set(&ctx->N, it->index); + ctx->next_uses[it->index] = d; + } + } + + jay_foreach_inst_in_block_rev(block, I) { + ip -= inst_cycles(I); + + jay_foreach_src_index(I, s, c, v) { + if (I->src[s].file == ctx->file) { + if (I->op != JAY_OPCODE_PHI_SRC) { + util_dynarray_append(&ctx->next_ip, lookup_next_use(ctx, v)); + } + + ctx->next_uses[v] = ip; + u_sparse_bitset_set(&ctx->N, v); + } + } + + if (I->dst.file == ctx->file) { + jay_foreach_index_rev(I->dst, _, v) { + util_dynarray_append(&ctx->next_ip, lookup_next_use(ctx, v)); + } + } + } + + assert(ip == 0 && "cycle counting is consistent"); +} + +/* + * Insert spills/fills for a single basic block, following Belady's algorithm. + * Corresponds to minAlgorithm from the paper. + */ +static ATTRIBUTE_NOINLINE void +min_algorithm(struct spill_ctx *ctx, + jay_block *block, + struct spill_block *sb, + dist_t *next_ips, + unsigned next_use_cursor) +{ + jay_foreach_inst_in_block(block, I) { + assert(ctx->nW <= ctx->k && "invariant"); + + /* Phis are special since they happen along the edge. When we initialized + * W and S, we implicitly chose which phis are spilled. So, here we just + * need to rewrite the phis to write into memory. + * + * Phi sources are handled later. + */ + if (I->op == JAY_OPCODE_PHI_DST) { + if (I->dst.file == ctx->file) { + if (!u_sparse_bitset_test(&ctx->W, jay_index(I->dst))) { + u_sparse_bitset_set(&ctx->S, jay_index(I->dst)); + I->dst = jay_def_as_mem(ctx, I->dst); + } + } + + ctx->ip += inst_cycles(I); + continue; + } else if (I->op == JAY_OPCODE_PHI_SRC) { + break; + } + + /* Any source that is not in W needs to be reloaded. Gather the set R of + * such values, and add them to the register file. + */ + unsigned R[JAY_MAX_SRCS], nR = 0; + + jay_foreach_src_index(I, s, c, v) { + if (I->src[s].file == ctx->file && !u_sparse_bitset_test(&ctx->W, v)) { + R[nR++] = v; + insert_W(ctx, v); + + assert(u_sparse_bitset_test(&ctx->S, v) && "must have spilled"); + assert(nR <= ARRAY_SIZE(R) && "maximum source count"); + } + } + + /* Limit W to make space for the operands. + * + * We need to round up to power-of-two destination sizes to match the + * rounding in demand calculation. + */ + bool has_dst = I->dst.file == ctx->file; + unsigned dst_size = util_next_power_of_two(jay_num_values(I->dst)); + limit(ctx, I, ctx->k - (has_dst ?
dst_size : 0)); + + /* Add destinations to the register file */ + if (I->dst.file == ctx->file) { + jay_foreach_index(I->dst, _, index) { + assert(next_use_cursor >= 1); + ctx->next_uses[index] = next_ips[--next_use_cursor]; + + if (ctx->next_uses[index] != DIST_INFINITY) { + insert_W(ctx, index); + } + } + } + + /* Update next-use distances for this instruction. Unlike the paper, we + * require W contain only live values (with finite next-use distance). + * + * This happens after the above limit() calls to model sources as + * late-kill. This is conservative and could be improved, but it matches + * how we currently estimate register demand. + */ + jay_foreach_src_index_rev(I, s, c, node) { + if (I->src[s].file == ctx->file) { + assert(next_use_cursor >= 1); + ctx->next_uses[node] = next_ips[--next_use_cursor]; + + if (ctx->next_uses[node] == DIST_INFINITY) { + remove_W(ctx, node); + } + } + } + + /* Add reloads for the sources in front of the instruction. */ + for (unsigned i = 0; i < nR; ++i) { + insert_reload(ctx, block, jay_before_inst(I), R[i]); + } + + ctx->ip += inst_cycles(I); + + if (jay_debug & JAY_DBG_PRINTDEMAND) { + printf("(SP) %u: ", ctx->nW); + jay_print_inst(stdout, I); + } + } + + assert(next_use_cursor == 0 && "exactly sized"); + + u_sparse_bitset_dup(&sb->W_out, &ctx->W); + u_sparse_bitset_dup(&sb->S_out, &ctx->S); +} + +/* + * TODO: Implement section 4.2 of the paper. + * + * For now, we implement the simpler heuristic in Hack's thesis: sort + * the live-in set (+ destinations of phis) by next-use distance. + */ +static ATTRIBUTE_NOINLINE void +compute_w_entry_loop_header(struct spill_ctx *ctx, jay_block *block) +{ + unsigned j = 0; + /* TODO: Account for phis too! */ + foreach_next_use(&ctx->blocks[block->index].next_use_in, it) { + assert(j < ctx->n); + ctx->candidates[j++] = *it; + } + + /* Take the best candidates sorted by next-use distance */ + unsigned n = MIN2(j, ctx->k - ctx->nW); + if (n < j) { + util_qsort_r(ctx->candidates, j, sizeof(struct next_use), cmp_dist, ctx); + } + + for (unsigned i = 0; i < n; ++i) { + insert_W(ctx, ctx->candidates[i].index); + } +} + +/* + * Compute W_entry for a block. Section 4.2 in the paper. + */ +static ATTRIBUTE_NOINLINE void +compute_w_entry(struct spill_ctx *ctx, jay_block *block) +{ + unsigned j = 0; + + /* Variables that are in all predecessors are assumed in W_entry. Phis and + * variables in some predecessors are scored by next-use. + */ + U_SPARSE_BITSET_FOREACH_SET(&ctx->N, i) { + bool all = true, any = false; + + jay_foreach_predecessor(block, P) { + bool in = u_sparse_bitset_test(&ctx->blocks[(*P)->index].W_out, i); + all &= in; + any |= in; + } + + if (all) { + insert_W(ctx, i); + } else if (any) { + ctx->candidates[j++] = + (struct next_use) { .index = i, .dist = ctx->next_uses[i] }; + } + } + + jay_foreach_predecessor(block, pred) { + jay_foreach_phi_src_in_block(*pred, I) { + if (!u_sparse_bitset_test(&ctx->blocks[(*pred)->index].W_out, + jay_index(I->src[0]))) { + + u_sparse_bitset_set(&ctx->phi_set, jay_phi_src_index(I)); + } + } + } + + /* Heuristic: if any phi source is spilled, spill the phi. While suboptimal, + * this reduces pointless spills/fills with massive phi webs.
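+ * (Phis excluded from the candidate list below never enter W, so + * min_algorithm rewrites them to memory and insert_coupling_code spills + * their sources to match.)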
+ */ + jay_foreach_phi_dst_in_block(block, I) { + if (!u_sparse_bitset_test(&ctx->phi_set, jay_index(I->dst))) { + ctx->candidates[j++] = (struct next_use) { + .index = jay_index(I->dst), + .dist = ctx->next_uses[jay_index(I->dst)], + }; + } + } + + /* Take the best candidates sorted by next-use distance */ + unsigned n = MIN2(j, ctx->k - ctx->nW); + if (n < j) { + util_qsort_r(ctx->candidates, j, sizeof(struct next_use), cmp_dist, ctx); + } + + for (unsigned i = 0; i < n; ++i) { + insert_W(ctx, ctx->candidates[i].index); + } +} + +/* + * We initialize S with the union of S at the exit of (forward edge) + * predecessors and the complement of W, intersected with the live-in set. The + * former propagates S forward. The latter ensures we spill along the edge when + * a live value is not selected for the entry W. + */ +static ATTRIBUTE_NOINLINE void +compute_s_entry(struct spill_ctx *ctx, jay_block *block) +{ + jay_foreach_predecessor(block, pred) { + U_SPARSE_BITSET_FOREACH_SET(&ctx->blocks[(*pred)->index].S_out, v) { + if (u_sparse_bitset_test(&block->live_in, v)) { + u_sparse_bitset_set(&ctx->S, v); + } + } + } + + U_SPARSE_BITSET_FOREACH_SET(&block->live_in, v) { + if (BITSET_TEST(ctx->in_file, v) && !u_sparse_bitset_test(&ctx->W, v)) { + u_sparse_bitset_set(&ctx->S, v); + } + } + + u_sparse_bitset_dup(&ctx->blocks[block->index].S_in, &ctx->S); +} + +static ATTRIBUTE_NOINLINE void +global_next_use_distances(struct spill_ctx *ctx, void *memctx) +{ + u_worklist worklist; + u_worklist_init(&worklist, ctx->func->num_blocks, NULL); + + jay_foreach_block(ctx->func, block) { + struct spill_block *sb = &ctx->blocks[block->index]; + + util_dynarray_init(&sb->next_use_in, memctx); + util_dynarray_init(&sb->next_use_out, memctx); + + jay_foreach_inst_in_block(block, I) { + sb->cycles += inst_cycles(I); + } + + jay_worklist_push_head(&worklist, block); + } + + /* Iterate the work list in reverse order since liveness is backwards */ + while (!u_worklist_is_empty(&worklist)) { + jay_block *block = jay_worklist_pop_head(&worklist); + struct spill_block *sb = &ctx->blocks[block->index]; + + /* Clear locally accessed set (W) */ + u_sparse_bitset_clear_all(&ctx->W); + util_dynarray_clear(&sb->next_use_in); + + uint32_t cycle = 0; + + /* Calculate dists */ + jay_foreach_inst_in_block(block, I) { + /* Record first use before def */ + jay_foreach_src_index(I, s, c, index) { + if (I->src[s].file == ctx->file && + !u_sparse_bitset_test(&ctx->W, index)) { + + add_next_use(&sb->next_use_in, index, cycle); + u_sparse_bitset_set(&ctx->W, index); + } + } + + /* Record defs */ + jay_foreach_index(I->dst, _, index) { + u_sparse_bitset_set(&ctx->W, index); + } + + cycle += inst_cycles(I); + } + + /* Apply transfer function to get our entry state. */ + foreach_next_use(&sb->next_use_out, it) { + if (!u_sparse_bitset_test(&ctx->W, it->index)) { + add_next_use(&sb->next_use_in, it->index, + add_dist(it->dist, sb->cycles)); + } + } + + /* Propagate successor live-in to pred live-out, joining with min */ + jay_foreach_predecessor(block, pred) { + if (minimum_next_uses(&ctx->blocks[(*pred)->index].next_use_out, + &sb->next_use_in, ctx->next_uses, + &ctx->phi_set)) { + jay_worklist_push_tail(&worklist, *pred); + } + } + } + + u_worklist_fini(&worklist); + +#ifndef NDEBUG + /* In debug builds, validate the following invariant: + * + * Next-use distance is finite iff live and in file. 
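+ * + * (The first loop checks "finite implies live and in file" per map entry + * while marking ctx->W; the second reuses ctx->W to verify the converse + * for every live in-file variable.)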
+ */ + jay_foreach_block(ctx->func, blk) { + struct spill_block *sb = &ctx->blocks[blk->index]; + + for (unsigned i = 0; i < 2; i++) { + struct util_dynarray *nu = i ? &sb->next_use_out : &sb->next_use_in; + struct u_sparse_bitset *live = i ? &blk->live_out : &blk->live_in; + + u_sparse_bitset_clear_all(&ctx->W); + + foreach_next_use(nu, it) { + assert(u_sparse_bitset_test(live, it->index) && + BITSET_TEST(ctx->in_file, it->index)); + + u_sparse_bitset_set(&ctx->W, it->index); + } + + U_SPARSE_BITSET_FOREACH_SET(live, i) { + if (BITSET_TEST(ctx->in_file, i)) { + assert(u_sparse_bitset_test(&ctx->W, i)); + } + } + } + } +#endif +} + +void +jay_spill(jay_function *func, enum jay_file file, unsigned k) +{ + void *memctx = ralloc_context(NULL); + void *linctx = linear_context(memctx); + struct spill_ctx ctx = { .func = func, .file = file, .k = k }; + + ctx.n = func->ssa_alloc; + ctx.in_file = BITSET_LINEAR_ZALLOC(linctx, ctx.n); + ctx.defs = linear_zalloc_array(linctx, jay_inst *, ctx.n); + ctx.next_uses = linear_alloc_array(linctx, dist_t, ctx.n); + ctx.candidates = linear_alloc_array(linctx, struct next_use, ctx.n); + ctx.blocks = + linear_zalloc_array(linctx, struct spill_block, func->num_blocks); + + jay_foreach_inst_in_func(func, block, I) { + if (can_remat(I) || I->op == JAY_OPCODE_PHI_DST) { + ctx.defs[jay_index(I->dst)] = I; + } + + if (I->dst.file == file) { + BITSET_SET_COUNT(ctx.in_file, jay_base_index(I->dst), + jay_num_values(I->dst)); + } + } + + u_sparse_bitset_init(&ctx.W, ctx.n, memctx); + u_sparse_bitset_init(&ctx.S, ctx.n, memctx); + u_sparse_bitset_init(&ctx.N, ctx.n, memctx); + u_sparse_bitset_init(&ctx.phi_set, ctx.n, memctx); + util_dynarray_init(&ctx.next_ip, memctx); + + global_next_use_distances(&ctx, memctx); + + /* Reserve a memory variable for every regular variable */ + func->ssa_alloc *= 2; + + jay_foreach_block(func, block) { + ctx.nW = 0; + ctx.ip = 0; + + u_sparse_bitset_clear_all(&ctx.W); + u_sparse_bitset_clear_all(&ctx.S); + u_sparse_bitset_clear_all(&ctx.N); + util_dynarray_clear(&ctx.next_ip); + + populate_local_next_use(&ctx, block); + + struct spill_block *sb = &ctx.blocks[block->index]; + dist_t *next_ips = util_dynarray_element(&ctx.next_ip, dist_t, 0); + unsigned nu_cursor = util_dynarray_num_elements(&ctx.next_ip, dist_t); + + /* Populate next-use with phi destinations, which are not in the + * next_use_in set but are accounted for when computing W_entry. 
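+ * (populate_local_next_use appended destinations bottom-to-top, so phi + * destinations at the top of the block sit at the end of the array and + * are popped first by the cursor.)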
+ */ + jay_foreach_phi_dst_in_block(block, I) { + if (I->dst.file == file) { + assert(nu_cursor >= 1); + ctx.next_uses[jay_index(I->dst)] = next_ips[--nu_cursor]; + u_sparse_bitset_set(&ctx.N, jay_index(I->dst)); + } + } + + if (block->loop_header) { + compute_w_entry_loop_header(&ctx, block); + } else if (jay_num_predecessors(block) /* skip start blocks */) { + compute_w_entry(&ctx, block); + } + + assert(ctx.nW <= ctx.k && "invariant"); + u_sparse_bitset_dup(&sb->W_in, &ctx.W); + + compute_s_entry(&ctx, block); + min_algorithm(&ctx, block, sb, next_ips, nu_cursor); + } + + /* Now that all blocks are processed separately, stitch it together */ + jay_foreach_block(func, block) { + jay_foreach_predecessor(block, pred) { + u_sparse_bitset_clear_all(&ctx.phi_set); + insert_coupling_code(&ctx, *pred, block); + } + } + + ralloc_free(memctx); + + /* Spilling breaks SSA, so we need to repair before validating */ + jay_repair_ssa(func); + jay_validate(func->shader, "Spilling"); + + /* Remat can introduce dead code */ + jay_opt_dead_code(func->shader); +} diff --git a/src/intel/compiler/jay/jay_to_binary.c b/src/intel/compiler/jay/jay_to_binary.c new file mode 100644 index 00000000000..935ae4d2727 --- /dev/null +++ b/src/intel/compiler/jay/jay_to_binary.c @@ -0,0 +1,576 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include +#include "compiler/brw/brw_disasm_info.h" +#include "compiler/brw/brw_eu.h" +#include "compiler/brw/brw_eu_defines.h" +#include "compiler/brw/brw_eu_inst.h" +#include "compiler/brw/brw_reg.h" +#include "compiler/brw/brw_reg_type.h" +#include "dev/intel_debug.h" +#include "util/macros.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "jay.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +static inline enum brw_reg_type +to_brw_reg_type(enum jay_type type) +{ + /* clang-format off */ + switch (type) { + case JAY_TYPE_UNTYPED: + case JAY_TYPE_U8: return BRW_TYPE_UB; + case JAY_TYPE_U16: return BRW_TYPE_UW; + case JAY_TYPE_U32: return BRW_TYPE_UD; + case JAY_TYPE_U64: return BRW_TYPE_UQ; + case JAY_TYPE_S8: return BRW_TYPE_B; + case JAY_TYPE_S16: return BRW_TYPE_W; + case JAY_TYPE_S32: return BRW_TYPE_D; + case JAY_TYPE_S64: return BRW_TYPE_Q; + case JAY_TYPE_F16: return BRW_TYPE_HF; + case JAY_TYPE_F32: return BRW_TYPE_F; + case JAY_TYPE_F64: return BRW_TYPE_DF; + case JAY_TYPE_BF16: return BRW_TYPE_BF; + default: UNREACHABLE("invalid type"); + } + /* clang-format on */ +} + +static inline unsigned +to_def_grf_16(struct jay_partition *p, jay_def d) +{ + unsigned count = jay_num_values(d); + if (count == 0 || !(d.file == GPR || d.file == UGPR)) { + return d.reg; + } + + unsigned base = 0; + for (unsigned i = 0; i < JAY_PARTITION_BLOCKS; ++i) { + unsigned offset = d.reg - base; + + if (offset < p->blocks[d.file][i].len) { + assert(offset + count <= p->blocks[d.file][i].len && + "vectors must not cross partition boundaries"); + + return (p->blocks[d.file][i].start + offset) * 2 + d.hi; + } + + base += p->blocks[d.file][i].len; + } + + UNREACHABLE("virtual register must be in a block"); +} + +static inline brw_reg +to_brw_reg(jay_function *f, + const jay_inst *I, + signed idx, + unsigned simd_offs, + bool force_hi) +{ + bool is_dest = idx < 0; + enum jay_type type = is_dest ? I->type : jay_src_type(I, idx); + jay_def d = is_dest ? 
I->dst : I->src[idx]; + d.hi |= force_hi; + + struct brw_reg R; + unsigned reg = to_def_grf_16(&f->shader->partition, d), offset_B = 0; + + if (jay_is_imm(d)) { + /* Immediates have size restrictions but can zero extend */ + if (jay_type_size_bits(type) == 64) { + type = jay_type_resize(type, 32); + } else if (I->op == JAY_OPCODE_BFN) { + assert(jay_as_uint(d) < UINT16_MAX); + type = JAY_TYPE_U16; + } + + R = brw_imm_ud(jay_as_uint(d)); + } else if (jay_is_null(d)) { + R = brw_null_reg(); + } else if (d.file == UGPR) { + unsigned grf = (reg >> 1) / 8; + offset_B = ((reg >> 1) % 8) * 4; + + if (d.file == UGPR) { + R = brw_ud1_grf(grf, 0); + } else { + R = brw_ud1_reg(ARF, BRW_ARF_ACCUMULATOR + (grf * 2), 0); + } + + /* Handle 3-src restrictions and vectorized uniform code. */ + if (is_dest || jay_num_values(d) >= 8) { + R = vec8(R); + } + + /* Some operations have special restrictions on the destination stride, + * but if we write a single UGPR the stride is ignored. Specify + * whatever stride is needed to satisfy the rules. + */ + if (is_dest) { + /* BSpec 56640 "Special Restrictions" says: + * + * "Conversion between HF and Integer must be DWord-aligned + * and strided by a DWord on the destination." + */ + enum jay_type src0_type = jay_src_type(I, 0); + if ((I->type == JAY_TYPE_F16 && !jay_type_is_any_float(src0_type)) || + (src0_type == JAY_TYPE_F16 && !jay_type_is_any_float(I->type))) { + assert(jay_num_values(d) == 1 && "must not vectorize HF<->Int"); + R = stride(R, 8, 2, 4); + } + + /* Packed floats have restrictions on mixed sizes. Use <2>. */ + if (jay_type_size_bits(I->type) == 16 && + jay_type_size_bits(jay_src_type(I, 0)) != 16) { + assert(jay_num_values(d) == 1 && "must not vectorize mixed float"); + R = stride(R, 4, 2, 2); + } + } + } else if (d.file == GPR) { + enum jay_stride def_stride = jay_def_stride(f->shader, d); + uint32_t type_bits = jay_type_size_bits(type); + unsigned stride_bits = jay_stride_to_bits(def_stride); + unsigned simd_width = jay_simd_width_physical(f->shader, I); + + unsigned grf; + if (def_stride == JAY_STRIDE_2) { + /* Bit 0 selects between lo/hi halves of the GPR */ + grf = (reg / 2) * jay_grf_per_gpr(f->shader); + offset_B = (reg & 1) * 2 * f->shader->dispatch_width; + } else { + /* Low bits are an offset in 2-byte words into the GRF */ + unsigned mask = BITFIELD_MASK(stride_bits / 32); + grf = ((reg & ~mask) / 2) * jay_grf_per_gpr(f->shader); + offset_B = (reg & mask) * 2; + } + + R = byte_offset(xe2_vec8_grf(grf, 0), + simd_offs * simd_width * stride_bits / 8); + + if (stride_bits == (type_bits * 4)) { + R = stride(R, 8, 2, 4); + } else if (stride_bits == (type_bits * 2)) { + R = stride(R, 4, 2, 2); + } else { + assert(stride_bits == type_bits); + } + + /* Broadcast is equivalent to <8, 8, 1> for SIMD1 instructions. Use that + * instead due to regioning restrictions. + */ + if (simd_width == 1) { + R = vec1(R); + } + } else if (jay_is_flag(d)) { + /* Explicit flags act like UGPRs. As sources they broadcast to all lanes, + * so we may ignore the SIMD offset. As destinations, they are written by + * SIMD1 instructions and are never SIMD split.
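+ * + * (E.g. at dispatch width 16 a virtual flag is 16 bits, so virtual flag 1 + * lands at byte offset 2, i.e. brw_flag_subreg(1).)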
+ */ + assert(simd_offs == 0 || idx >= 0); + unsigned offs_B = d.reg * (f->shader->dispatch_width / 8); + R = brw_flag_subreg(offs_B / 2); + } else if (d.file == J_ADDRESS) { + R = brw_address_reg(d.reg); + } else if (d.file == J_ARF) { + R = brw_ud1_reg(ARF, jay_base_index(d), 0); + } else { + UNREACHABLE("unexpected file"); + } + + R.negate = d.negate; + R.abs = d.abs; + return byte_offset(retype(R, to_brw_reg_type(type)), offset_B); +} + +#define SRC(i) to_brw_reg(f, I, i, simd_offs, false) + +#define OP0(hw) \ + case JAY_OPCODE_##hw: \ + brw_##hw(p); \ + break; + +#define OP1(jay, hw) \ + case JAY_OPCODE_##jay: \ + brw_alu1(p, BRW_OPCODE_##hw, dst, SRC(0)); \ + break; + +#define OP2(jay, hw) \ + case JAY_OPCODE_##jay: \ + brw_alu2(p, BRW_OPCODE_##hw, dst, SRC(0), SRC(1)); \ + break; + +#define OP3(jay, hw) \ + case JAY_OPCODE_##jay: \ + brw_alu3(p, BRW_OPCODE_##hw, dst, SRC(0), SRC(1), SRC(2)); \ + break; + +#define OP3_SWAP(jay, hw) \ + case JAY_OPCODE_##jay: \ + brw_alu3(p, BRW_OPCODE_##hw, dst, SRC(2), SRC(1), SRC(0)); \ + break; + +static struct brw_reg +quad_swizzle(struct brw_reg r, const jay_inst *I) +{ + /* clang-format off */ + switch (jay_quad_swizzle_swizzle(I)) { + case JAY_QUAD_SWIZZLE_XXXX: return suboffset(stride(r, 4, 4, 0), 0); + case JAY_QUAD_SWIZZLE_YYYY: return suboffset(stride(r, 4, 4, 0), 1); + case JAY_QUAD_SWIZZLE_ZZZZ: return suboffset(stride(r, 4, 4, 0), 2); + case JAY_QUAD_SWIZZLE_WWWW: return suboffset(stride(r, 4, 4, 0), 3); + case JAY_QUAD_SWIZZLE_XXZZ: return suboffset(stride(r, 2, 2, 0), 0); + case JAY_QUAD_SWIZZLE_YYWW: return suboffset(stride(r, 2, 2, 0), 1); + case JAY_QUAD_SWIZZLE_XYXY: return suboffset(stride(r, 0, 2, 1), 0); + case JAY_QUAD_SWIZZLE_ZWZW: return suboffset(stride(r, 0, 2, 1), 2); + } + /* clang-format on */ + + UNREACHABLE("invalid quad swizzle"); +} + +/* Runs once per SIMD-split, so must not modify the instruction! */ +static void +emit(struct brw_codegen *p, + jay_function *f, + const jay_inst *I, + unsigned simd_offs) +{ + ASSERTED unsigned nr_ins_before = p->nr_insn; + unsigned exec_size = jay_simd_width_physical(f->shader, I); + // jay_print_inst(stdout, (jay_inst *) I); + + /* Fix up SWSB dependencies for SIMD split instructions. The latter + * instructions do not need to redundantly wait on an SBID but might + * replicate their regdists. + */ + struct tgl_swsb dep = + simd_offs && !I->replicate_dep ? tgl_swsb_null() : I->dep; + dep.mode = simd_offs ? TGL_SBID_NULL : dep.mode; + + if (I->decrement_dep) { + unsigned delta = simd_offs * jay_macro_length(I); + assert(dep.regdist > delta); + dep.regdist -= delta; + } + + brw_set_default_exec_size(p, util_logbase2(exec_size)); + brw_set_default_mask_control(p, jay_is_no_mask(I)); + brw_set_default_swsb(p, dep); + brw_set_default_saturate(p, I->saturate); + + /* Quad swizzle can get split down to SIMD4 even on Xe2 where we don't have + * NibCtrl. Fortunately, it's NoMask so it doesn't matter. + */ + if (I->op != JAY_OPCODE_QUAD_SWIZZLE) { + brw_set_default_group(p, simd_offs * exec_size); + } + + /* Grab the hardware predicate, corresponding either to a logical predicate + * or SEL's selector. + */ + const jay_def *pred = I->predication ? jay_inst_get_predicate((void *) I) : + I->op == JAY_OPCODE_SEL ? &I->src[2] : + NULL; + + brw_set_default_predicate_control(p, pred ? 
BRW_PREDICATE_NORMAL : + BRW_PREDICATE_NONE); + brw_set_default_predicate_inverse(p, pred && pred->negate); + + /* Jay/brw enums line up by construction */ + enum brw_conditional_mod cmod = + (enum brw_conditional_mod) I->conditional_mod; + + if (!jay_is_null(I->cond_flag)) { + assert(!(pred && pred->reg != I->cond_flag.reg) && "must be tied"); + pred = &I->cond_flag; + } + + if (pred) { + unsigned reg = pred->reg * jay_phys_flag_per_virt(f->shader); + brw_set_default_flag_reg(p, reg / 2, reg % 2); + } + + if (I->op == JAY_OPCODE_MIN) { + cmod = BRW_CONDITIONAL_L; + } else if (I->op == JAY_OPCODE_MAX) { + cmod = BRW_CONDITIONAL_GE; + } + + struct brw_reg dst = to_brw_reg(f, I, -1, simd_offs, false); + + switch (I->op) { + OP0(ELSE) + OP0(ENDIF) + OP0(WHILE) + OP0(BREAK) + OP1(MOV, MOV) + OP1(MODIFIER, MOV) + OP1(RNDD, RNDD) + OP1(RNDZ, RNDZ) + OP1(RNDE, RNDE) + OP1(FRC, FRC) + OP1(BFREV, BFREV) + OP1(CBIT, CBIT) + OP1(NOT, NOT) + OP1(FBL, FBL) + OP1(FBH, FBH) + OP1(LZD, LZD) + OP2(ROL, ROL) + OP2(AVG, AVG) + OP2(ADD, ADD) + OP2(MUL, MUL) + OP2(SEL, SEL) + OP2(MIN, SEL) + OP2(MAX, SEL) + OP2(MUL_32X16, MUL) + OP2(AND, AND) + OP2(AND_U32_U16, AND) + OP2(OR, OR) + OP2(XOR, XOR) + OP2(ASR, ASR) + OP2(SHR, SHR) + OP2(SHL, SHL) + OP2(BFI1, BFI1) + OP3(BFI2, BFI2) + OP3(ADD3, ADD3) + OP3(CSEL, CSEL) + OP3(DP4A_UU, DP4A) + OP3(DP4A_SS, DP4A) + OP3(DP4A_SU, DP4A) + OP3_SWAP(MAD, MAD) + OP3_SWAP(BFE, BFE) + + case JAY_OPCODE_LOOP_ONCE: + /* TODO: Is there a better way to do this? */ + brw_BREAK(p); + brw_WHILE(p); + break; + + case JAY_OPCODE_IF: + brw_IF(p, util_logbase2(exec_size)); + break; + + case JAY_OPCODE_MATH: + gfx6_math(p, dst, jay_math_op(I), SRC(0), + retype(brw_null_reg(), to_brw_reg_type(I->type))); + break; + + case JAY_OPCODE_BFN: + brw_BFN(p, dst, SRC(0), SRC(1), SRC(2), brw_imm_ud(jay_bfn_ctrl(I))); + break; + + case JAY_OPCODE_DESWIZZLE_16: + brw_set_default_exec_size(p, BRW_EXECUTE_16); + brw_MOV(p, retype(xe2_vec8_grf(jay_deswizzle_16_dst(I), 0), BRW_TYPE_UD), + retype(xe2_vec8_grf(jay_deswizzle_16_src(I), 0), BRW_TYPE_UD)); + break; + + case JAY_OPCODE_CVT: { + unsigned index = jay_cvt_index(I); + bool force_hi = false; + + /* We apply a suboffset to select the specific subword being converted. + * When the source has a subword (16-bit) stride, accesses to the upper + * half will instead land in a discontiguous GRF, so we have to fix up. + * This affects u8->u32 conversions.
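+ *
+ * For example, converting byte 2 of a u8 source laid out with a 2-byte
+ * stride: index_B = 2 >= stride_B = 2, so the byte lives in the hi half;
+ * we set force_hi and the in-half index becomes (2 % 2) / 1 = 0.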
+ */ + if (I->src[0].file == GPR) { + unsigned type_size_B = jay_type_size_bits(jay_cvt_src_type(I)) / 8; + unsigned index_B = index * type_size_B; + unsigned stride_B = + jay_stride_to_bits(jay_def_stride(f->shader, I->src[0])) / 8; + + if (index_B >= stride_B) { + assert(stride_B == 2 && index_B <= 4 && !I->src[0].hi); + force_hi = true; + index = (index_B % stride_B) / type_size_B; + } + } + + brw_MOV(p, dst, + suboffset(to_brw_reg(f, I, 0, simd_offs, force_hi), index)); + break; + } + + case JAY_OPCODE_SYNC: + brw_SYNC(p, jay_sync_op(I)); + break; + + case JAY_OPCODE_CMP: + brw_CMP(p, dst, I->conditional_mod, SRC(0), SRC(1)); + break; + + case JAY_OPCODE_MOV_IMM64: + brw_MOV(p, dst, brw_imm_u64(jay_mov_imm64_imm(I))); + break; + + case JAY_OPCODE_RELOC: + brw_MOV_reloc_imm(p, dst, BRW_TYPE_UD, jay_reloc_param(I), + jay_reloc_base(I)); + break; + + case JAY_OPCODE_QUAD_SWIZZLE: + brw_MOV(p, dst, quad_swizzle(SRC(0), I)); + break; + + case JAY_OPCODE_BROADCAST_IMM: + brw_MOV(p, dst, get_element(SRC(0), jay_broadcast_imm_lane(I))); + break; + + case JAY_OPCODE_SEND: + brw_SEND(p, jay_send_sfid(I), dst, SRC(2), SRC(3), SRC(0), SRC(1), + jay_send_ex_desc_imm(I), jay_send_ex_mlen(I), + jay_send_bindless(I), jay_send_eot(I), false /* gather */); + if (jay_send_check_tdr(I)) { + brw_eu_inst_set_opcode(p->isa, brw_eu_last_inst(p), BRW_OPCODE_SENDC); + } + break; + + /* Gfx20+ has separate Render Target Array indices for each pair of subspans + * to support multiple polygons, so we need a <1;8,0> region to select the + * word for each channel. + */ + case JAY_OPCODE_EXTRACT_LAYER: + brw_AND(p, dst, stride(retype(SRC(simd_offs), BRW_TYPE_UW), 1, 8, 0), + brw_imm_uw(0x7ff)); + break; + + case JAY_OPCODE_EXPAND_QUAD: + brw_MOV(p, dst, stride(SRC(simd_offs), 1, 4, 0)); + break; + + case JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS: + brw_set_default_exec_size(p, BRW_EXECUTE_32); + brw_set_default_group(p, 0); + brw_ADD(p, retype(dst, BRW_TYPE_UW), retype(SRC(0), BRW_TYPE_UW), + brw_imm_uv(0x11100100)); + break; + + case JAY_OPCODE_LANE_ID_8: + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_MOV(p, dst, brw_imm_uv(0x76543210)); + break; + + case JAY_OPCODE_LANE_ID_EXPAND: + brw_set_default_exec_size(p, util_logbase2(jay_lane_id_expand_width(I))); + brw_ADD(p, suboffset(dst, jay_lane_id_expand_width(I)), SRC(0), + brw_imm_uw(jay_lane_id_expand_width(I))); + break; + + case JAY_OPCODE_EXTRACT_BYTE_PER_8LANES: + brw_MOV(p, dst, stride(retype(SRC(simd_offs), BRW_TYPE_UB), 1, 8, 0)); + break; + + case JAY_OPCODE_SHR_ODD_SUBSPANS_BY_4: + brw_SHR(p, dst, SRC(0), brw_imm_uv(0x44440000)); + break; + + case JAY_OPCODE_MUL_32: { + brw_MUL(p, retype(brw_acc_reg(1), to_brw_reg_type(I->type)), SRC(0), + subscript(SRC(1), BRW_TYPE_UW, 0)); + + brw_set_default_swsb(p, tgl_swsb_null()); + brw_alu2(p, jay_mul_32_high(I) ?
BRW_OPCODE_MACH : BRW_OPCODE_MACL, dst, + SRC(0), SRC(1)); + break; + } + + case JAY_OPCODE_SHUFFLE: { + struct brw_reg a0 = brw_address_reg(0); + unsigned grf_16 = to_def_grf_16(&f->shader->partition, I->src[0]); + unsigned offset_B = grf_16 * 2 * f->shader->dispatch_width; + + brw_ADD(p, a0, subscript(SRC(1), BRW_TYPE_UW, 0), brw_imm_uw(offset_B)); + brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), BRW_TYPE_UD)); + break; + } + + default: + jay_print_inst(stderr, (jay_inst *) I); + UNREACHABLE("Unhandled opcode"); + } + + if (cmod != BRW_CONDITIONAL_NONE) { + brw_eu_inst_set_cond_modifier(p->devinfo, brw_eu_last_inst(p), cmod); + } + + assert(p->nr_insn == (nr_ins_before + jay_macro_length(I)) && + "Jay instructions must map 1:n to GEN instructions"); +} + +struct jay_shader_bin * +jay_to_binary(jay_shader *s, void *const_data, size_t const_data_size) +{ + struct jay_shader_bin *bin = rzalloc(s, struct jay_shader_bin); + + struct util_dynarray prog; + util_dynarray_init(&prog, bin); + + struct brw_isa_info isa; + struct brw_codegen p; + + brw_init_isa_info(&isa, s->devinfo); + brw_init_codegen(&isa, &p, bin); + int start_offset = p.next_insn_offset; + + /* TODO: Multifunction properly */ + jay_foreach_function(s, f) { + jay_foreach_block(f, block) { + if (block->loop_header) { + brw_DO(&p, 0); + } + + jay_foreach_inst_in_block(block, I) { + for (unsigned i = 0; i < (1 << jay_simd_split(s, I)); ++i) { + emit(&p, f, I, i); + } + } + } + } + + int final_halt_offset = -1 /* TODO */; + brw_set_uip_jip(&p, start_offset, final_halt_offset); + + struct disasm_info *disasm = disasm_initialize(p.isa, NULL); + + disasm_new_inst_group(disasm, 0); + disasm_new_inst_group(disasm, p.next_insn_offset); + + UNUSED bool valid = true; +#ifndef NDEBUG + valid = + brw_validate_instructions(p.isa, p.store, 0, p.next_insn_offset, disasm); +#endif + + brw_compact_instructions(&p, start_offset, disasm); + + if (INTEL_DEBUG(intel_debug_flag_for_shader_stage(s->stage)) || !valid) { + dump_assembly(p.store, 0, p.next_insn_offset, disasm, NULL, stdout); + } + + if (!valid) { + UNREACHABLE("invalid assembly"); + } + + struct brw_stage_prog_data *prog_data = &s->prog_data->base; + + assert(prog_data->const_data_size == 0); + if (const_data_size > 0) { + prog_data->const_data_size = const_data_size; + prog_data->const_data_offset = + brw_append_data(&p, const_data, const_data_size, 32); + } + + bin->kernel = brw_get_program(&p, &bin->size); + s->prog_data->base.relocs = + brw_get_shader_relocs(&p, &s->prog_data->base.num_relocs); + + return bin; +} diff --git a/src/intel/compiler/jay/jay_validate.c b/src/intel/compiler/jay/jay_validate.c new file mode 100644 index 00000000000..7a3a6953fb7 --- /dev/null +++ b/src/intel/compiler/jay/jay_validate.c @@ -0,0 +1,328 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +#ifndef NDEBUG + +enum validate_block_state { + STATE_PHI_DST, + STATE_NORMAL, + STATE_LATE, +}; + +struct validate_state { + bool failed; + bool post_ra; + const char *when; + jay_inst *I; + jay_block *block; + jay_function *func; + BITSET_WORD *defs; + enum jay_file *files; + enum validate_block_state block_state; +}; + +static enum validate_block_state +block_state_for_inst(jay_inst *I) +{ + if (I->op == JAY_OPCODE_PHI_DST || I->op == JAY_OPCODE_PRELOAD) { + return STATE_PHI_DST; + } else if (I->op == JAY_OPCODE_PHI_SRC || + (jay_op_is_control_flow(I->op) && I->op != JAY_OPCODE_ELSE)) { + return 
STATE_LATE; + } else { + return STATE_NORMAL; + } +} + +static void +chirp(struct validate_state *validate, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + + if (!validate->failed) { + fprintf(stderr, "jay shader validation failed (after %s):\n", + validate->when); + validate->failed = true; + } + if (validate->I) { + fprintf(stderr, + " invalid instruction in block %d: ", validate->block->index); + jay_print_inst(stderr, validate->I); + } + fprintf(stderr, " "); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n\n"); + + va_end(args); +} + +#define CHECK(cond) \ + if (!(cond)) { \ + chirp(validate, "assertion failed at %s:%u\n %s", __FILE__, __LINE__, \ + #cond); \ + } + +static void +validate_flagness(struct validate_state *validate, + jay_def def, + enum jay_type type, + const char *name) +{ + CHECK(type != JAY_TYPE_U1 || jay_is_flag(def) || jay_is_null(def)); +} + +static unsigned +get_src_words(struct validate_state *validate, jay_inst *I, unsigned s) +{ + if (I->op == JAY_OPCODE_EXPAND_QUAD) { + return 4; + } + + bool vectorized = I->dst.file == UGPR && + jay_num_values(I->dst) > jay_type_vector_length(I->type) && + I->op != JAY_OPCODE_SEND && + jay_num_values(I->src[s]) > 1; + + unsigned elsize = jay_type_vector_length(jay_src_type(I, s)); + unsigned words = elsize * (vectorized ? jay_num_values(I->dst) : 1); + + if (vectorized && I->src[s].file == GPR) { + CHECK(words == validate->func->shader->dispatch_width); + return 1; + } else { + return words; + } +} + +/* + * Validate the fundamental invariants of static single assignment form. + */ +static void +validate_ssa(struct validate_state *validate, jay_inst *I) +{ + jay_foreach_src_index(I, src_index, _, ssa_index) { + CHECK(BITSET_TEST(validate->defs, ssa_index) && "defs dominate uses"); + CHECK(validate->files[ssa_index] == I->src[src_index].file && + "consistent files"); + } + + jay_foreach_dst_index(I, d, ssa_index) { + CHECK(!BITSET_TEST(validate->defs, ssa_index) && "single definition"); + BITSET_SET(validate->defs, ssa_index); + validate->files[ssa_index] = d.file; + } +} + +/* + * Validate the invariants of jay_def. + */ +static void +validate_def(struct validate_state *validate, jay_def def, const char *kind) +{ + CHECK(!jay_is_null(def) || !def.reg); + + if (def.collect) { + CHECK(jay_num_values(def) >= 2); + CHECK(def.file == GPR || def.file == UGPR); + + bool contiguous = true; + jay_foreach_comp(def, c) { + uint32_t index = jay_channel(def, c); + contiguous &= index == (jay_channel(def, 0) + c); + CHECK(index != JAY_SENTINEL); + } + + CHECK(!contiguous); + } else if (def.file == J_IMM) { + CHECK(!def.reg); + CHECK(!def.num_values_m1); + CHECK(!def.negate); + CHECK(!def.abs); + } else if (def.file == ACCUM || def.file == UACCUM || def.hi) { + CHECK(validate->post_ra); + } else { + CHECK(jay_base_index(def) != JAY_SENTINEL || validate->post_ra); + } + + if (jay_is_ssa(def) && jay_channel(def, 0) != JAY_SENTINEL) { + jay_foreach_comp(def, c) { + CHECK(jay_channel(def, c) < validate->func->ssa_alloc); + } + } + + CHECK(jay_num_values(def) == 1 || !jay_is_flag(def)); +} + +/** + * Validate an instruction. + */ +static void +validate_inst(struct validate_state *validate, jay_inst *I) +{ + validate->I = I; + + /* Block states are monotonic. 
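+ * PHI_DST/PRELOAD must open the block, ordinary instructions follow, and
+ * PHI_SRC plus control flow (other than ELSE) close it, so the state
+ * computed per instruction may only increase.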
*/ + enum validate_block_state state = block_state_for_inst(I); + CHECK(state >= validate->block_state); + validate->block_state = state; + + const struct jay_opcode_info *opinfo = &jay_opcode_infos[I->op]; + + validate_def(validate, I->dst, "dst"); + validate_def(validate, I->cond_flag, "cond_flag"); + + jay_foreach_src(I, s) { + validate_def(validate, I->src[s], "source"); + } + + if (!validate->post_ra) { + validate_ssa(validate, I); + } + + CHECK(I->num_srcs <= JAY_MAX_SRCS); + + validate_flagness(validate, I->dst, I->type, "destination"); + validate_flagness(validate, I->cond_flag, JAY_TYPE_U1, "cond_flag"); + + CHECK(!I->conditional_mod || + !jay_is_null(I->cond_flag) || + I->op == JAY_OPCODE_CSEL); + + /* These assumptions are baked into the definition of broadcast_flag and + * required to ensure correctness with the lane masking. + */ + CHECK(!I->broadcast_flag || + (!jay_is_null(I->cond_flag) && + jay_is_null(I->dst) && + I->cond_flag.file == FLAG && + (I->op == JAY_OPCODE_CMP || I->op == JAY_OPCODE_MOV))); + + /* Standard modifiers only allowed on some instructions */ + CHECK(!I->conditional_mod || opinfo->cmod || I->op == JAY_OPCODE_CSEL); + CHECK(!I->saturate || opinfo->sat); + + unsigned num_srcs = I->num_srcs; + + if (I->predication) { + CHECK(num_srcs >= I->predication); + + if (jay_inst_has_default(I)) { + CHECK(jay_inst_get_default(I)->file == I->dst.file); + } + + CHECK(jay_is_flag(*jay_inst_get_predicate(I))); + CHECK(!jay_is_null(*jay_inst_get_predicate(I))); + + num_srcs -= I->predication; + } + + if (validate->post_ra) { + CHECK(jay_simd_width_logical(validate->func->shader, I) > 0); + CHECK(jay_simd_width_physical(validate->func->shader, I) > 0); + } + + /* Number of sources should match for our opcode. If opinfo->num_srcs + * is zero, then it may actually take a variable number of sources. + */ + CHECK(num_srcs == opinfo->num_srcs || opinfo->num_srcs == 0); + + for (unsigned s = 0; s < num_srcs; s++) { + if (jay_is_ssa(I->src[s]) && !jay_is_null(I->src[s])) { + unsigned expected = get_src_words(validate, I, s); + unsigned words = jay_num_values(I->src[s]); + if (I->op != JAY_OPCODE_SEND || s < 2) { + CHECK(expected == words); + } + + validate_flagness(validate, I->src[s], jay_src_type(I, s), "source"); + } + + CHECK(!I->src[s].negate || jay_has_src_mods(I, s)); + } + + switch (I->op) { + case JAY_OPCODE_SEL: + CHECK(jay_is_flag(I->src[2]) && "SEL src[2] (selector) must be a flag"); + break; + case JAY_OPCODE_SWAP: + CHECK(I->src[0].file == I->src[1].file && "SWAP files must match"); + break; + default: + break; + } +} + +static void +jay_validate_function(struct validate_state *validate) +{ + validate->defs = BITSET_CALLOC(validate->func->ssa_alloc); + validate->files = + calloc(validate->func->ssa_alloc, sizeof(validate->files[0])); + + jay_foreach_block(validate->func, block) { + validate->block = block; + validate->I = NULL; + + CHECK(block->successors[0] || !block->successors[1]); + + /* Post-RA we can remove physical jumps though they exist logically */ + if (block->successors[1] && !validate->post_ra) { + CHECK(jay_block_ending_jump(block) != NULL); + } + + /* If a block has multiple successors, and one of them has multiple + * predecessors, then we've detected a critical edge. 
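+ * For example, with B0 -> {B1, B2} and B1 -> {B2}, the edge B0->B2 is
+ * critical: a copy for it placed in B0 would also run on the B0->B1 path,
+ * and one placed in B2 would also run for B1->B2. Passes that insert
+ * copies on edges (e.g. phi lowering) therefore need these edges split.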
+ */ + if (jay_num_successors(block) > 1 && !validate->post_ra) { + jay_foreach_successor(block, succ) { + if (jay_num_predecessors(succ) > 1) { + chirp(validate, "Critical edge (B%u -> B%u) is not allowed", + block->index, succ->index); + } + } + } + + validate->block_state = 0; + jay_foreach_inst_in_block(block, inst) { + validate_inst(validate, inst); + } + } + + /* Validate that there are no dead phis. RA relies on this. */ + if (!validate->post_ra) { + jay_foreach_block(validate->func, block) { + jay_foreach_phi_src_in_block(block, phi) { + CHECK(BITSET_TEST(validate->defs, jay_phi_src_index(phi))); + } + } + } + + free(validate->defs); + free(validate->files); +} + +void +jay_validate(jay_shader *s, const char *when) +{ + struct validate_state validate = { .when = when, .post_ra = s->post_ra }; + + jay_foreach_function(s, f) { + validate.func = f; + jay_validate_function(&validate); + } + + if (validate.failed) { + fprintf(stderr, "jay shader that failed validation:\n"); + jay_print(stderr, s); + abort(); + } +} + +#endif diff --git a/src/intel/compiler/jay/jay_validate_ra.c b/src/intel/compiler/jay/jay_validate_ra.c new file mode 100644 index 00000000000..02bd20b57bd --- /dev/null +++ b/src/intel/compiler/jay/jay_validate_ra.c @@ -0,0 +1,217 @@ +/* + * Copyright 2026 Intel Corporation + * Copyright 2024 Alyssa Rosenzweig + * SPDX-License-Identifier: MIT + */ + +#include "util/ralloc.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* Validation doesn't make sense in release builds */ +#ifndef NDEBUG + +struct regfile { + /* For each register in each file, records the SSA index currently stored + * in that register (or zero if the contents are undefined). + */ + uint32_t *r[JAY_NUM_SSA_FILES]; + + /* Size of each register file */ + size_t n[JAY_NUM_SSA_FILES]; +}; + +static uint32_t * +reg(struct regfile *rf, enum jay_file file, uint32_t reg) +{ + /* FLAG and UFLAG share their registers. TODO: Rework?
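+ * (UFLAG is the uniform counterpart of FLAG over the same physical
+ * registers, so both files are tracked in the single FLAG array.)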
*/ + if (file == UFLAG) { + file = FLAG; + } + + assert(file < JAY_NUM_SSA_FILES); + assert(reg < rf->n[file]); + return &rf->r[file][reg]; +} + +static uint32_t * +def_reg(struct regfile *rf, jay_def x, uint32_t component) +{ + return reg(rf, x.file, x.reg + component); +} + +static void +print_regfile(struct regfile *rf, FILE *fp) +{ + fprintf(fp, "regfile: \n"); + jay_foreach_ssa_file(file) { + for (unsigned i = 0; i < rf->n[file]; ++i) { + uint32_t v = *reg(rf, file, i); + const char *prefixes = "ruf"; /* XXX: share with jay_print */ + + if (v) { + fprintf(fp, " %c%u = %u\n", prefixes[file], i, v); + } + } + } + fprintf(fp, "\n"); +} + +static bool +validate_src(struct jay_partition *partition, + jay_inst *I, + unsigned s, + struct regfile *rf, + jay_def def) +{ + jay_foreach_comp(def, c) { + uint32_t actual = *def_reg(rf, def, c); + + if (def.file == GPR) { + assert(jay_gpr_to_stride(partition, def.reg) == + jay_gpr_to_stride(partition, def.reg + c)); + } + + if (actual == 0 || actual != jay_channel(def, c)) { + fprintf(stderr, "invalid RA for source %u, channel %u.\n", s, c); + + fprintf(stderr, "expected index %u but", jay_channel(def, c)); + if (actual) + fprintf(stderr, " got index %u\n", actual); + else + fprintf(stderr, " register is undefined\n"); + + jay_print_inst(stderr, I); + print_regfile(rf, stderr); + return false; + } + } + + return true; +} + +static bool +validate_block(jay_function *func, jay_block *block, struct regfile *blocks) +{ + struct regfile *rf = &blocks[block->index]; + bool success = true; + + /* Pathological shaders can end up with loop headers that have only a + * single predecessor and act like normal blocks. Validate them as such, + * since RA treats them as such implicitly. Affects: + * + * dEQP-VK.graphicsfuzz.spv-stable-mergesort-dead-code + */ + bool loop_header = block->loop_header && jay_num_predecessors(block) > 1; + + /* Initialize the register file with the exit state of any one + * predecessor. + */ + jay_block *first_pred = jay_first_predecessor(block); + if (first_pred) { + struct regfile *pred_rf = &blocks[first_pred->index]; + + jay_foreach_ssa_file(f) { + memcpy(rf->r[f], pred_rf->r[f], rf->n[f] * sizeof(uint32_t)); + } + } + + /* TODO: Handle loop header validation better */ + if (!loop_header) { + /* Intersect with the remaining predecessors. If a register has different + * values coming in from each block, it is considered undefined at the + * start of the block.
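+ * This is a conservative meet: uses in this block then only validate
+ * against values that every predecessor agrees on.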
+ */ + jay_foreach_predecessor(block, pred) { + struct regfile *pred_rf = &blocks[(*pred)->index]; + + jay_foreach_ssa_file(file) { + for (unsigned r = 0; r < rf->n[file]; ++r) { + if (*reg(rf, file, r) != *reg(pred_rf, file, r)) { + *reg(rf, file, r) = 0; + } + } + } + } + } + + jay_foreach_inst_in_block(block, I) { + /* Validate sources */ + jay_foreach_ssa_src(I, s) { + if (jay_channel(I->src[s], 0) != JAY_SENTINEL) { + success &= + validate_src(&func->shader->partition, I, s, rf, I->src[s]); + } + } + + /* Record destinations */ + jay_foreach_dst(I, dst) { + if (jay_channel(dst, 0) != JAY_SENTINEL) { + jay_foreach_comp(dst, c) { + *def_reg(rf, dst, c) = jay_channel(dst, c); + + if (dst.file == GPR) { + struct jay_partition *p = &func->shader->partition; + assert(jay_gpr_to_stride(p, dst.reg) == + jay_gpr_to_stride(p, dst.reg + c)); + } + } + } + } + + if (I->op == JAY_OPCODE_MOV && + jay_channel(I->dst, 0) == JAY_SENTINEL && + jay_is_ssa(I->src[0]) && + jay_channel(I->src[0], 0) == JAY_SENTINEL) { + + /* Lowered live-range splits have no SSA indices associated, so handle + * them directly at the register level. + */ + assert(jay_num_values(I->dst) == jay_num_values(I->src[0])); + + jay_foreach_comp(I->dst, c) { + *def_reg(rf, I->dst, c) = *def_reg(rf, I->src[0], c); + } + } else if (I->op == JAY_OPCODE_SWAP) { + assert(jay_num_values(I->src[0]) == jay_num_values(I->src[1])); + + jay_foreach_comp(I->src[0], c) { + SWAP(*def_reg(rf, I->src[0], c), *def_reg(rf, I->src[1], c)); + } + } + } + + return success; +} + +void +jay_validate_ra(jay_function *func) +{ + bool succ = true; + linear_ctx *lin_ctx = linear_context(func->shader); + struct regfile *blocks = + linear_zalloc_array(lin_ctx, struct regfile, func->num_blocks); + + jay_foreach_block(func, block) { + struct regfile *b = &blocks[block->index]; + assert(block->index < func->num_blocks); + + jay_foreach_ssa_file(file) { + b->n[file] = jay_num_regs(func->shader, file); + b->r[file] = linear_zalloc_array(lin_ctx, uint32_t, b->n[file]); + } + } + + jay_foreach_block(func, block) { + succ &= validate_block(func, block, blocks); + } + + if (!succ) { + jay_print_func(stderr, func); + UNREACHABLE("invalid RA"); + } + + linear_free_context(lin_ctx); +} + +#endif /* NDEBUG */ diff --git a/src/intel/compiler/jay/meson.build b/src/intel/compiler/jay/meson.build new file mode 100644 index 00000000000..e9c47ada78c --- /dev/null +++ b/src/intel/compiler/jay/meson.build @@ -0,0 +1,109 @@ +# Copyright 2017 Intel Corporation +# SPDX-License-Identifier: MIT + +jay_opcodes = custom_target( + input : ['jay_opcodes_gen.py'], + output : ['jay_opcodes.c', 'jay_opcodes.h'], + command : [prog_python, '@INPUT@', '--code', '@OUTPUT0@', '--header', '@OUTPUT1@'], + depend_files : files('jay_opcodes.py'), +) + +idep_jay_opcodes_h = declare_dependency( + sources : [jay_opcodes[1]], + include_directories : include_directories('.'), +) + +jay_extra_info_h = custom_target( + input : ['jay_extra_info.h.py'], + output : 'jay_extra_info.h', + command : [prog_python, '@INPUT@', '@OUTPUT@'], + depend_files : files('jay_opcodes.py'), +) + +idep_jay_extra_info_h = declare_dependency( + sources : [jay_extra_info_h], + include_directories : include_directories('.'), +) + +jay_builder_opcodes_h = custom_target( + input : 'jay_builder_opcodes.h.py', + output : 'jay_builder_opcodes.h', + command : [prog_python, '@INPUT@', '@OUTPUT@'], + depend_files : files('jay_opcodes.py'), +) + +idep_jay_builder_opcodes_h = declare_dependency( + sources : [jay_builder_opcodes_h], + include_directories :
include_directories('.'), +) + +jay_nir_algebraic = custom_target( + 'jay_nir_algebraic.c', + input : ['jay_nir_algebraic.py'], + output : 'jay_nir_algebraic.c', + command : [prog_python, '@INPUT@', '@OUTPUT@', '-p', dir_compiler_nir], + depend_files : nir_algebraic_depends, +) + +libintel_compiler_jay_files = files( + 'jay.h', + 'jay_assign_flags.c', + 'jay_from_nir.c', + 'jay_ir.h', + 'jay_liveness.c', + 'jay_lower_post_ra.c', + 'jay_lower_pre_ra.c', + 'jay_lower_scoreboard.c', + 'jay_lower_spill.c', + 'jay_opt_dead_code.c', + 'jay_opt_control_flow.c', + 'jay_opt_propagate.c', + 'jay_print.c', + 'jay_private.h', + 'jay_repair_ssa.c', + 'jay_register_allocate.c', + 'jay_simd_width.c', + 'jay_spill.c', + 'jay_to_binary.c', + 'jay_validate.c', + 'jay_validate_ra.c', +) + +libintel_compiler_jay = static_library( + 'intel_compiler_jay', + [libintel_compiler_jay_files, jay_nir_algebraic, jay_opcodes[0]], + include_directories : [inc_include, inc_src, inc_intel], + c_args : [no_override_init_args, '-Wno-c23-extensions', '-Wno-array-bounds'], + gnu_symbol_visibility : 'hidden', + dependencies : [idep_nir_headers, idep_jay_opcodes_h, idep_jay_builder_opcodes_h, idep_jay_extra_info_h, idep_mesautil, idep_intel_dev], + build_by_default : false, +) + +idep_intel_compiler_jay = declare_dependency( + link_with : [libintel_compiler_jay], + dependencies : [ + idep_nir, + idep_vtn, + ], +) + +if with_tests + test( + 'jay_tests', + executable( + 'jay_tests', + files( + 'test/test-lower-post-ra.cpp', + 'test/test-optimizer.cpp', + 'test/test-repair-ssa.cpp', + ), + c_args : [c_msvc_compat_args, no_override_init_args], + gnu_symbol_visibility : 'hidden', + include_directories : [inc_include, inc_src, inc_intel], + dependencies: [idep_gtest, idep_nir, idep_jay_opcodes_h, idep_jay_builder_opcodes_h, idep_jay_extra_info_h, idep_mesautil, idep_intel_dev], + link_with : [libintel_compiler_jay], + ), + suite : ['intel'], + protocol : 'gtest', + ) +endif diff --git a/src/intel/compiler/jay/register-file.md b/src/intel/compiler/jay/register-file.md new file mode 100644 index 00000000000..b2053ccf348 --- /dev/null +++ b/src/intel/compiler/jay/register-file.md @@ -0,0 +1,57 @@ +# Glossary + +**lane**: A single work-item. + +**subgroup**: A collection of 8, 16, or 32 lanes executing in lockstep. +Avoid using the term _thread_ as it is ambiguous. + +**uniform**: A value that is the same in every active lane of a subgroup. +Sometimes called _convergent_. Opposite of "non-uniform". + +**non-uniform**: A value that may differ between active lanes within a +subgroup. Sometimes called _divergent_. Opposite of "uniform". + +**GPR**: General-purpose register, a single non-uniform value viewed from the +perspective of a single lane. This is a 'virtual' or 'logical' register within +the SIMT programming model. It does not represent a physical machine +register. For that, see "GRF". + +**UGPR**: Uniform general-purpose register, a single uniform value. This is +again a virtual or logical register. + +**GRF**: A physical Intel GPU register. On Xe2+, a GRF is 512 bits. On older +platforms, a GRF is 256 bits. Depending on the platform and the SIMD width, +different numbers of GRFs are required to store a single GPR, and different +numbers of UGPRs fit into a single GRF. In SIMD32 mode on Xe2, 1 GPR requires +2 GRFs, and 16 UGPRs fit into 1 GRF. + +**scalar**: A single value from the perspective of a single lane; a single GPR +or UGPR. Note that a scalar may be either uniform or non-uniform.
Opposite of +"vector". + +**vector**: A collection of multiple values from the perspective of a single +lane. All scalars within the vector must identically be GPRs or UGPRs. + +# Introduction + +Jay separates the logical register files (GPR and UGPR) from the +unified physical register file. We assign registers independently for each +logical file, and then post-RA we remap to physical GRFs. This simplifies RA. + +We decide a static GPR/UGPR split up front. Ideally, we'd just use the +first N registers for GPRs and the rest for UGPRs, or something like +that. Unfortunately, several hardware issues complicate this scheme... + +# End-of-thread SENDs + +End-of-thread SENDs require that their source be in r112-r127. As their source +will always be per-thread, we want to make sure these are GPRs. + +# Payloads + +At the start of each thread, the register file is preloaded with a payload. +Parts of the payload act like UGPRs, parts act like GPRs, and parts act like... +something weird and in between. To minimize copying, we want to assign UGPRs to +the UGPR parts of the payload and GPRs to the GPR parts. As for the weird cases, +we model them as UGPR vectors and use special opcodes (lowered late to +regioning) to unpack to GPRs for normal handling. diff --git a/src/intel/compiler/jay/test/jay_test.h b/src/intel/compiler/jay/test/jay_test.h new file mode 100644 index 00000000000..43cc48b87ef --- /dev/null +++ b/src/intel/compiler/jay/test/jay_test.h @@ -0,0 +1,141 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_private.h" +#include "shader_enums.h" + +static inline jay_block * +jay_test_block(jay_function *f) +{ + jay_block *blk = jay_new_block(f); + list_addtail(&blk->link, &f->blocks); + return blk; +} + +/* Helper to generate a jay_builder suitable for creating test instructions */ +static inline jay_builder * +jay_test_builder(void *memctx) +{ + jay_shader *s = jay_new_shader(memctx, MESA_SHADER_COMPUTE); + jay_function *f = jay_new_function(s); + s->partition.base8 = 8; + + struct intel_device_info *devinfo = + rzalloc(memctx, struct intel_device_info); + s->devinfo = devinfo; + s->dispatch_width = 32; + + unsigned verx10 = 200; + devinfo->verx10 = verx10; + devinfo->ver = verx10 / 10; + assert(devinfo->ver > 0); + + /* We'll use low indices for test values */ + f->ssa_alloc = 10; + + jay_builder *b = rzalloc(memctx, jay_builder); + *b = jay_init_builder(f, jay_after_block(jay_test_block(f))); + return b; +} + +/* Helper to compare instructions for logical equality. We skip the list + * pointers, then compare the raw data. + */ +static inline bool +jay_inst_equal(jay_inst *A, jay_inst *B) +{ + /* Check the plain old data portion of jay_inst. */ + unsigned header = sizeof(struct list_head); + if (memcmp((uint8_t *) A + header, (uint8_t *) B + header, + sizeof(jay_inst) - header)) + return false; + + /* All of the sizes are plain data. They match, so do a deep compare.
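+ * The source array and the opcode-specific info are laid out contiguously
+ * starting at src, so a single memcmp covers both.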
*/ + size_t size = (A->num_srcs * sizeof(jay_def)) + jay_inst_info_size(A); + return !memcmp(A->src, B->src, size); +} + +static inline bool +jay_block_equal(jay_block *A, jay_block *B) +{ + if (list_length(&A->instructions) != list_length(&B->instructions)) + return false; + + list_pair_for_each_entry(jay_inst, I, J, &A->instructions, &B->instructions, + link) { + if (!jay_inst_equal(I, J)) { + return false; + } + } + + return true; +} + +static inline bool +jay_function_equal(jay_function *A, jay_function *B) +{ + if (list_length(&A->blocks) != list_length(&B->blocks)) + return false; + + list_pair_for_each_entry(jay_block, blockA, blockB, &A->blocks, &B->blocks, + link) { + if (!jay_block_equal(blockA, blockB)) + return false; + } + + return true; +} + +static inline bool +jay_shader_equal(jay_shader *A, jay_shader *B) +{ + if (list_length(&A->functions) != list_length(&B->functions)) + return false; + + list_pair_for_each_entry(jay_function, functionA, functionB, &A->functions, + &B->functions, link) { + if (!jay_function_equal(functionA, functionB)) + return false; + } + + return true; +} + +#define ASSERT_SHADER_EQUAL(A, B) \ + if (!jay_shader_equal(A, B)) { \ + ADD_FAILURE(); \ + fprintf(stderr, "Pass produced unexpected results\n"); \ + fprintf(stderr, " Actual:\n"); \ + jay_print(stderr, A); \ + fprintf(stderr, " Expected:\n"); \ + jay_print(stderr, B); \ + fprintf(stderr, "\n"); \ + } + +#define INSTRUCTION_CASE_GEN(instr, expected, pass, validate) \ + do { \ + jay_builder *A = jay_test_builder(mem_ctx); \ + jay_builder *B = jay_test_builder(mem_ctx); \ + { \ + jay_builder *b = A; \ + instr; \ + } \ + if (validate) \ + jay_validate(A->shader, "test setup"); \ + { \ + jay_builder *b = B; \ + expected; \ + } \ + JAY_PASS(A->shader, pass); \ + ASSERT_SHADER_EQUAL(A->shader, B->shader); \ + } while (0) + +#define INSTRUCTION_CASE(instr, expected, pass) \ + INSTRUCTION_CASE_GEN(instr, expected, pass, true) diff --git a/src/intel/compiler/jay/test/test-lower-post-ra.cpp b/src/intel/compiler/jay/test/test-lower-post-ra.cpp new file mode 100644 index 00000000000..209d944f347 --- /dev/null +++ b/src/intel/compiler/jay/test/test-lower-post-ra.cpp @@ -0,0 +1,82 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_test.h" + +#include <gtest/gtest.h> + +#define CASE(instr, expected) \ + INSTRUCTION_CASE( \ + { \ + A->shader->post_ra = true; \ + instr; \ + }, \ + { \ + B->shader->post_ra = true; \ + expected; \ + }, \ + jay_lower_post_ra) + +#define PRE jay_add_predicate_else +#define POST jay_add_predicate +#define CFLAG jay_set_cond_flag + +#define NEGCASE(x) CASE(x, x) + +class LowerPostRA : public testing::Test { + protected: + LowerPostRA() + { + mem_ctx = ralloc_context(NULL); + + x = jay_bare_reg(GPR, 1); + y = jay_bare_reg(GPR, 2); + z = jay_bare_reg(GPR, 3); + u4 = jay_bare_reg(UGPR, 4); + f0 = jay_bare_reg(FLAG, 0); + f1 = jay_bare_reg(FLAG, 1); + f2 = jay_bare_reg(FLAG, 2); + } + + ~LowerPostRA() + { + ralloc_free(mem_ctx); + } + + jay_inst *I; + void *mem_ctx; + jay_def x, y, z, u4, f0, f1, f2, nul = jay_null(); +}; + +TEST_F(LowerPostRA, Tied) +{ + CASE(PRE(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), f0, z), + POST(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), f0)); + + CASE(PRE(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), jay_negate(f0), z), + POST(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), jay_negate(f0))); +} + +TEST_F(LowerPostRA, InsertMove) +{ + CASE(PRE(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), f0, x), { + POST(b, jay_MOV(b, z,
x), jay_negate(f0)); + POST(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), f0); + }); +} + +TEST_F(LowerPostRA, RewriteToSel) +{ + CASE(PRE(b, jay_MOV(b, z, y), f0, x), + jay_SEL(b, JAY_TYPE_U32, z, x, y, jay_negate(f0))); +} + +TEST_F(LowerPostRA, CopyUGPR) +{ + NEGCASE(jay_MOV(b, x, u4)); + NEGCASE(jay_MOV(b, u4, x)); +} diff --git a/src/intel/compiler/jay/test/test-optimizer.cpp b/src/intel/compiler/jay/test/test-optimizer.cpp new file mode 100644 index 00000000000..739a2d15610 --- /dev/null +++ b/src/intel/compiler/jay/test/test-optimizer.cpp @@ -0,0 +1,312 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/lut.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_private.h" +#include "jay_test.h" + +#include <gtest/gtest.h> + +static void +jay_optimize_and_dce(jay_shader *shader) +{ + JAY_PASS(shader, jay_opt_propagate_forwards); + JAY_PASS(shader, jay_opt_propagate_backwards); + JAY_PASS(shader, jay_opt_dead_code); +} + +#define CASE(instr, expected) \ + INSTRUCTION_CASE( \ + { \ + instr; \ + jay_UNIT_TEST_u32(b, out); \ + }, \ + { \ + expected; \ + jay_UNIT_TEST_u32(b, out); \ + }, \ + jay_optimize_and_dce) + +#define NEGCASE(instr) CASE(instr, instr) +#define UNIT jay_UNIT_TEST_u32 + +#define NEG(x) jay_negate(x) + +#define MOV(T, src0) \ + ({ \ + jay_def dst = jay_alloc_def(b, GPR, 1); \ + jay_MODIFIER(b, T, dst, src0); \ + dst; \ + }) + +class Optimizer : public testing::Test { + protected: + Optimizer() + { + mem_ctx = ralloc_context(NULL); + + out = jay_scalar(GPR, 8); + wx = jay_scalar(TEST_FILE, 1); + wy = jay_scalar(TEST_FILE, 1); + wz = jay_scalar(TEST_FILE, 1); + } + + ~Optimizer() + { + ralloc_free(mem_ctx); + } + + void *mem_ctx; + + jay_def out, wx, wy, wz; +}; + +static enum jay_type float_types[] = { + JAY_TYPE_F16, + JAY_TYPE_F32, +}; + +TEST_F(Optimizer, Copyprop) +{ + CASE(jay_ADD(b, JAY_TYPE_U32, out, wx, jay_MOV_u32(b, wy)), + jay_ADD(b, JAY_TYPE_U32, out, wx, wy)); +} + +TEST_F(Optimizer, FusedNeg) +{ + for (unsigned i = 0; i < ARRAY_SIZE(float_types); ++i) { + enum jay_type T = float_types[i]; + + CASE(jay_ADD(b, T, out, wx, MOV(T, NEG(wy))), + jay_ADD(b, T, out, wx, NEG(wy))); + + CASE(jay_MUL(b, T, out, MOV(T, NEG(wy)), NEG(wx)), + jay_MUL(b, T, out, NEG(wy), NEG(wx))); + + CASE(jay_MAD(b, T, out, MOV(T, NEG(wy)), wz, NEG(MOV(T, NEG(wx)))), + jay_MAD(b, T, out, NEG(wy), wz, wx)); + } +} + +TEST_F(Optimizer, SELToFloat) +{ + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_LT, flag, 3, x); + jay_SEL(b, JAY_TYPE_U32, out, wx, MOV(JAY_TYPE_F32, NEG(wy)), flag); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_LT, flag, 3, x); + jay_SEL(b, JAY_TYPE_F32, out, wx, NEG(wy), flag); + }); +} + +TEST_F(Optimizer, FusedNot) +{ + CASE(jay_BFN(b, out, wx, jay_NOT_u32(b, wy), 0, UTIL_LUT3(a & b)), + jay_BFN(b, out, wx, wy, 0, UTIL_LUT3(a & ~b))); + + CASE(jay_AND(b, JAY_TYPE_U32, out, wx, jay_NOT_u32(b, wy)), + jay_AND(b, JAY_TYPE_U32, out, wx, jay_negate(wy))); + + CASE(jay_XOR(b, JAY_TYPE_U32, out, jay_NOT_u32(b, wx), wy), + jay_XOR(b, JAY_TYPE_U32, out, jay_negate(wx), wy)); + + CASE(jay_OR(b, JAY_TYPE_U32, out, jay_NOT_u32(b, wx), jay_NOT_u32(b, wy)), + jay_OR(b,
JAY_TYPE_U32, out, jay_negate(wx), jay_negate(wy))); +} + +TEST_F(Optimizer, NegativeFusedFneg) +{ + for (unsigned i = 0; i < ARRAY_SIZE(float_types); ++i) { + enum jay_type T = float_types[i]; + NEGCASE(jay_ADD(b, JAY_TYPE_U32, out, wx, MOV(T, NEG(wy)))); + NEGCASE(jay_ADD(b, JAY_TYPE_S32, out, wx, MOV(T, NEG(wy)))); + } +} + +/* TODO: test fneg with f64 */ + +TEST_F(Optimizer, FusedSat) +{ + for (unsigned i = 0; i < ARRAY_SIZE(float_types); ++i) { + enum jay_type T = float_types[i]; + + CASE( + { + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, T, x, wx, MOV(T, NEG(wy))); + jay_MODIFIER(b, T, out, x)->saturate = true; + }, + { jay_ADD(b, T, out, wx, NEG(wy))->saturate = true; }); + + CASE( + { + jay_def x = jay_alloc_def(b, GPR, 1); + jay_MUL(b, T, x, wx, MOV(T, NEG(wy))); + jay_MODIFIER(b, T, out, x)->saturate = true; + }, + { jay_MUL(b, T, out, wx, NEG(wy))->saturate = true; }); + + CASE( + { + jay_def x = jay_alloc_def(b, GPR, 1); + jay_MAX(b, T, x, wx, MOV(T, NEG(wy)))->saturate = true; + jay_MODIFIER(b, T, out, x)->saturate = true; + }, + { jay_MAX(b, T, out, wx, NEG(wy))->saturate = true; }); + } +} + +TEST_F(Optimizer, InverseBallotPropagate) +{ + CASE( + { + jay_def x = jay_alloc_def(b, UGPR, 1); + jay_def f = jay_alloc_def(b, FLAG, 1); + jay_ADD(b, JAY_TYPE_U32, x, wx, wy); + jay_MOV(b, f, x); + jay_SEL(b, JAY_TYPE_U32, out, wx, wy, f); + }, + { + UNUSED jay_def x = jay_alloc_def(b, UGPR, 1); + jay_def f = jay_alloc_def(b, FLAG, 1); + jay_ADD(b, JAY_TYPE_U32, f, wx, wy); + jay_SEL(b, JAY_TYPE_U32, out, wx, wy, f); + }); +} + +TEST_F(Optimizer, GtZero) +{ + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_LT, flag, 0, x); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_inst *add = jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_set_conditional_mod(b, add, flag, JAY_CONDITIONAL_GT); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); +} + +TEST_F(Optimizer, MultipleCmp) +{ + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def flag2 = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_LT, flag, 0, x); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_GT, flag2, 0, x); + jay_SEL(b, JAY_TYPE_U32, out, x, jay_SEL_u32(b, x, 123, flag), flag2); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def flag2 = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_inst *add = jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_set_conditional_mod(b, add, flag, JAY_CONDITIONAL_GT); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_GT, flag2, 0, x); + jay_SEL(b, JAY_TYPE_U32, out, x, jay_SEL_u32(b, x, 123, flag), flag2); + }); +} + +TEST_F(Optimizer, TypeNeutralConditionalMods) +{ + enum jay_conditional_mod mods[] = { + JAY_CONDITIONAL_NE, + JAY_CONDITIONAL_EQ, + }; + + for (unsigned i = 0; i < 2; ++i) { + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_BFN(b, x, wx, wy, wz, UTIL_LUT3(a & b & c)); + jay_CMP(b, JAY_TYPE_S32, mods[i], flag, x, 0); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_inst *bfn3 = jay_BFN(b, x, wx, wy, wz, UTIL_LUT3(a & b & c)); + jay_set_conditional_mod(b, bfn3, flag, 
mods[i]); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); + + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_AND(b, JAY_TYPE_U32, x, wx, wy); + jay_CMP(b, JAY_TYPE_S32, mods[i], flag, x, 0); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_inst *an = jay_AND(b, JAY_TYPE_U32, x, wx, wy); + jay_set_conditional_mod(b, an, flag, mods[i]); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); + } +} + +TEST_F(Optimizer, SignednessMismatchConditionalMods) +{ + enum jay_conditional_mod mods[] = { + JAY_CONDITIONAL_LE, + JAY_CONDITIONAL_GT, + }; + + for (unsigned i = 0; i < 2; ++i) { + NEGCASE({ + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_BFN(b, x, wx, wy, wz, UTIL_LUT3(a & b & c)); + jay_CMP(b, JAY_TYPE_S32, mods[i], flag, x, 0); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); + } +} + +TEST_F(Optimizer, FloatMismatchConditionalMods) +{ + enum jay_conditional_mod mods[] = { + JAY_CONDITIONAL_NAN, + JAY_CONDITIONAL_EQ, + JAY_CONDITIONAL_NE, + JAY_CONDITIONAL_LT, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(mods); ++i) { + NEGCASE({ + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_BFN(b, x, wx, wy, wz, UTIL_LUT3(a & b & c)); + jay_CMP(b, JAY_TYPE_F32, mods[i], flag, x, 0); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); + } +} diff --git a/src/intel/compiler/jay/test/test-repair-ssa.cpp b/src/intel/compiler/jay/test/test-repair-ssa.cpp new file mode 100644 index 00000000000..8d117746eee --- /dev/null +++ b/src/intel/compiler/jay/test/test-repair-ssa.cpp @@ -0,0 +1,213 @@ +/* + * Copyright 2026 Intel Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022 Collabora, Ltd. + * SPDX-License-Identifier: MIT + */ + +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_test.h" + +#include <gtest/gtest.h> + +JAY_DEFINE_FUNCTION_PASS(pass, jay_repair_ssa) + +#define CASE(instr) \ + INSTRUCTION_CASE_GEN( \ + { \ + UNUSED bool repaired = false; \ + b->func->ssa_alloc = 1; \ + instr \ + }, \ + { \ + UNUSED bool repaired = true; \ + b->func->ssa_alloc = 1; \ + instr \ + }, \ + pass, false) + +class RepairSSA : public testing::Test { + protected: + RepairSSA() + { + mem_ctx = ralloc_context(NULL); + } + + ~RepairSSA() + { + ralloc_free(mem_ctx); + } + + void *mem_ctx; +}; + +static jay_def +jay_phi_2(jay_builder *b, jay_block *p1, jay_def v1, jay_block *p2, jay_def v2) +{ + assert(v2.file == v1.file || jay_is_null(v2)); + jay_def idx = jay_alloc_def(b, v1.file, 1); + jay_PHI_DST(b, idx); + jay_cursor saved = b->cursor; + + b->cursor = jay_after_block(p1); + jay_PHI_SRC_u32(b, v1, jay_index(idx)); + + b->cursor = jay_after_block(p2); + jay_PHI_SRC_u32(b, jay_is_null(v2) ?
idx : v2, jay_index(idx)); + + b->cursor = saved; + return idx; +} + +TEST_F(RepairSSA, Local) +{ + CASE({ + jay_def x = jay_MOV_u32(b, 0xcafe); + jay_def y = jay_MOV_u32(b, 0xefac); + + if (repaired) { + jay_UNIT_TEST(b, jay_ADD_f32(b, y, x)); + } else { + jay_ADD(b, JAY_TYPE_F32, x, y, x); + jay_UNIT_TEST(b, x); + } + }); +} + +/* A + * / \ + * B C + * \ / + * D + */ +TEST_F(RepairSSA, IfElse) +{ + CASE({ + jay_block *A = jay_first_block(b->func); + jay_block *B = jay_test_block(b->func); + jay_block *C = jay_test_block(b->func); + jay_block *D = jay_test_block(b->func); + + jay_block_add_successor(A, B); + jay_block_add_successor(A, C); + + jay_block_add_successor(B, D); + jay_block_add_successor(C, D); + + b->cursor = jay_after_block(A); + jay_IF(b); + + b->cursor = jay_after_block(B); + jay_def x = jay_MOV_u32(b, 0xcafe); + jay_def y = jay_MOV_u32(b, 0xbade); + + b->cursor = jay_after_block(C); + jay_ELSE(b); + jay_def x2 = repaired ? jay_alloc_def(b, UGPR, 1) : x; + jay_MOV(b, x2, 0xefac); + jay_def y2 = jay_MOV_u32(b, 0xbaee); + jay_ENDIF(b); + + b->cursor = jay_after_block(D); + jay_def y3 = jay_phi_2(b, B, y, C, y2); + if (repaired) + x = jay_phi_2(b, B, x, C, x2); + + jay_UNIT_TEST(b, jay_ADD_f32(b, x, y3)); + }); +} + +/* + * H + * | + * A---| + * / \ | + * B C | + * | / | + * | D---- + * | + * |-E + */ +TEST_F(RepairSSA, Loop) +{ + CASE({ + jay_block *H = jay_first_block(b->func); + jay_block *A = jay_test_block(b->func); + jay_block *B = jay_test_block(b->func); + jay_block *C = jay_test_block(b->func); + jay_block *D = jay_test_block(b->func); + jay_block *E = jay_test_block(b->func); + + jay_block_add_successor(H, A); + jay_block_add_successor(A, B); + jay_block_add_successor(A, C); + jay_block_add_successor(B, E); + jay_block_add_successor(C, D); + jay_block_add_successor(D, A); + + A->loop_header = true; + + b->cursor = jay_after_block(H); + jay_def x = jay_MOV_u32(b, 0xcafe); + + b->cursor = jay_after_block(A); + jay_def x_in = repaired ? jay_alloc_def(b, UGPR, 1) : x; + jay_def x_out = repaired ? 
jay_alloc_def(b, UGPR, 1) : x; + if (repaired) { + jay_PHI_DST(b, x_in); + } + jay_IF(b); + + b->cursor = jay_after_block(H); + if (repaired) { + jay_PHI_SRC_u32(b, x, jay_index(x_in)); + } + + b->cursor = jay_after_block(B); + jay_BREAK(b); + + b->cursor = jay_after_block(D); + jay_ADD(b, JAY_TYPE_U32, x_out, x_in, 1); + if (repaired) { + jay_PHI_SRC_u32(b, x_out, jay_index(x_in)); + } + jay_WHILE(b); + + b->cursor = jay_after_block(E); + jay_UNIT_TEST(b, x_in); + }); +} + +/* Same setup as IfElse */ +TEST_F(RepairSSA, TrivialPhisOptimized) +{ + CASE({ + jay_block *A = jay_first_block(b->func); + jay_block *B = jay_test_block(b->func); + jay_block *C = jay_test_block(b->func); + jay_block *D = jay_test_block(b->func); + + jay_block_add_successor(A, B); + jay_block_add_successor(A, C); + + jay_block_add_successor(B, D); + jay_block_add_successor(C, D); + + b->cursor = jay_after_block(A); + jay_def x = jay_MOV_u32(b, 0xcafe); + jay_IF(b); + + b->cursor = jay_after_block(C); + jay_ELSE(b); + jay_ENDIF(b); + + b->cursor = jay_after_block(D); + if (repaired) { + b->func->ssa_alloc++; + } + + jay_UNIT_TEST(b, jay_ADD_f32(b, x, x)); + }); +} diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index cdfdd00d5f8..0a2c0c1f66a 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -35,6 +35,7 @@ brw_device_sha1_gen_src = custom_target('brw_device_sha1_gen.c', command : [prog_python, '@INPUT0@', '--out', '@OUTPUT@']) subdir('brw') +subdir('jay') if with_intel_elk subdir('elk')