From e42e3193137d1b21e84b499336e7a8887b8a8689 Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Thu, 27 Nov 2025 17:57:25 -0500 Subject: [PATCH] intel: add Jay Jay is a new SSA-based compiler for Intel GPUs. This is an early work-in-progress. It isn't ready to ship, but we'd like to move development in-tree rather than rebasing the world every week. Please don't bother testing yet - we know the status and we're working on it! Jay's design is similar to other modern NIR backends, particularly ACO, NAK and AGX. It is fully SSA, deconstructing phis after RA. We use a Colombet register allocator similar to NAK, allowing us to handle Intel's complex register regioning restrictions in a straightforward way. Spilling logical registers is simple with Braun-Hack. Thanks to the SSA-based design, the entire backend is essentially linear time, regardless of register pressure, addressing brw's excessive compile time, especially when spilling. In this early draft, we support a limited subset of all three APIs on Xe2. A lot works and a lot doesn't. The core compiler is there (spilling, scoreboarding, SIMD32, etc. should more or less work), but there are details to fill in for both performance and correctness. We essentially pass conformance on OpenGL ES 3.0 and OpenCL 3.0, and we're busy iterating on Vulkan. Likewise, additional hardware support will come down the line. There's nothing fundamentally Xe2-specific here. I just have a Lunarlake laptop on my desk, Ken has a Battlemage card, and we had to pick _something_ as the first target. Co-authored-by: Kenneth Graunke Signed-off-by: Alyssa Rosenzweig Part-of: --- src/.clang-format | 46 + src/intel/compiler/jay/.clang-format | 31 + src/intel/compiler/jay/README.md | 3 + src/intel/compiler/jay/jay.h | 25 + src/intel/compiler/jay/jay_assign_flags.c | 365 ++ src/intel/compiler/jay/jay_builder.h | 643 +++ .../compiler/jay/jay_builder_opcodes.h.py | 153 + src/intel/compiler/jay/jay_extra_info.h.py | 153 + src/intel/compiler/jay/jay_from_nir.c | 3838 +++++++++++++++++ src/intel/compiler/jay/jay_ir.h | 1408 ++++++ src/intel/compiler/jay/jay_liveness.c | 203 + src/intel/compiler/jay/jay_lower_post_ra.c | 153 + src/intel/compiler/jay/jay_lower_pre_ra.c | 200 + src/intel/compiler/jay/jay_lower_scoreboard.c | 376 ++ src/intel/compiler/jay/jay_lower_spill.c | 156 + src/intel/compiler/jay/jay_nir_algebraic.py | 95 + src/intel/compiler/jay/jay_opcodes.py | 233 + src/intel/compiler/jay/jay_opcodes_gen.py | 99 + src/intel/compiler/jay/jay_opt_control_flow.c | 137 + src/intel/compiler/jay/jay_opt_dead_code.c | 58 + src/intel/compiler/jay/jay_opt_propagate.c | 282 ++ src/intel/compiler/jay/jay_print.c | 309 ++ src/intel/compiler/jay/jay_private.h | 72 + .../compiler/jay/jay_register_allocate.c | 1659 +++++++ src/intel/compiler/jay/jay_repair_ssa.c | 247 ++ src/intel/compiler/jay/jay_simd_width.c | 63 + src/intel/compiler/jay/jay_spill.c | 849 ++++ src/intel/compiler/jay/jay_to_binary.c | 576 +++ src/intel/compiler/jay/jay_validate.c | 328 ++ src/intel/compiler/jay/jay_validate_ra.c | 217 + src/intel/compiler/jay/meson.build | 109 + src/intel/compiler/jay/register-file.md | 57 + src/intel/compiler/jay/test/jay_test.h | 141 + .../compiler/jay/test/test-lower-post-ra.cpp | 82 + .../compiler/jay/test/test-optimizer.cpp | 312 ++ .../compiler/jay/test/test-repair-ssa.cpp | 213 + src/intel/compiler/meson.build | 1 + 37 files changed, 13892 insertions(+) create mode 100644 src/intel/compiler/jay/.clang-format create mode 100644 
src/intel/compiler/jay/README.md create mode 100644 src/intel/compiler/jay/jay.h create mode 100644 src/intel/compiler/jay/jay_assign_flags.c create mode 100644 src/intel/compiler/jay/jay_builder.h create mode 100644 src/intel/compiler/jay/jay_builder_opcodes.h.py create mode 100644 src/intel/compiler/jay/jay_extra_info.h.py create mode 100644 src/intel/compiler/jay/jay_from_nir.c create mode 100644 src/intel/compiler/jay/jay_ir.h create mode 100644 src/intel/compiler/jay/jay_liveness.c create mode 100644 src/intel/compiler/jay/jay_lower_post_ra.c create mode 100644 src/intel/compiler/jay/jay_lower_pre_ra.c create mode 100644 src/intel/compiler/jay/jay_lower_scoreboard.c create mode 100644 src/intel/compiler/jay/jay_lower_spill.c create mode 100644 src/intel/compiler/jay/jay_nir_algebraic.py create mode 100644 src/intel/compiler/jay/jay_opcodes.py create mode 100644 src/intel/compiler/jay/jay_opcodes_gen.py create mode 100644 src/intel/compiler/jay/jay_opt_control_flow.c create mode 100644 src/intel/compiler/jay/jay_opt_dead_code.c create mode 100644 src/intel/compiler/jay/jay_opt_propagate.c create mode 100644 src/intel/compiler/jay/jay_print.c create mode 100644 src/intel/compiler/jay/jay_private.h create mode 100644 src/intel/compiler/jay/jay_register_allocate.c create mode 100644 src/intel/compiler/jay/jay_repair_ssa.c create mode 100644 src/intel/compiler/jay/jay_simd_width.c create mode 100644 src/intel/compiler/jay/jay_spill.c create mode 100644 src/intel/compiler/jay/jay_to_binary.c create mode 100644 src/intel/compiler/jay/jay_validate.c create mode 100644 src/intel/compiler/jay/jay_validate_ra.c create mode 100644 src/intel/compiler/jay/meson.build create mode 100644 src/intel/compiler/jay/register-file.md create mode 100644 src/intel/compiler/jay/test/jay_test.h create mode 100644 src/intel/compiler/jay/test/test-lower-post-ra.cpp create mode 100644 src/intel/compiler/jay/test/test-optimizer.cpp create mode 100644 src/intel/compiler/jay/test/test-repair-ssa.cpp diff --git a/src/.clang-format b/src/.clang-format index 7e22bed1676..d2df8c5b55d 100644 --- a/src/.clang-format +++ b/src/.clang-format @@ -300,6 +300,52 @@ ForEachMacros: - foreach_bo - foreach_bo_safe +# intel + - jay_foreach_ssa_file + - jay_foreach_function + - jay_foreach_block + - jay_foreach_block_safe + - jay_foreach_block_rev + - jay_foreach_block_from + - jay_foreach_block_from_rev + - jay_foreach_dst + - jay_foreach_dst_index + - jay_foreach_inst_in_block + - jay_foreach_inst_in_block_rev + - jay_foreach_inst_in_block_safe + - jay_foreach_inst_in_block_safe_rev + - jay_foreach_inst_in_block_from + - jay_foreach_inst_in_block_from_rev + - jay_foreach_inst_in_shader + - jay_foreach_inst_in_shader_rev + - jay_foreach_inst_in_shader_safe + - jay_foreach_inst_in_shader_safe_rev + - jay_foreach_inst_in_func + - jay_foreach_inst_in_func_rev + - jay_foreach_inst_in_func_safe + - jay_foreach_inst_in_func_safe_rev + - jay_foreach_successor + - jay_foreach_predecessor + - jay_foreach_comp + - jay_foreach_comp_rev + - jay_foreach_src + - jay_foreach_src_rev + - jay_foreach_ssa_src + - jay_foreach_ssa_src_rev + - jay_foreach_ssa_src_comp + - jay_foreach_index + - jay_foreach_index_rev + - jay_foreach_src_index + - jay_foreach_src_index_rev + - jay_repair_foreach_phi + - jay_foreach_phi_src_in_block + - jay_foreach_phi_dst_in_block + - jay_foreach_preload + - jay_foreach_killed + - jay_foreach_ra_src + - jay_foreach_ra_file + - jay_foreach_pipe + # Disable clang formatting by default. 
Drivers that use clang-format # inherit from this .clang-format file and re-enable formatting: # diff --git a/src/intel/compiler/jay/.clang-format b/src/intel/compiler/jay/.clang-format new file mode 100644 index 00000000000..04cf17f20bb --- /dev/null +++ b/src/intel/compiler/jay/.clang-format @@ -0,0 +1,31 @@ +BasedOnStyle: InheritParentConfig +DisableFormat: false + +AlignConsecutiveBitFields: Consecutive +BitFieldColonSpacing: None + +AlignAfterOpenBracket: Align +AlignConsecutiveMacros: + Enabled: true + AcrossComments: true +AlignArrayOfStructures: Left + +ColumnLimit: 80 + +BreakStringLiterals: false +SpaceBeforeParens: ControlStatementsExceptControlMacros +SpaceAfterCStyleCast: true +BinPackParameters: OnePerLine +AllowAllArgumentsOnNextLine: false +PenaltyBreakBeforeFirstCallParameter: 100 +ReferenceAlignment: Middle + +BreakBeforeBinaryOperators: None +PenaltyBreakAssignment: 0 + +SpacesInContainerLiterals: true +Cpp11BracedListStyle: false + +AlignOperands: Align +BreakBinaryOperations: RespectPrecedence +BreakBeforeTernaryOperators: false diff --git a/src/intel/compiler/jay/README.md b/src/intel/compiler/jay/README.md new file mode 100644 index 00000000000..8ac3ed0897b --- /dev/null +++ b/src/intel/compiler/jay/README.md @@ -0,0 +1,3 @@ +Xe2 compiler experiments. + +**Work-in-progress, not ready for users/benchmarks.** diff --git a/src/intel/compiler/jay/jay.h b/src/intel/compiler/jay/jay.h new file mode 100644 index 00000000000..914c0d8ea71 --- /dev/null +++ b/src/intel/compiler/jay/jay.h @@ -0,0 +1,25 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "compiler/brw/brw_compiler.h" +#include "util/shader_stats.h" +#include "nir.h" + +struct intel_device_info; +struct nir_shader_compiler_options; + +struct jay_shader_bin { + const uint32_t *kernel; + uint32_t size; + struct genisa_stats stats; +}; + +struct jay_shader_bin *jay_compile(const struct intel_device_info *devinfo, + void *mem_ctx, + nir_shader *nir, + union brw_any_prog_data *prog_data, + union brw_any_prog_key *key); diff --git a/src/intel/compiler/jay/jay_assign_flags.c b/src/intel/compiler/jay/jay_assign_flags.c new file mode 100644 index 00000000000..5442eb154a1 --- /dev/null +++ b/src/intel/compiler/jay/jay_assign_flags.c @@ -0,0 +1,365 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* + * Instruction selection works on SSA FLAG and UFLAG variables. This pass + * implements a flag register allocator, assigning each FLAG/UFLAG to a + * hardware flag register and/or spilling it to a GPR/UGPR. + * + * As a simplification, hardware flags are block-local. At block boundaries, + * 32-bit 0/~0 (U)GPRs are our canonical representation for (U)FLAGs. + * + * Producers: CMP produces both 0/~0 GPRs and flags, while conditional modifiers + * produce only flags. Boolean arithmetic is lowered to GPRs. + * + * Consumers: SEL/CSEL consumes both GPRs and flags, while predication consumes + * only flags. Boolean arithmetic again requires GPRs. + * + * Our strategy is to turn flags into GPR representations globally while keeping + * copies in flags where it makes sense locally. + */ + +static inline jay_def +canonicalize_flag(jay_def x) +{ + assert(jay_is_flag(x)); + x.file = x.file == UFLAG ? 
UGPR : GPR; + return x; +} + +struct var_info { + unsigned flag :3; + bool uniform :1; + bool read_by_predication:1; + bool free_canonical :1; + unsigned pad :2; +} PACKED; +static_assert(sizeof(struct var_info) == 1); + +struct flag_ra { + jay_builder *b; + struct var_info *vars; + uint32_t flag_to_global[JAY_MAX_FLAGS]; + uint32_t flag_to_local[JAY_MAX_FLAGS]; + unsigned roundrobin; + unsigned ballots:JAY_MAX_FLAGS; +}; + +static jay_def +assign_flag(struct flag_ra *ra, + jay_def flag, + enum jay_file file, + bool free_canonical, + bool ballot) +{ + jay_def canonical = canonicalize_flag(flag); + jay_def tmp = jay_alloc_def(ra->b, file, 1); + + /* Dedicate a flag for ballot since uniform access would clobber the zeroing. + * TODO: We could optimize this with more tracking. + */ + unsigned num_flags = jay_num_regs(ra->b->shader, FLAG); + tmp.reg = ballot ? 0 : (1 + (ra->roundrobin++) % (num_flags - 2)); + + ra->vars[jay_index(canonical)] = (struct var_info) { + .uniform = tmp.file == UFLAG, + .flag = tmp.reg, + .free_canonical = free_canonical, + }; + + ra->flag_to_global[tmp.reg] = jay_index(canonical); + ra->flag_to_local[tmp.reg] = jay_index(tmp); + + if (ballot) { + ra->ballots |= BITFIELD_BIT(tmp.reg); + } + + return tmp; +} + +static bool +rewrite_sel_with_zero(jay_inst *I, unsigned zero) +{ + jay_def flag = I->src[2]; + unsigned other = 1 - zero; + + if (!jay_defs_equivalent(I->src[zero], jay_imm(0)) || + I->src[other].abs || + I->src[other].negate || + jay_type_size_bits(I->type) != 32) { + return false; + } + + if (jay_defs_equivalent(I->src[other], jay_imm(0xffffffff)) && zero == 1) { + /* (c ? 0xffffffff : 0) -> canonical(c) */ + I->op = JAY_OPCODE_MOV; + I->src[0] = canonicalize_flag(flag); + jay_shrink_sources(I, 1); + } else { + /* ([!]c ? a : 0) --> (a & [~]canonical(c)) and + * ([!]c ? 0 : a) --> (a & ~[~]canonical(c)) + */ + I->op = JAY_OPCODE_AND; + I->src[0] = I->src[other]; + I->src[1] = canonicalize_flag(flag); + I->src[1].negate ^= (zero == 0); + jay_shrink_sources(I, 2); + } + + return true; +} + +static bool +rewrite_sel_to_csel(jay_inst *I) +{ + if (jay_type_size_bits(I->type) != 32) { + return false; + } + + /* SEL.f32 lowers to CSEL.f32 to preserve source modifiers & float controls. + * That works since we reinterpret 0/~0 as 0.0/NaN. + */ + jay_def flag = I->src[2]; + I->op = JAY_OPCODE_CSEL; + I->conditional_mod = flag.negate ? JAY_CONDITIONAL_EQ : JAY_CONDITIONAL_NE; + I->src[2] = canonicalize_flag(flag); + I->src[2].negate = false; + return true; +} + +static bool +rewrite_without_flag(struct flag_ra *ra, jay_inst *I, unsigned s, bool in_flag) +{ + if (I->op == JAY_OPCODE_PHI_SRC) { + I->src[s] = canonicalize_flag(I->src[s]); + return true; + } + + if (jay_debug & JAY_DBG_NOOPT) { + return false; + } + + if (I->op == JAY_OPCODE_SEL && + (!in_flag || ra->vars[jay_index(I->src[s])].free_canonical) && + !I->predication) { + + return rewrite_sel_with_zero(I, 0) || + rewrite_sel_with_zero(I, 1) || + (!in_flag && rewrite_sel_to_csel(I)); + } + + return false; +} + +static void +assign_block(jay_function *func, jay_block *block, struct var_info *var_to_flag) +{ + jay_builder b = { .shader = func->shader, .func = func }; + struct flag_ra ra_ = { .b = &b, .vars = var_to_flag }, *ra = &ra_; + + jay_foreach_inst_in_block_safe(block, I) { + if (I->op == JAY_OPCODE_CAST_CANONICAL_TO_FLAG) { + /* Assume the source is already 0/~0 canonical and use it. 
*/ + I->op = JAY_OPCODE_MOV; + I->type = JAY_TYPE_U32; + I->dst = canonicalize_flag(I->dst); + continue; + } else if (I->type == JAY_TYPE_U1) { + /* Boolean logic turns into bitwise logic on the canonical form */ + if (!jay_is_null(I->dst)) { + I->dst = canonicalize_flag(I->dst); + } + + jay_foreach_src(I, s) { + if (!(s == 2 && I->op == JAY_OPCODE_SEL) && + jay_src_type(I, s) == JAY_TYPE_U1) { + if (jay_is_imm(I->src[s])) { + /* Convert 1-bit boolean to 0/~0 */ + assert(jay_is_imm(I->src[s]) && jay_as_uint(I->src[s]) <= 1); + I->src[s] = jay_imm(jay_as_uint(I->src[s]) ? ~0 : 0); + } else { + I->src[s] = canonicalize_flag(I->src[s]); + } + } + } + + I->type = JAY_TYPE_U32; + } + + /* Handle flag sources */ + jay_foreach_src(I, s) { + if (!jay_is_flag(I->src[s])) { + continue; + } + + unsigned index = jay_index(I->src[s]); + bool ballot = jay_src_type(I, s) != JAY_TYPE_U1; + enum jay_file file = I->dst.file == UGPR && !ballot ? UFLAG : FLAG; + bool in_flag = ra->flag_to_global[var_to_flag[index].flag] == index && + ((file == UFLAG) == var_to_flag[index].uniform); + + /* If we don't actually need the flag, we're done. */ + if (rewrite_without_flag(ra, I, s, in_flag)) { + continue; + } + + /* Otherwise, ensure we have the value in a flag. */ + if (!in_flag) { + jay_def tmp = assign_flag(ra, I->src[s], file, false, ballot); + + /* XXX: We need a more systematic approach to modifiers :/ */ + b.cursor = jay_before_inst(I); + jay_def d = I->src[s]; + d.negate = false; + jay_CMP(&b, JAY_TYPE_U32, JAY_CONDITIONAL_NE, tmp, + canonicalize_flag(d), 0); + } + + /* ...and rewrite to use the flag */ + unsigned reg = var_to_flag[index].flag; + jay_def flag = jay_scalar(file, ra->flag_to_local[reg]); + flag.reg = reg; + jay_replace_src(&I->src[s], flag); + } + + /* Handle flag writes */ + b.cursor = jay_after_inst(I); + + /* If the flag is written directly (for an inverse ballot), recover the + * canonical representation with a SEL. + */ + if (!jay_is_null(I->dst) && jay_is_flag(I->dst)) { + jay_def canonical = canonicalize_flag(I->dst); + I->dst = assign_flag(ra, I->dst, I->dst.file, false, false); + jay_SEL(&b, JAY_TYPE_U32, canonical, ~0, 0, I->dst); + } + + if (!jay_is_null(I->cond_flag)) { + I->broadcast_flag = + var_to_flag[jay_index(I->cond_flag)].read_by_predication && + I->cond_flag.file == UFLAG && + I->op == JAY_OPCODE_CMP; + + jay_def canonical = canonicalize_flag(I->cond_flag); + I->cond_flag = + assign_flag(ra, I->cond_flag, + I->broadcast_flag ? FLAG : I->cond_flag.file, + I->op == JAY_OPCODE_CMP, false); + + if (I->op == JAY_OPCODE_CMP) { + assert(jay_is_null(I->dst)); + + if (I->broadcast_flag) { + /* We need to recover the UGPR from the replicated FLAG. Thanks + * to our write-masking and broadcasting, the flag is already + * 0/~0. We simply need to sign-extend. + */ + jay_i2i32(&b, canonical, b.shader->dispatch_width, I->cond_flag); + } else if (jay_type_size_bits(I->type) != 32) { + I->dst = jay_alloc_def(&b, canonical.file, + jay_type_vector_length(I->type)); + jay_i2i32(&b, canonical, jay_type_size_bits(I->type), I->dst); + } else { + /* 32-bit CMP returns the canonical form */ + I->dst = canonical; + } + } else { + assert(jay_type_size_bits(I->type) == 32 && "limited cmod prop"); + + if (jay_is_null(I->dst)) { + I->dst = jay_alloc_def(&b, canonical.file, + jay_type_vector_length(I->type)); + } + + /* Recover the canonical representation with a CMP. Hopefully, + * either the CMP or the cmod will be eliminated by a later DCE. 
+ */ + jay_CMP(&b, I->type, I->conditional_mod, canonical, I->dst, 0) + ->cond_flag.reg = + jay_num_regs(b.shader, FLAG) - 1; // TODO: no null flag + } + } + } + + /* Ballots require zeroing flags */ + b.cursor = jay_before_block(block); + u_foreach_bit(i, ra->ballots) { + jay_ZERO_FLAG(&b, i); + } +} + +static void +copyprop(jay_function *f) +{ + jay_inst **defs = calloc(f->ssa_alloc, sizeof(defs[0])); + + jay_foreach_inst_in_func_safe(f, block, I) { + jay_foreach_dst_index(I, _, d) { + defs[d] = I; + } + + if (I->op == JAY_OPCODE_PHI_SRC || I->op == JAY_OPCODE_SEND) + continue; + + jay_foreach_ssa_src(I, s) { + jay_def src = I->src[s]; + if (src.collect) + continue; + + jay_inst *def = defs[jay_base_index(src)]; + if (jay_defs_equivalent(def->dst, src) && + !def->predication && + def->op == JAY_OPCODE_MOV && + (I->src[s].file == def->src[0].file || + (I->op == JAY_OPCODE_CMP && jay_is_imm(def->src[0])))) { + + jay_replace_src(&I->src[s], def->src[0]); + } + } + } + + free(defs); +} + +void +jay_assign_flags(jay_shader *s) +{ + jay_foreach_function(s, f) { + struct var_info *map = calloc(f->ssa_alloc, sizeof(map[0])); + uint32_t *def_to_block = calloc(f->ssa_alloc, sizeof(def_to_block)); + + jay_foreach_inst_in_func(f, block, I) { + if (!jay_is_null(I->cond_flag)) { + def_to_block[jay_index(I->cond_flag)] = block->index + 1; + } + + if (I->predication) { + jay_def predicate = *jay_inst_get_predicate(I); + if (def_to_block[jay_index(predicate)] == block->index + 1) { + map[jay_index(predicate)].read_by_predication = true; + } + } + } + + jay_foreach_block(f, b) { + assign_block(f, b, map); + } + + free(map); + free(def_to_block); + + /* Flag RA leaves moves. Clean up after ourselves. */ + copyprop(f); + } +} +/* TODO: revisit + * dEQP-GLES3.functional.shaders.arrays.compare.equal_highp_vec4_highp_vec4_vertex + */ diff --git a/src/intel/compiler/jay/jay_builder.h b/src/intel/compiler/jay/jay_builder.h new file mode 100644 index 00000000000..a65b826e9f2 --- /dev/null +++ b/src/intel/compiler/jay/jay_builder.h @@ -0,0 +1,643 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "compiler/brw/brw_eu.h" +#include "compiler/brw/brw_eu_defines.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "jay_ir.h" +#include "jay_opcodes.h" + +/* Like in NIR, for use with the builder */ +enum jay_cursor_option { + jay_cursor_after_block, + jay_cursor_before_inst, + jay_cursor_after_inst +}; + +typedef struct PACKED { + union { + jay_block *block; + jay_inst *inst; + }; + + enum jay_cursor_option option; +} jay_cursor; + +static inline bool +jay_cursors_equal(jay_cursor a, jay_cursor b) +{ + return !memcmp(&a, &b, sizeof(a)); +} + +static inline jay_cursor +jay_after_block(jay_block *block) +{ + return (jay_cursor) { .block = block, .option = jay_cursor_after_block }; +} + +static inline jay_cursor +jay_before_inst(jay_inst *I) +{ + return (jay_cursor) { .inst = I, .option = jay_cursor_before_inst }; +} + +static inline jay_cursor +jay_after_inst(jay_inst *I) +{ + return (jay_cursor) { .inst = I, .option = jay_cursor_after_inst }; +} + +static inline jay_cursor +jay_before_block(jay_block *block) +{ + jay_foreach_inst_in_block(block, I) { + if (I->op != JAY_OPCODE_PHI_DST && + I->op != JAY_OPCODE_PRELOAD && + I->op != JAY_OPCODE_ELSE) + return jay_before_inst(I); + } + + /* Whole block is phis, so insert at the end */ + return jay_after_block(block); +} + +static inline jay_cursor +jay_after_block_logical(jay_block *block) +{ + 
jay_foreach_inst_in_block_rev(block, I) { + if (I->op != JAY_OPCODE_PHI_SRC && !jay_op_is_control_flow(I->op)) + return jay_after_inst(I); + } + + /* Whole block is phis, so insert at the start */ + return jay_before_block(block); +} + +static inline jay_cursor +jay_before_jump(jay_block *block) +{ + jay_inst *jump = jay_block_ending_jump(block); + return jump ? jay_before_inst(jump) : jay_after_block(block); +} + +/* Get a cursor at the start of a function, after any preloads */ +static inline jay_cursor +jay_before_function(jay_function *f) +{ + jay_block *block = jay_first_block(f); + + jay_foreach_inst_in_block(block, I) { + if (I->op != JAY_OPCODE_PRELOAD) + return jay_before_inst(I); + } + + /* The whole block is preloads, so insert at the end */ + return jay_after_block(block); +} + +/* + * Map a control flow edge to a block. If the block has one successor, the + * predecessor is unique. Else, the successor is unique; the successor must not + * have other predecessorss since there are no critical edges. + */ +static inline jay_block * +jay_edge_to_block(jay_block *pred, jay_block *succ) +{ + assert(jay_num_successors(pred) == 1 || jay_num_predecessors(succ) == 1); + return jay_num_successors(pred) == 1 ? pred : succ; +} + +/* + * Get a cursor to insert along a control flow edge: either at the start of + * the successor or the end of the predecessor. This relies on the control + * flow graph having no critical edges. + */ +static inline jay_cursor +jay_along_edge(jay_block *pred, jay_block *succ) +{ + jay_block *to = jay_edge_to_block(pred, succ); + + if (to == pred) + return jay_after_block_logical(pred); + else + return jay_before_block(succ); +} + +typedef struct { + jay_shader *shader; + jay_function *func; + jay_cursor cursor; +} jay_builder; + +static inline jay_builder +jay_init_builder(jay_function *f, jay_cursor cursor) +{ + return (jay_builder) { .shader = f->shader, .func = f, .cursor = cursor }; +} + +static inline void +jay_builder_insert(jay_builder *b, jay_inst *I) +{ + jay_cursor *cursor = &b->cursor; + + if (cursor->option == jay_cursor_after_inst) { + list_add(&I->link, &cursor->inst->link); + } else if (cursor->option == jay_cursor_after_block) { + list_addtail(&I->link, &cursor->block->instructions); + } else { + assert(cursor->option == jay_cursor_before_inst); + list_addtail(&I->link, &cursor->inst->link); + } + + cursor->option = jay_cursor_after_inst; + cursor->inst = I; +} + +static inline jay_def +jay_alloc_def(jay_builder *b, enum jay_file file, unsigned size) +{ + unsigned idx = b->func->ssa_alloc; + b->func->ssa_alloc += size; + return jay_contiguous_def(file, idx, size); +} + +/* + * Collect SSA indices into a source. If the indices are not contiguous, this + * uses a heap-allocated collect. Otherwise, a contiguous def is used. + */ +static inline jay_def +jay_collect(jay_builder *b, + enum jay_file file, + const uint32_t *indices, + unsigned nr) +{ + if (nr == 0) + return jay_null(); + + for (unsigned i = 1; i < nr; ++i) { + if (indices[i] != (indices[0] + i)) { + static_assert(sizeof(uintptr_t) <= sizeof(uint64_t) && + "sorry, no Morello support"); + void *dup = + linear_memdup(b->shader->lin_ctx, indices, sizeof(uint32_t) * nr); + uint64_t payload = (uintptr_t) dup; + + /* We require pointers to fit within (32+JAY_REG_BITS) bits. Luckily + * this will always be the case on common architectures. 
+ */ + assert(payload < (1ull << (32 + JAY_REG_BITS))); + + return (jay_def) { + ._payload = (uint32_t) payload, + .reg = (uint32_t) (payload >> 32), + .file = file, + .num_values_m1 = nr - 1, + .collect = true, + }; + } + } + + return jay_contiguous_def(file, indices[0], nr); +} + +/* + * Set the n'th channel of a def to index. This requires a copy-on-write. + * + * This implementation could likely be optimized. + */ +static inline void +jay_insert_channel(jay_builder *b, jay_def *d, unsigned c, jay_def scalar) +{ + uint32_t indices[JAY_MAX_DEF_LENGTH]; + uint32_t count = jay_num_values(*d); + + assert(scalar.file == d->file && !scalar.negate && !scalar.abs); + assert(c < count && count <= ARRAY_SIZE(indices)); + + /* First, decompress the def. */ + jay_foreach_comp(*d, i) { + indices[i] = jay_channel(*d, i); + } + + /* Next, update the indices in place */ + indices[c] = jay_index(scalar); + + /* Now collect it back. */ + jay_replace_src(d, jay_collect(b, d->file, indices, count)); +} + +/* + * Concatenate a list of vectors, collecting all the indices in order. + */ +static inline jay_def +jay_collect_vectors(jay_builder *b, jay_def *vecs, uint32_t nr) +{ + uint32_t indices[JAY_MAX_DEF_LENGTH]; + uint32_t nr_indices = 0; + + for (unsigned i = 0; i < nr; ++i) { + assert(vecs[i].file == vecs[0].file && jay_is_ssa(vecs[i])); + assert(!vecs[i].negate && !vecs[i].abs); + + jay_foreach_comp(vecs[i], c) { + assert(nr_indices < ARRAY_SIZE(indices)); + indices[nr_indices++] = jay_channel(vecs[i], c); + } + } + + return jay_collect(b, vecs[0].file, indices, nr_indices); +} + +static inline jay_def +jay_collect_two(jay_builder *b, jay_def u, jay_def v) +{ + jay_def vecs[] = { u, v }; + return jay_collect_vectors(b, vecs, 2); +} + +static inline jay_inst * +jay_alloc_inst(jay_builder *b, + enum jay_opcode op, + uint8_t num_srcs, + unsigned extra_bytes) +{ + const size_t size = + offsetof(jay_inst, src) + num_srcs * sizeof(jay_def) + extra_bytes; + + jay_inst *I = (jay_inst *) linear_zalloc_child(b->shader->lin_ctx, size); + I->op = op; + I->num_srcs = num_srcs; + I->dst = jay_null(); + I->cond_flag = jay_null(); + + return I; +} + +static inline void +jay_shrink_sources(jay_inst *I, uint8_t new_num_srcs) +{ + assert(new_num_srcs < I->num_srcs); + unsigned info_size = jay_inst_info_size(I); + + memmove(&I->src[new_num_srcs], &I->src[I->num_srcs], info_size); + I->num_srcs = new_num_srcs; +} + +static inline jay_inst * +jay_clone_inst(jay_builder *b, jay_inst *I, uint8_t new_num_srcs) +{ + assert(new_num_srcs >= I->num_srcs); + unsigned info_size = jay_inst_info_size(I); + + jay_inst *clone = jay_alloc_inst(b, I->op, new_num_srcs, info_size); + + memcpy((uint8_t *) clone + sizeof(struct list_head), + (uint8_t *) I + sizeof(struct list_head), + sizeof(jay_inst) - sizeof(struct list_head)); + + clone->num_srcs = new_num_srcs; + + memcpy(clone->src, I->src, I->num_srcs * sizeof(jay_def)); + memcpy(&clone->src[new_num_srcs], &I->src[I->num_srcs], info_size); + return clone; +} + +static inline jay_inst * +jay_grow_sources(jay_builder *b, jay_inst *I, uint8_t new_num_srcs) +{ + jay_inst *clone = jay_clone_inst(b, I, new_num_srcs); + + if ((b->cursor.option == jay_cursor_before_inst || + b->cursor.option == jay_cursor_after_inst) && + b->cursor.inst == I) { + + b->cursor.inst = clone; + } + + jay_builder b_ = jay_init_builder(b->func, jay_before_inst(I)); + jay_builder_insert(&b_, clone); + jay_remove_instruction(I); + return clone; +} + +static inline jay_inst * +jay_add_predicate_else(jay_builder *b, + 
jay_inst *I, + jay_def predicate, + jay_def default_value) +{ + assert(!I->predication && "pre-condition"); + assert(jay_is_flag(predicate) && jay_is_ssa(default_value)); + + unsigned pred_index = I->num_srcs; + I = jay_grow_sources(b, I, pred_index + 2); + I->src[pred_index] = predicate; + I->src[pred_index + 1] = default_value; + I->predication = JAY_PREDICATED_DEFAULT; + return I; +} + +static inline jay_inst * +jay_add_predicate(jay_builder *b, jay_inst *I, jay_def predicate) +{ + assert(!I->predication && "pre-condition"); + assert(jay_is_flag(predicate)); + + unsigned pred_index = I->num_srcs; + I = jay_grow_sources(b, I, pred_index + 1); + I->src[pred_index] = predicate; + I->predication = JAY_PREDICATED; + return I; +} + +static inline jay_inst * +jay_set_cond_flag(jay_builder *b, jay_inst *I, jay_def cond_flag) +{ + assert(jay_is_flag(cond_flag) && jay_is_null(I->cond_flag)); + + I->cond_flag = cond_flag; + return I; +} + +static inline jay_inst * +jay_set_conditional_mod(jay_builder *b, + jay_inst *I, + jay_def cond_flag, + enum jay_conditional_mod cmod) +{ + I->conditional_mod = cmod; + return jay_set_cond_flag(b, I, cond_flag); +} + +static inline jay_def +jay_identity_def(jay_def x) +{ + return x; +} + +#ifdef __cplusplus +static inline jay_def +JAY_BUILD_SRC(jay_def x) +{ + return x; +} +static inline jay_def +JAY_BUILD_SRC(uint32_t x) +{ + return jay_imm(x); +} +#else +#define JAY_BUILD_SRC(X) \ + _Generic((X), \ + jay_def: jay_identity_def, \ + uint32_t: jay_imm, \ + int32_t: jay_imm, \ + uint8_t: jay_imm)(X) +#endif + +/* Include generated builder helpers */ +#include "jay_builder_opcodes.h" + +static inline jay_inst * +_jay_CMP(jay_builder *b, + enum jay_type src_type, + enum jay_conditional_mod cmod, + jay_def dst, + jay_def src0, + jay_def src1) +{ + jay_inst *I = jay_alloc_inst(b, JAY_OPCODE_CMP, 2, 0); + I->type = src_type; + I->src[0] = src0; + I->src[1] = src1; + + /* Even if we want to write a 32-bit 0/~0 result, we still need to + * register-allocate a flag, since the hardware will implicitly clobber one + * regardless. + */ + if (!jay_is_flag(dst)) { + I->dst = dst; + dst = jay_alloc_def(b, dst.file == UGPR ? UFLAG : FLAG, 1); + } + + jay_set_conditional_mod(b, I, dst, cmod); + jay_builder_insert(b, I); + return I; +} + +#define jay_CMP(b, st, cmod, dst, src0, src1) \ + _jay_CMP(b, st, cmod, dst, JAY_BUILD_SRC(src0), JAY_BUILD_SRC(src1)) + +struct jayb_send_params { + enum brw_sfid sfid; + uint64_t msg_desc; + jay_def dst; + jay_def header; + jay_def *srcs; + jay_def desc, ex_desc; + enum jay_type type; + enum jay_type src_type[2]; + unsigned nr_srcs; + uint32_t ex_desc_imm; + bool eot; + bool check_tdr; + bool uniform; + bool bindless; +}; + +static inline jay_inst * +_jay_SEND(jay_builder *b, const struct jayb_send_params p) +{ + const struct intel_device_info *devinfo = b->shader->devinfo; + jay_inst *I = jay_alloc_inst(b, JAY_OPCODE_SEND, 4, sizeof(jay_send_info)); + jay_send_info *info = jay_get_send_info(I); + bool has_header = !jay_is_null(p.header); + + I->dst = p.dst; + I->type = p.type; + + assert(I->type); + info->type_0 = p.src_type[0] ? p.src_type[0] : I->type; + info->type_1 = p.src_type[1] ? p.src_type[1] : info->type_0; + + if (has_header) { + assert(p.nr_srcs == 1 || info->type_0 == info->type_1); + + /* If there is a message header, split the send into
the header and the payload, + * since the header is UGPR but the payload is GPR. + */ + I->src[2] = p.header; + I->src[3] = jay_collect_vectors(b, &p.srcs[0], p.nr_srcs); + info->type_1 = info->type_0; + info->type_0 = JAY_TYPE_U32 /* header type */; + } else if (jay_type_size_bits(info->type_0) == 16 && + !p.uniform && + b->shader->dispatch_width == 32) { + /* Pack 16-bit vectors to match the hardware with the data model. + * + * XXX: This is a hack. Move to NIR for better + * codegen in tests like + * dEQP-GLES31.functional.texture.multisample.samples_4.use_texture_int_2d_array. + */ + assert(info->type_0 == info->type_1); + jay_def srcs[8]; + unsigned n = 0, i; + for (i = 0; i + 2 <= p.nr_srcs; i += 2) { + assert(p.srcs[i].file == p.srcs[i + 1].file); + assert(jay_num_values(p.srcs[i]) == jay_num_values(p.srcs[i + 1])); + + for (unsigned c = 1; c < jay_num_values(p.srcs[i]); ++c) { + assert(jay_channel(p.srcs[i], c) == 0); + assert(jay_channel(p.srcs[i + 1], c) == 0); + } + + jay_def lo = jay_extract(p.srcs[i], 0), + hi = jay_extract(p.srcs[i + 1], 0); + jay_def bfi = jay_BFI2_u32(b, 0xffff0000, hi, lo); + + if (p.srcs[i].file == UGPR) { + uint32_t defs[16] = { jay_index(bfi) }; + srcs[n++] = jay_collect(b, UGPR, defs, jay_ugpr_per_grf(b->shader)); + } else { + srcs[n++] = bfi; + } + } + if (i < p.nr_srcs) { + srcs[n++] = p.srcs[i++]; + } + assert(i == p.nr_srcs); + + I->src[2] = jay_collect_vectors(b, srcs, n); + I->src[3] = jay_null(); + } else if (p.nr_srcs <= 2) { + /* Easy case: keep everything scalar */ + I->src[2] = p.nr_srcs > 0 ? p.srcs[0] : jay_null(); + I->src[3] = p.nr_srcs > 1 ? p.srcs[1] : jay_null(); + } else { + /* Otherwise, we need to pick a point to split at. + * + * Heuristic: don't split render target writes because RA gets confused + * with the EOT requirements. Split everything else in half. + * + * TODO: Come up with a better heuristic. + */ + assert(info->type_0 == info->type_1); + unsigned split = !p.check_tdr ? DIV_ROUND_UP(p.nr_srcs, 2) : p.nr_srcs; + I->src[2] = jay_collect_vectors(b, &p.srcs[0], split); + I->src[3] = jay_collect_vectors(b, &p.srcs[split], p.nr_srcs - split); + } + + /* For message headers we pack a UGPR vector as a single GRF */ + unsigned lens[3]; + for (unsigned i = 0; i < 3; ++i) { + jay_def x = i == 0 ? I->dst : I->src[1 + i]; + lens[i] = jay_num_values(x); + + /* XXX: For the non-transpose uniform case, do we need to pad out + * with undefs for correctness so we don't fall off the side of the + * regfile? for sends like: + * + * (1&W) mov.u32 u10.0, u0.8 | A@1 + (1&W) mov.u32 u10.1, u0.9 | A@1 + (1&W) send.u32 u12, g10, _, 0x04403580, 0x00000000 + ugm MsgDesc: ( load, a64, d32, V4, L1STATE_L3MOCS dst_len = + 4, src0_len = 2, src1_len = 0 flat ) base_offset 0 | A@1 $0 + + * We don't care what's in g11, but it has to *exist*. But that is + * probably implicitly correct as long as the reg file ends with GRFs. 
+ * Which it has to shader)); + } else { + lens[i] *= jay_grf_per_gpr(b->shader); + } + + lens[i] *= reg_unit(devinfo); + } + + info->sfid = p.sfid; + info->eot = p.eot; + info->check_tdr = p.check_tdr; + info->uniform = p.uniform; + info->bindless = p.bindless; + info->ex_desc_imm = p.ex_desc_imm; + info->ex_mlen = lens[2]; + I->src[0] = jay_imm(((uint32_t) p.msg_desc) | + brw_message_desc(devinfo, lens[1], lens[0], has_header)); + + if (!jay_is_null(p.desc)) { + jay_def a = jay_alloc_def(b, J_ADDRESS, 1); + jay_OR(b, JAY_TYPE_U32, a, p.desc, I->src[0]); + I->src[0] = a; + } + + if (jay_is_null(p.ex_desc)) { + I->src[1] = + jay_imm(brw_message_ex_desc(devinfo, lens[2]) | (p.msg_desc >> 32)); + } else if (p.ex_desc.file == J_ADDRESS) { + I->src[1] = p.ex_desc; + } else { + I->src[1] = jay_alloc_def(b, J_ADDRESS, 1); + if (info->bindless) { + jay_MOV(b, I->src[1], p.ex_desc); + } else { + jay_OR(b, JAY_TYPE_U32, I->src[1], p.ex_desc, + brw_message_ex_desc(devinfo, info->ex_mlen)); + } + } + + assert(!info->uniform || jay_is_null(I->dst) || I->dst.file == UGPR); + jay_builder_insert(b, I); + return I; +} + +#define jay_SEND(b, ...) _jay_SEND(b, (struct jayb_send_params) { __VA_ARGS__ }) + +static inline void +jay_copy_strided(jay_builder *b, jay_def dst, jay_def src, bool src_strided) +{ + unsigned src_stride = src_strided ? jay_ugpr_per_grf(b->shader) : 1; + uint32_t n = MIN2(jay_num_values(dst), jay_num_values(src) / src_stride); + + for (unsigned i = 0; i < n; ++i) { + jay_MOV(b, jay_extract(dst, i), jay_extract(src, i * src_stride)); + } +} + +static inline void +jay_copy(jay_builder *b, jay_def dst, jay_def src) +{ + jay_copy_strided(b, dst, src, false); +} + +static inline jay_def +jay_as_gpr(jay_builder *b, jay_def src) +{ + if (src.file == GPR || jay_is_null(src)) + return src; + + jay_def def = jay_alloc_def(b, GPR, jay_num_values(src)); + jay_copy(b, def, src); + return def; +} + +static inline void +jay_i2i32(jay_builder *b, jay_def dst, unsigned src_bits, jay_def src) +{ + if (src_bits < 32) { + jay_CVT(b, JAY_TYPE_S32, dst, src, jay_type(JAY_TYPE_S, src_bits), + JAY_ROUND, 0); + } else if (src_bits == 32) { + jay_MOV(b, dst, src); + } else { + assert(src.reg == 0 && ".reg not preserved in this path but that's OK"); + jay_MOV(b, dst, jay_extract(src, 0)); + } +} diff --git a/src/intel/compiler/jay/jay_builder_opcodes.h.py b/src/intel/compiler/jay/jay_builder_opcodes.h.py new file mode 100644 index 00000000000..735a653f08e --- /dev/null +++ b/src/intel/compiler/jay/jay_builder_opcodes.h.py @@ -0,0 +1,153 @@ +# Copyright 2026 Intel Corporation +# SPDX-License-Identifier: MIT + +from typing import TYPE_CHECKING +import argparse +import sys + +from mako import exceptions +from mako.template import Template + +from jay_opcodes import OPCODES + +if TYPE_CHECKING: + from jay_opcodes import Opcode + + +def infer_type(op: 'Opcode') -> bool: + return op.has_dest and (set(op.types) <= set(["u1", "u32", "u64"]) or + op.name == 'mov') + + +def signature(op: 'Opcode', with_dest: bool = True, with_types: bool = False, + mode: str = 'prototype', type_: str = 't', src: str = '{}') -> str: + arr = [('jay_builder *', 'b')] + + if with_types and len(op.types) > 1 and not infer_type(op): + arr += [('enum jay_type', type_)] + + if with_dest and op.has_dest: + arr += [('jay_def', 'dst')] + + arr += [('jay_def', src.format(f'src{i}')) for i in range(op.num_srcs)] + arr += [x for x in op.extra_struct if not x[1].startswith('pad')] + + return ', '.join([(t + ' ' if mode == 'prototype' else '') + v for t, 
v in arr]) + + +TEMPLATE = """ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ +#pragma once + +#include "jay_private.h" + +#ifndef NDEBUG +#define type_assert(op, ...) if (!(__VA_ARGS__)) { fprintf(stderr, "%s does not allow type: ", #op); jay_print_type(stderr, t); fprintf(stderr, "\\n"); } assert(__VA_ARGS__) +#else +#define type_assert(...) +#endif + +% for op in opcodes.values(): +<% + OPCODE = op.name.upper() + num_srcs = op.num_srcs + has_dest = op.has_dest + multi_type = len(op.types) > 1 + info_size = f'sizeof(jay_{op.name}_info)' if op.extra_struct else '0' + operands = ["dst"] + [f"src{i}" for i in range(num_srcs)] + if num_srcs > 0: + uniform = " && " .join([f"jay_is_uniform(src{i})" for i in range(num_srcs)]) + reg_file = f"({uniform}) ? UGPR : GPR" + else: + reg_file = "GPR" + if not op.types: + continue + # Ignore the lane index when determining the type of a shuffle + infer_operands = operands[0:-1] if op.name == "shuffle" else operands +%> +static inline jay_inst * +_jay_${OPCODE}(${signature(op, with_types = True)}) +{ +% if infer_type(op): + enum jay_type t = jay_num_values(dst) == 2 ? JAY_TYPE_U64 : + ${" && ".join([f"(jay_is_flag({x}) || jay_is_imm({x}))" for x in infer_operands])} + ? JAY_TYPE_U1 : JAY_TYPE_U32; +% elif multi_type: + type_assert(${OPCODE}, 0 +% for type in op.types: + || t == JAY_TYPE_${type.upper()} +% endfor + ); + +% else: + enum jay_type t = JAY_TYPE_${op.types[0].upper()}; + +% endif + jay_inst *inst = jay_alloc_inst(b, JAY_OPCODE_${OPCODE}, ${num_srcs}, ${info_size}); +% for _, prop in op.extra_struct: +% if not prop.startswith('pad'): + jay_set_${op.name}_${prop}(inst, ${prop}); +% endif +% endfor + + inst->type = t; +% if op.has_dest: + inst->dst = dst; +% endif +% for i in range(num_srcs): + inst->src[${i}] = src${i}; +% endfor + + jay_builder_insert(b, inst); + return inst; +} + +#define jay_${OPCODE}(${signature(op, with_types = True, mode = 'call')}) _jay_${OPCODE}(${signature(op, with_types = True, src = 'JAY_BUILD_SRC({})', mode='call')}) + +% for type in op.types: +static inline ${'jay_def' if op.has_dest else 'void'} +_jay_${OPCODE}_${type}(${signature(op, with_dest = False)}) +{ +% if op.has_dest: + jay_def dst = jay_alloc_def(b, ${reg_file}, ${2 if '64' in type else 1}); +%endif + jay_${OPCODE}(${signature(op, with_types = True, type_ = 'JAY_TYPE_'+type.upper(), mode = 'call')}); +% if op.has_dest: + return dst; +% endif +} +#define jay_${OPCODE}_${type}(${signature(op, with_dest = False, mode = +'call')}) _jay_${OPCODE}_${type}(${signature(op, src='JAY_BUILD_SRC({})', mode = 'call', with_dest = False)}) +% endfor + +% endfor + +#undef type_assert +""" + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument('output', action='store') + args = parser.parse_args() + + ops = {op: v for (op, v) in OPCODES.items() if op not in {'cmp', 'send'}} + + try: + with open(args.output, 'w', encoding='utf-8') as f: + f.write(Template(TEMPLATE).render( + opcodes=ops, + signature=signature, + infer_type=infer_type)) + except Exception: + print(exceptions.text_error_template().render()) + return 1 + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/intel/compiler/jay/jay_extra_info.h.py b/src/intel/compiler/jay/jay_extra_info.h.py new file mode 100644 index 00000000000..cffe74fe5eb --- /dev/null +++ b/src/intel/compiler/jay/jay_extra_info.h.py @@ -0,0 +1,153 @@ +# Copyright 2026 Intel Corporation +# SPDX-License-Identifier: MIT + +import argparse +import sys + +from 
mako import exceptions +from mako.template import Template + +from jay_opcodes import OPCODES, ENUMS + +TEMPLATE = """/* Do not include directly */ +PRAGMA_DIAGNOSTIC_PUSH +PRAGMA_DIAGNOSTIC_ERROR(-Wpadded) + +% for enum, (prefix, values) in enums.items(): +% if enum.startswith('jay'): +enum PACKED ${enum} { +% for v in values: + ${prefix}_${v.upper()}, +% endfor +}; +% endif +% endfor + +% for name, op in opcodes: +typedef struct jay_${name}_info { +% for T, prop in op.extra_struct: + ${T} ${prop}; +% endfor +} jay_${name}_info; + +% for prefix, _suffix in [('const ', '_const'), ('', '')]: +static inline ${prefix} struct jay_${name}_info * +jay_get_${name}_info${_suffix}(${prefix}jay_inst *I) +{ + assert(I->op == JAY_OPCODE_${name.upper()}); + return (${prefix}struct jay_${name}_info *) &I->src[I->num_srcs]; +} + +% endfor +% for T, prop in op.extra_struct: +% if not prop.startswith('pad'): +static inline ${T} +jay_${name}_${prop}(const jay_inst *I) +{ + return jay_get_${name}_info_const(I)->${prop}; +} + +static inline void +jay_set_${name}_${prop}(jay_inst *I, ${T} value) +{ + jay_get_${name}_info(I)->${prop} = value; +} + +% endif +% endfor +% endfor + +static inline unsigned +jay_inst_info_size(jay_inst *I) +{ + switch (I->op) { +% for name, op in opcodes: + case JAY_OPCODE_${name.upper()}: return sizeof(struct jay_${name}_info); +% endfor + default: return 0; + } +} + +#ifndef __cplusplus +static inline const char * +jay_print_inst_info(FILE *fp, const jay_inst *I, const char *sep) +{ + switch (I->op) { +% for name, op in opcodes: + case JAY_OPCODE_${name.upper()}: { +% for T, prop in op.extra_struct: +% if not (prop.startswith('pad') or name == 'bfn' or T == 'enum jay_type'): +<% + value = f"jay_{name}_{prop}(I)" + spec = '0x%"PRIx64"' if T == 'uint64_t' else "%u" +%> +% if T.startswith('enum') and T[5:] in enums: +<% + bare = T[5:] + prefix, values = enums[bare] +%> + const char *${bare}_str[] = { +% for v in values: + [${prefix}_${v.upper()}] = "${v}", +% endfor + }; + assert(${value} < ARRAY_SIZE(${bare}_str)); +<% + spec = "%s" + value = f'{T[5:]}_str[{value}]' +%> +% endif +% if T == 'enum jay_rounding_mode': + if (strcmp(${value}, "round")) { + fprintf(fp, "%s%s", sep, ${value}); + sep = ", "; + } +% elif T == 'bool': + if (${value}) { + fprintf(fp, "%s${prop}", sep); + sep = ", "; + } +% elif T.startswith('enum') or len(op.extra_struct) == 1: + fprintf(fp, "%s${spec}", sep, ${value}); + sep = ", "; +% else: + if (${value}) { + fprintf(fp, "%s${prop}=${spec}", sep, ${value}); + sep = ", "; + } +% endif +% endif +% endfor + break; + } +% endfor + default: break; + } + + return sep; +} +#endif + +PRAGMA_DIAGNOSTIC_POP +""" + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument('output', action='store') + args = parser.parse_args() + + try: + with open(args.output, 'w', encoding='utf-8') as f: + f.write(Template(TEMPLATE).render( + opcodes=[(k, v) for k, v in OPCODES.items() if v.extra_struct], + enums=ENUMS)) + except Exception: + print(exceptions.text_error_template().render()) + return 1 + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c new file mode 100644 index 00000000000..de24701b7ad --- /dev/null +++ b/src/intel/compiler/jay/jay_from_nir.c @@ -0,0 +1,3838 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "compiler/brw/brw_compiler.h" +#include "compiler/brw/brw_eu.h" +#include 
"compiler/brw/brw_eu_defines.h" +#include "compiler/brw/brw_nir.h" +#include "compiler/brw/brw_private.h" +#include "compiler/brw/brw_sampler.h" +#include "compiler/intel_nir.h" +#include "compiler/intel_shader_enums.h" +#include "compiler/list.h" +#include "intel/dev/intel_debug.h" +#include "util/bitpack_helpers.h" +#include "util/bitscan.h" +#include "util/bitset.h" +#include "util/lut.h" +#include "util/macros.h" +#include "util/u_math.h" +#include "intel_device_info_gen.h" +#include "jay.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" +#include "nir.h" +#include "nir_builder.h" +#include "nir_builder_opcodes.h" +#include "nir_defines.h" +#include "nir_intrinsics.h" +#include "nir_intrinsics_indices.h" +#include "nir_opcodes.h" +#include "shader_enums.h" +#include "shader_stats.h" + +static const struct debug_named_value jay_debug_options[] = { + { "noopt", JAY_DBG_NOOPT, "Disable backend optimizer" }, + { "printdemand", JAY_DBG_PRINTDEMAND, "Print demand per instruction" }, + { "spill", JAY_DBG_SPILL, "Shrink register file to test spilling" }, + { "sync", JAY_DBG_SYNC, "Sync after every instruction" }, + DEBUG_NAMED_VALUE_END +}; + +DEBUG_GET_ONCE_FLAGS_OPTION(jay_debug, "JAY_DEBUG", jay_debug_options, 0) +int jay_debug = 0; + +typedef struct jay_vs_payload { + /* "the maximum limit is 30 elements per vertex" (bspec 56124) */ + jay_def attributes[30 * 4]; +} jay_vs_payload; + +typedef struct jay_cs_payload { + jay_def local_invocation_ids; +} jay_cs_payload; + +typedef struct jay_fs_payload { + jay_def bary[INTEL_BARYCENTRIC_MODE_COUNT]; + + struct { + jay_def xy, z, w; + } coord; + + jay_def pixel_sample_mask; + jay_def deltas[64]; +} jay_fs_payload; + +struct nir_to_jay_state { + jay_shader *s; + jay_function *f; + const nir_shader *nir; + const struct intel_device_info *devinfo; + + jay_builder bld; + + jay_block *current_block; + jay_block *after_block; + jay_block *break_block; + + unsigned indent; + + /* We cache ballot(true), ctz(ballot(true)), and 4*ctz(ballot(true)) within a + * block. If we had competent backend CSE - or emitted uniformize in NIR and + * taught NIR's CSE about ballots - we could remove this kludge. + */ + jay_def active_lane_mask, active_lane, active_lane_x4; + + /* These defs contain the extracted payload. They are only valid while + * translating NIR->Jay since they aren't maintained by Jay passes. + */ + struct { + jay_def u0, u1; + jay_def sampler_state_pointer, scratch_surface; + jay_def inline_data; + jay_def push_data[512]; + jay_def lane_id; + jay_def urb_handle; + + union { + jay_vs_payload vs; + jay_cs_payload cs; + jay_fs_payload fs; + }; + } payload; +}; + +static jay_def +payload_u1(struct nir_to_jay_state *nj, unsigned idx, unsigned len) +{ + if (jay_is_null(nj->payload.u1)) + return jay_null(); + else + return jay_extract_range(nj->payload.u1, idx, len); +} + +static jay_def +emit_active_lane_mask(struct nir_to_jay_state *nj) +{ + /* TODO: We don't use jay_exec_mask yet due to hardware issues */ + if (jay_is_null(nj->active_lane_mask)) { + nj->active_lane_mask = jay_alloc_def(&nj->bld, FLAG, 1); + jay_MOV(&nj->bld, nj->active_lane_mask, 1); + } + + return nj->active_lane_mask; +} + +static jay_def +emit_active_lane(struct nir_to_jay_state *nj) +{ + /* For this instruction to execute, some lane must be active. 
Therefore there + * is a 1 in the lower [dispatch width] bits of the lane mask, so we may + * equivalently use fbl.u32 instead of fbl.u[dispatch width]. + */ + if (jay_is_null(nj->active_lane)) { + nj->active_lane = jay_alloc_def(&nj->bld, UGPR, 1); + jay_FBL(&nj->bld, nj->active_lane, emit_active_lane_mask(nj)); + } + + return nj->active_lane; +} + +static jay_def +emit_uniformize(struct nir_to_jay_state *nj, jay_def x) +{ + jay_builder *b = &nj->bld; + if (x.file != GPR && x.file != FLAG) { + return x; + } + + if (jay_is_null(nj->active_lane_x4)) { + nj->active_lane_x4 = jay_SHL_u32(b, emit_active_lane(nj), 2); + } + + jay_def u = jay_alloc_def(b, x.file == FLAG ? UFLAG : UGPR, 1); + jay_SHUFFLE(b, u, x, nj->active_lane_x4); + return u; +} + +static jay_block *jay_emit_cf_list(struct nir_to_jay_state *nj, + struct exec_list *list); + +/** Returns true if the entire compute workgroup fits in a single subgroup. */ +static bool +jay_workgroup_is_one_subgroup(jay_builder *b, const nir_shader *nir) +{ + return mesa_shader_stage_uses_workgroup(nir->info.stage) && + !nir->info.workgroup_size_variable && + nir_static_workgroup_size(nir) <= b->shader->dispatch_width; +} + +static enum jay_type +jay_base_type_for_nir(nir_alu_type nir_type) +{ + /* clang-format off */ + switch (nir_alu_type_get_base_type(nir_type)) { + case nir_type_int: return JAY_TYPE_S; + case nir_type_uint: return JAY_TYPE_U; + case nir_type_bool: return JAY_TYPE_S; + case nir_type_float: return JAY_TYPE_F; + default: UNREACHABLE("invalid NIR type"); + } + /* clang-format on */ +} + +static enum jay_file +jay_file_for_def(const nir_def *def) +{ + return def->bit_size == 1 ? (def->divergent ? FLAG : UFLAG) : + (def->divergent ? GPR : UGPR); +} + +/** + * Returns an jay_type for the ALU op's i-th source. + * (Useful for conversions and comparisons.) + */ +static enum jay_type +jay_alu_source_type(nir_alu_instr *alu, unsigned i) +{ + return jay_type(jay_base_type_for_nir(nir_op_infos[alu->op].input_types[i]), + nir_src_bit_size(alu->src[i].src)); +} + +static inline jay_def +nj_def(nir_def *def) +{ + unsigned bits = def->num_components * MAX2(def->bit_size, 32); + unsigned words = DIV_ROUND_UP(bits, 32); + + return jay_contiguous_def(jay_file_for_def(def), def->index, words); +} + +static inline jay_def +nj_src(nir_src src) +{ + return nj_def(src.ssa); +} + +static void +jay_emit_alu(struct nir_to_jay_state *nj, nir_alu_instr *alu) +{ + jay_builder *b = &nj->bld; + jay_def dst = nj_def(&alu->def); + + nir_alu_type nir_type = nir_op_infos[alu->op].output_type; + enum jay_type base_type = jay_base_type_for_nir(nir_type); + enum jay_type type = jay_type(base_type, alu->def.bit_size); + + jay_def src[NIR_ALU_MAX_INPUTS]; + for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { + unsigned len = nir_src_bit_size(alu->src[i].src) == 64 ? 
2 : 1; + src[i] = jay_extract_range(nj_src(alu->src[i].src), + len * alu->src[i].swizzle[0], len); + } + + switch (alu->op) { +#define CMP(op, jay) \ + case nir_op_##op: \ + jay_CMP(b, jay_alu_source_type(alu, 0), JAY_CONDITIONAL_##jay, dst, \ + src[0], src[1]); \ + break; + +#define UNOP(nir, jay_op) \ + case nir_op_##nir: \ + jay_##jay_op(b, type, dst, src[0]); \ + break; + +#define MATH(nir, jay_op) \ + case nir_op_##nir: \ + jay_MATH(b, type, dst, src[0], JAY_MATH_##jay_op); \ + break; + +#define UNOP_UNTYPED(nir, jay_op) \ + case nir_op_##nir: \ + jay_##jay_op(b, dst, src[0]); \ + break; + +#define BINOP(nir, jay_op) \ + case nir_op_##nir: \ + jay_##jay_op(b, type, dst, src[0], src[1]); \ + break; + +#define DP4A(nir, jay_op, sat_) \ + case nir_op_##nir: \ + jay_DP4A_##jay_op(b, dst, src[2], src[0], src[1])->saturate = sat_; \ + break; + + CMP(flt, LT) + CMP(ilt, LT) + CMP(ult, LT) + CMP(fge, GE) + CMP(ige, GE) + CMP(uge, GE) + CMP(feq, EQ) + CMP(ieq, EQ) + CMP(fneu, NE) + CMP(ine, NE) + + MATH(frcp, INV) + MATH(fexp2, EXP) + MATH(flog2, LOG) + MATH(fsin, SIN) + MATH(fcos, COS) + MATH(fsqrt, SQRT) + MATH(frsq, RSQ) + UNOP(ffract, FRC) + UNOP(ftrunc, RNDZ) + UNOP(ffloor, RNDD) + UNOP(fround_even, RNDE) + + UNOP_UNTYPED(mov, copy) + UNOP_UNTYPED(unpack_32_2x16_split_x, MOV) + UNOP_UNTYPED(b2b1, CAST_CANONICAL_TO_FLAG) + UNOP_UNTYPED(inot, NOT) + UNOP_UNTYPED(bitfield_reverse, BFREV) + UNOP_UNTYPED(bit_count, CBIT) + UNOP_UNTYPED(uclz, LZD) + UNOP_UNTYPED(find_lsb, FBL) + + BINOP(imin, MIN) + BINOP(umin, MIN) + BINOP(fmin, MIN) + BINOP(imax, MAX) + BINOP(umax, MAX) + BINOP(fmax, MAX) + BINOP(fadd, ADD) + BINOP(iadd, ADD) + BINOP(fmul, MUL) + BINOP(imul_32x16, MUL_32X16) + BINOP(umul_32x16, MUL_32X16) + BINOP(ishl, SHL) + BINOP(ishr, ASR) + BINOP(ushr, SHR) + BINOP(urol, ROL) + BINOP(uror, ROR) + BINOP(urhadd, AVG) + BINOP(irhadd, AVG) + BINOP(iand, AND) + BINOP(ior, OR) + BINOP(ixor, XOR) + + DP4A(sdot_4x8_iadd, SS, false) + DP4A(sdot_4x8_iadd_sat, SS, true) + DP4A(udot_4x8_uadd, UU, false) + DP4A(udot_4x8_uadd_sat, UU, true) + DP4A(sudot_4x8_iadd, SU, false) + DP4A(sudot_4x8_iadd_sat, SU, true) + +#undef CMP +#undef UNOP +#undef UNOP_UNTYPED +#undef BINOP +#undef DP4A + + case nir_op_imul: + if (jay_type_size_bits(type) == 32) { + jay_MUL_32(b, type, dst, src[0], src[1], false); + } else { + jay_MUL(b, type, dst, src[0], src[1]); + } + + break; + + case nir_op_imul_high: + case nir_op_umul_high: + jay_MUL_32(b, type, dst, src[0], src[1], true); + break; + + case nir_op_bfm: + jay_BFI1(b, dst, src[0], src[1]); + break; + + case nir_op_b2f64: + jay_SEL(b, JAY_TYPE_U32, jay_extract(dst, 1), 0x3ff00000, 0, src[0]); + jay_MOV(b, jay_extract(dst, 0), 0); + break; + + case nir_op_ufind_msb_rev: + case nir_op_ifind_msb_rev: + jay_FBH(b, jay_alu_source_type(alu, 0), dst, src[0]); + break; + + case nir_op_u2u8: + case nir_op_u2u16: + case nir_op_u2u32: + case nir_op_i2i8: + case nir_op_i2i16: + case nir_op_i2i32: + assert(nir_src_bit_size(alu->src[0].src) > 1 && + "predicate conversions are lowered"); + + if (alu->def.bit_size <= nir_src_bit_size(alu->src[0].src)) { + /* Downconversion. Upper bits garbage convention makes this a no-op. + * The extract handles 64->32 narrowing conversions. 
+ */ + jay_MOV(b, dst, jay_extract(src[0], 0)); + break; + } + + FALLTHROUGH; + case nir_op_i2f64: + case nir_op_i2i64: + case nir_op_u2u64: + case nir_op_u2f64: + case nir_op_f2f64: + case nir_op_f2i64: + case nir_op_f2u64: + case nir_op_f2i32: + case nir_op_f2u32: + case nir_op_f2i32_sat: + case nir_op_f2u32_sat: + case nir_op_i2f32: + case nir_op_u2f32: + case nir_op_f2f32: + case nir_op_i2f16: + case nir_op_u2f16: + case nir_op_f2f16: + case nir_op_f2i16: + case nir_op_f2u16: + case nir_op_f2i8: + case nir_op_f2u8: { + enum jay_type src_type = jay_alu_source_type(alu, 0); + + /* UGPR byte to float is not supported. Do it in 2 steps. */ + if (jay_type_size_bits(src_type) == 8 && + jay_base_type(type) == JAY_TYPE_F && + dst.file == UGPR) { + + enum jay_type integer = jay_type_rebase(type, jay_base_type(src_type)); + jay_def tmp = jay_alloc_def(b, UGPR, 1); + jay_CVT(b, integer, tmp, src[0], src_type, JAY_ROUND, 0); + jay_CVT(b, type, dst, tmp, integer, JAY_ROUND, 0); + } else { + jay_CVT(b, type, dst, src[0], src_type, JAY_ROUND, 0); + } + + break; + } + + case nir_op_f2f16_rtne: + case nir_op_f2f16_rtz: + jay_CVT(b, JAY_TYPE_F16, dst, src[0], jay_alu_source_type(alu, 0), + alu->op == nir_op_f2f16_rtz ? JAY_RTZ : JAY_RNE, 0); + break; + + case nir_op_fsat: + jay_MODIFIER(b, type, dst, src[0])->saturate = true; + break; + + case nir_op_fneg: + case nir_op_ineg: + jay_MODIFIER(b, type, dst, jay_negate(src[0])); + break; + + case nir_op_fabs: + case nir_op_iabs: + jay_MODIFIER(b, type, dst, jay_abs(src[0])); + break; + + case nir_op_iadd3: + jay_ADD3(b, type, dst, src[0], src[1], src[2]); + break; + + case nir_op_uadd_sat: + case nir_op_iadd_sat: + jay_ADD(b, type, dst, src[0], src[1])->saturate = true; + break; + + case nir_op_usub_sat: + case nir_op_isub_sat: + jay_ADD(b, type, dst, src[0], jay_negate(src[1]))->saturate = true; + break; + + case nir_op_ihadd: + case nir_op_uhadd: { + /* AVG(x, y) - ((x ^ y) & 1) */ + jay_def avg = jay_alloc_def(b, dst.file, 1); + jay_def bfn = jay_alloc_def(b, dst.file, 1); + jay_AVG(b, type, avg, src[0], src[1]); + jay_BFN(b, bfn, 1, src[0], src[1], UTIL_LUT3(a & (b ^ c))); + jay_ADD(b, type, dst, avg, jay_negate(bfn)); + break; + } + + case nir_op_unpack_64_2x32_split_x: + jay_MOV(b, dst, jay_extract(src[0], 0)); + break; + case nir_op_unpack_64_2x32_split_y: + jay_MOV(b, dst, jay_extract(src[0], 1)); + break; + case nir_op_unpack_32_2x16_split_y: + jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U16, JAY_ROUND, 1); + break; + + case nir_op_pack_32_4x8_split: { + /* TODO: Optimize */ + jay_def r = jay_BFI2_u32(b, 0x0000ff00, src[1], src[0]); + r = jay_BFI2_u32(b, 0x00ff0000, src[2], r); + jay_BFI2(b, dst, 0xff000000, src[3], r); + break; + } + + case nir_op_pack_32_2x16_split: + /* TODO: Optimize */ + jay_BFI2(b, dst, 0xffff0000, src[1], src[0]); + break; + + case nir_op_pack_64_2x32_split: + jay_MOV(b, jay_extract(dst, 0), src[0]); + jay_MOV(b, jay_extract(dst, 1), src[1]); + break; + + case nir_op_bitfield_select: + assert(jay_type_size_bits(type) <= 32); + jay_BFN(b, dst, src[0], src[1], src[2], UTIL_LUT3((a & b) | (~a & c))); + break; + + case nir_op_ubfe: + case nir_op_ibfe: + jay_BFE(b, type, dst, src[0], src[1], src[2]); + break; + case nir_op_bfi: + jay_BFI2(b, dst, src[0], src[1], src[2]); + break; + + case nir_op_ffma: + jay_MAD(b, type, dst, src[0], src[1], src[2]); + break; + + case nir_op_fcsel: + jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod = + JAY_CONDITIONAL_NE; + break; + + case nir_op_fcsel_gt: + case 
nir_op_i32csel_gt: + jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod = + JAY_CONDITIONAL_GT; + break; + + case nir_op_fcsel_ge: + case nir_op_i32csel_ge: + jay_CSEL(b, type, dst, src[1], src[2], src[0])->conditional_mod = + JAY_CONDITIONAL_GE; + break; + + case nir_op_bcsel: + assert(alu->def.bit_size < 64); + assert(jay_is_flag(src[0])); + + /* b2i8 gets lowered into 8-bit csel. Just use the upper bits garbage + * convention to implement with SEL.u16 instead. + */ + if (type == JAY_TYPE_U8) { + type = JAY_TYPE_U16; + } + + jay_SEL(b, type, dst, src[1], src[2], src[0]); + break; + + case nir_op_extract_u8: + jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U8, JAY_ROUND, + nir_alu_src_as_uint(alu->src[1])); + break; + + case nir_op_extract_i8: + jay_CVT(b, JAY_TYPE_S32, dst, src[0], JAY_TYPE_S8, JAY_ROUND, + nir_alu_src_as_uint(alu->src[1])); + break; + + case nir_op_extract_u16: + jay_CVT(b, JAY_TYPE_U32, dst, src[0], JAY_TYPE_U16, JAY_ROUND, + nir_alu_src_as_uint(alu->src[1])); + break; + + case nir_op_extract_i16: + jay_CVT(b, JAY_TYPE_S32, dst, src[0], JAY_TYPE_S16, JAY_ROUND, + nir_alu_src_as_uint(alu->src[1])); + break; + + default: + if (nir_op_is_vec(alu->op)) { + for (unsigned i = 0; i < nir_op_infos[alu->op].num_inputs; i++) { + unsigned len = jay_type_vector_length(type); + jay_copy(b, jay_extract_range(dst, len * i, len), src[i]); + } + + break; + } + + nir_print_instr(&alu->instr, stderr); + fprintf(stderr, "\n"); + UNREACHABLE("unhandled instruction"); + } +} + +static void +jay_emit_load_const(struct nir_to_jay_state *nj, nir_load_const_instr *lc) +{ + jay_builder *b = &nj->bld; + jay_def dst = nj_def(&lc->def); + assert(lc->def.num_components == 1 && "must be scalarized"); + + if (lc->def.bit_size == 64 && lc->value[0].u64 >> 32) { + jay_MOV_IMM64(b, dst, lc->value[0].u64); + } else { + jay_MOV(b, dst, lc->value[0].u32); + } +} + +static jay_def +jay_resource_handle(jay_builder *b, + nir_src *nsrc, + unsigned *bti_const, + bool *internal, + bool *bindless) +{ + if (!nsrc) { + return jay_null(); + } + + nir_intrinsic_instr *rin = nir_src_as_intrinsic(*nsrc); + + if (nir_src_is_const(*nsrc)) { + *bti_const = nir_src_as_uint(*nsrc); + return jay_null(); + } else if (!rin || rin->intrinsic != nir_intrinsic_resource_intel) { + return nj_src(*nsrc); + } + + uint32_t flags = nir_intrinsic_resource_access_intel(rin); + if (internal) { + *internal = !!(flags & nir_resource_intel_internal); + } + if (bindless) { + *bindless = !!(flags & nir_resource_intel_bindless); + } + + if (nir_src_is_const(rin->src[1])) { + *bti_const = nir_src_as_uint(rin->src[1]); + return jay_null(); + } else { + return nj_src(rin->src[1]); + } +} + +static inline enum lsc_flush_type +translate_flush_type(nir_intrinsic_instr *intr) +{ + switch (nir_intrinsic_memory_semantics(intr)) { + case NIR_MEMORY_ACQUIRE: + return LSC_FLUSH_TYPE_INVALIDATE; + case NIR_MEMORY_RELEASE: + return LSC_FLUSH_TYPE_CLEAN; + case NIR_MEMORY_ACQ_REL: + return LSC_FLUSH_TYPE_EVICT; + case NIR_MEMORY_MAKE_AVAILABLE: + case NIR_MEMORY_MAKE_VISIBLE: + default: + UNREACHABLE("unexpected memory semantic"); + } +} + +static void +emit_lsc_fence(struct nir_to_jay_state *nj, + nir_intrinsic_instr *intr, + enum brw_sfid sfid) +{ + bool device = nir_intrinsic_memory_scope(intr) >= SCOPE_QUEUE_FAMILY; + enum lsc_fence_scope scope = device ? LSC_FENCE_TILE : LSC_FENCE_THREADGROUP; + enum lsc_flush_type type = + sfid == BRW_SFID_SLM ? 
LSC_FLUSH_TYPE_NONE : translate_flush_type(intr); + + jay_def notif = jay_alloc_def(&nj->bld, UGPR, jay_ugpr_per_grf(nj->s)); + uint32_t desc = lsc_fence_msg_desc(nj->s->devinfo, scope, type, false); + + jay_SEND(&nj->bld, .sfid = sfid, .msg_desc = desc, .srcs = &nj->payload.u0, + .nr_srcs = 1, .type = JAY_TYPE_U32, .uniform = true, .dst = notif); +} + +static void +jay_emit_memory_barrier(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) +{ + nir_variable_mode modes = nir_intrinsic_memory_modes(intr); + + jay_SYNC(&nj->bld, TGL_SYNC_ALLWR); + + if (modes & nir_var_image) { + emit_lsc_fence(nj, intr, BRW_SFID_TGM); + assert(!nj->nir->info.use_lowered_image_to_global && "fix common code"); + } + + if (modes & (nir_var_mem_ssbo | nir_var_mem_global)) { + emit_lsc_fence(nj, intr, BRW_SFID_UGM); + } + + if (modes & (nir_var_shader_out | nir_var_mem_task_payload)) { + emit_lsc_fence(nj, intr, BRW_SFID_URB); + } + + if ((modes & nir_var_mem_shared) && + !jay_workgroup_is_one_subgroup(&nj->bld, nj->nir)) { + emit_lsc_fence(nj, intr, BRW_SFID_SLM); + } +} + +static void +jay_emit_signal_barrier(struct nir_to_jay_state *nj) +{ + jay_builder *b = &nj->bld; + + /* Signal barrier / Active threads only (BSpec 72052). + * + * Source 0 is the number of subgroups in [31:24], which comes from the u0.2 + * payload in [31:24]. Mask out the other bits, then replicate to [23:15]. + * + * TODO: This can be done faster with a SIMD2 8-bit move. + */ + jay_def a = jay_AND_u32(b, jay_extract(nj->payload.u0, 2), 0xff000000); + jay_def m2 = jay_OR_u32(b, a, jay_SHR_u32(b, a, 8)); + + /* Use an active threads only barrier. TODO: I think we can optimize. */ + if (b->shader->devinfo->ver >= 20) { + m2 = jay_OR_u32(b, m2, BITFIELD_BIT(8)); + } + + uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 }; + indices[2] = jay_index(m2); + jay_def zipped = jay_collect(b, UGPR, indices, 3); + + jay_SEND(b, .sfid = BRW_SFID_MESSAGE_GATEWAY, + .msg_desc = BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG, .srcs = &zipped, + .nr_srcs = 1, .type = JAY_TYPE_U32, .uniform = true); + + jay_SYNC(b, TGL_SYNC_BAR); +} + +static void +jay_emit_derivative(jay_builder *b, + jay_def dst, + nir_intrinsic_instr *intr, + enum jay_quad_swizzle swz0, + enum jay_quad_swizzle swz1) +{ + assert(intr->def.bit_size == 32 && "todo"); + jay_def val = nj_src(intr->src[0]); + + jay_ADD(b, JAY_TYPE_F32, dst, jay_QUAD_SWIZZLE_u32(b, val, swz1), + jay_negate(jay_QUAD_SWIZZLE_u32(b, val, swz0))); +} + +static void +jay_emit_fb_write(jay_builder *b, nir_intrinsic_instr *intr) +{ + jay_def data = nj_src(intr->src[0]); + jay_def srcs[8]; + + /* Optimize unconditional discards. Should probably do this in NIR. */ + bool trivial = + nir_src_is_const(intr->src[2]) && nir_src_as_bool(intr->src[2]); + + for (unsigned i = 0; i < nir_src_num_components(intr->src[0]); ++i) { + srcs[i] = trivial ? jay_INDETERMINATE_u32(b) : + jay_as_gpr(b, jay_extract(data, i)); + } + + jay_inst *send = + jay_SEND(b, .sfid = BRW_SFID_RENDER_CACHE, .check_tdr = true, + .msg_desc = nir_scalar_as_uint(nir_scalar_chase_movs( + nir_get_scalar(intr->src[1].ssa, 0))) | + (nir_scalar_as_uint(nir_scalar_chase_movs( + nir_get_scalar(intr->src[1].ssa, 1))) + << 32), + .srcs = srcs, .nr_srcs = nir_src_num_components(intr->src[0]), + .type = JAY_TYPE_U32, .eot = nir_intrinsic_eot(intr)); + + /* Handle the disable predicate. It is logically inverted. 
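+ * Predicating the SEND on !src[2] below makes the write execute only for lanes whose disable bit is clear.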
*/ + if (!nir_src_is_const(intr->src[2]) || nir_src_as_bool(intr->src[2])) { + jay_add_predicate(b, send, jay_negate(nj_src(intr->src[2]))); + } +} + +static enum lsc_data_size +lsc_bits_to_data_size(unsigned bit_size) +{ + /* clang-format off */ + switch (bit_size / 8) { + case 1: return LSC_DATA_SIZE_D8U32; + case 2: return LSC_DATA_SIZE_D16U32; + case 4: return LSC_DATA_SIZE_D32; + case 8: return LSC_DATA_SIZE_D64; + default: UNREACHABLE("Unsupported data size."); + } + /* clang-format on */ +} + +static enum lsc_opcode +lsc_op_for_atomic(nir_atomic_op op) +{ + /* clang-format off */ + switch (op) { + case nir_atomic_op_iadd: return LSC_OP_ATOMIC_ADD; + case nir_atomic_op_imin: return LSC_OP_ATOMIC_MIN; + case nir_atomic_op_umin: return LSC_OP_ATOMIC_UMIN; + case nir_atomic_op_imax: return LSC_OP_ATOMIC_MAX; + case nir_atomic_op_umax: return LSC_OP_ATOMIC_UMAX; + case nir_atomic_op_iand: return LSC_OP_ATOMIC_AND; + case nir_atomic_op_ior: return LSC_OP_ATOMIC_OR; + case nir_atomic_op_ixor: return LSC_OP_ATOMIC_XOR; + case nir_atomic_op_xchg: return LSC_OP_ATOMIC_STORE; + case nir_atomic_op_cmpxchg: return LSC_OP_ATOMIC_CMPXCHG; + case nir_atomic_op_fmin: return LSC_OP_ATOMIC_FMIN; + case nir_atomic_op_fmax: return LSC_OP_ATOMIC_FMAX; + case nir_atomic_op_fcmpxchg: return LSC_OP_ATOMIC_FCMPXCHG; + case nir_atomic_op_fadd: return LSC_OP_ATOMIC_FADD; + default: UNREACHABLE("Unsupported NIR atomic"); + } + /* clang-format on */ +} + +static jay_def +jay_src_as_strided(jay_builder *b, + jay_def x, + unsigned element_sz, + enum jay_file dst_file) +{ + if (dst_file == UGPR) { + assert(jay_is_uniform(x) && "Uniform dests require uniform sources"); + + if (x.file != UGPR) { + jay_def tmp = jay_alloc_def(b, UGPR, jay_num_values(x)); + jay_copy(b, tmp, x); + x = tmp; + } + + uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 }; + unsigned nr = jay_num_values(x) * jay_ugpr_per_grf(b->shader); + assert(nr < ARRAY_SIZE(indices)); + + for (unsigned i = 0; i < jay_num_values(x) / element_sz; ++i) { + for (unsigned j = 0; j < element_sz; ++j) { + indices[(i * jay_ugpr_per_grf(b->shader)) + j] = + jay_channel(x, (i * element_sz) + j); + } + } + + return jay_collect(b, UGPR, indices, nr); + } else { + /* Could be a GPR or UGPR source */ + assert(dst_file == GPR); + return jay_as_gpr(b, x); + } +} + +static jay_def +jay_scratch_surface(struct nir_to_jay_state *nj) +{ + if (jay_is_null(nj->payload.scratch_surface)) { + jay_function *func = nj->f; + assert(func->is_entrypoint && "todo: this needs ABI"); + + jay_builder b = jay_init_builder(func, jay_before_function(func)); + nj->payload.scratch_surface = jay_alloc_def(&b, J_ADDRESS, 1); + + jay_def u0_5 = jay_extract(nj->payload.u0, 5); + jay_def state = jay_AND_u32(&b, u0_5, ~BITFIELD_MASK(10)); + jay_SHR(&b, JAY_TYPE_U32, nj->payload.scratch_surface, state, 4); + } + + return nj->payload.scratch_surface; +} + +static void +jay_emit_mem_access(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) +{ + jay_builder *b = &nj->bld; + bool slm = nir_is_shared_access(intr); + bool tgm = nir_intrinsic_has_image_dim(intr); + bool urb = intr->intrinsic == nir_intrinsic_store_urb_lsc_intel || + intr->intrinsic == nir_intrinsic_store_urb_vec4_intel; + enum brw_sfid sfid = slm ? BRW_SFID_SLM : + tgm ? BRW_SFID_TGM : + urb ? 
BRW_SFID_URB : + BRW_SFID_UGM; + + nir_src *data_src = nir_get_io_data_src(intr); + bool scratch = intr->intrinsic == nir_intrinsic_load_scratch_intel || + intr->intrinsic == nir_intrinsic_store_scratch_intel; + + enum lsc_opcode op; + if (nir_intrinsic_has_atomic_op(intr)) + op = lsc_op_for_atomic(nir_intrinsic_atomic_op(intr)); + else if (sfid == BRW_SFID_TGM) + op = data_src ? LSC_OP_STORE_CMASK : LSC_OP_LOAD_CMASK; + else + op = data_src ? LSC_OP_STORE : LSC_OP_LOAD; + + nir_src *bti = nir_get_io_index_src(intr), *ubo = NULL; + nir_src *offset_src = tgm ? &intr->src[1] : nir_get_io_offset_src(intr); + + if (intr->intrinsic == nir_intrinsic_load_ubo || + intr->intrinsic == nir_intrinsic_load_ubo_uniform_block_intel) { + ubo = bti; + bti = NULL; + b->shader->prog_data->base.has_ubo_pull = true; + } + + const struct intel_device_info *devinfo = b->shader->devinfo; + bool has_dest = nir_intrinsic_infos[intr->intrinsic].has_dest; + jay_def data = data_src ? nj_src(*data_src) : jay_null(); + unsigned bti_const = 0; + bool internal = false; + bool bindless = false; + jay_def bti_indirect = + jay_resource_handle(b, bti ?: ubo, &bti_const, &internal, &bindless); + jay_def offset = nj_src(*offset_src); + nir_def *ndata = data_src ? data_src->ssa : &intr->def; + jay_def dst = has_dest ? nj_def(&intr->def) : jay_null(); + int32_t base_offset = + nir_intrinsic_has_base(intr) ? nir_intrinsic_base(intr) : 0; + + /* Optimize increment/decrement */ + if (op == LSC_OP_ATOMIC_ADD && nir_src_is_const(*data_src)) { + int64_t add_val = nir_src_as_int(*data_src); + if (add_val == 1 || add_val == -1) { + op = add_val == 1 ? LSC_OP_ATOMIC_INC : LSC_OP_ATOMIC_DEC; + data = jay_null(); + } + } + + /* Pack the coordinates. TODO: MSAA */ + if (tgm) { + unsigned nr = nir_image_intrinsic_coord_components(intr); + offset = jay_extract_range(offset, 0, nr); + } + + internal |= scratch; + enum lsc_addr_surface_type surf_type = internal ? LSC_ADDR_SURFTYPE_SS : + bindless ? LSC_ADDR_SURFTYPE_BSS : + (bti || ubo) ? LSC_ADDR_SURFTYPE_BTI : + LSC_ADDR_SURFTYPE_FLAT; + + bool a64 = surf_type == LSC_ADDR_SURFTYPE_FLAT && sfid == BRW_SFID_UGM; + enum lsc_addr_size addr_size = a64 ? LSC_ADDR_SIZE_A64 : LSC_ADDR_SIZE_A32; + enum jay_type offset_type = a64 ? JAY_TYPE_U64 : JAY_TYPE_U32; + + bool cmask = op == LSC_OP_LOAD_CMASK || op == LSC_OP_STORE_CMASK; + bool uniform = !(has_dest && dst.file != UGPR); + + if (nir_intrinsic_has_align(intr)) { + assert(nir_intrinsic_align(intr) >= (ndata->bit_size / 8)); + } + + if (!has_dest) { + uniform &= jay_is_null(data) || data.file == UGPR; + uniform &= jay_is_null(offset) || offset.file == UGPR; + uniform &= !(cmask || urb); + } + + /* Per bspec 57330, 8-bit/16-bit are not supported for transpose */ + bool transpose = uniform && !cmask && ndata->bit_size >= 32; + bool scalar_uniform = uniform && !cmask && ndata->bit_size < 32; + + if (!uniform) { + offset = jay_as_gpr(b, offset); + } else if (!transpose) { + offset = jay_src_as_strided(b, offset, a64 ? 2 : 1, UGPR); + } + + if (!jay_is_null(data) && !transpose && !scalar_uniform) + data = jay_as_gpr(b, data); + + unsigned access = + nir_intrinsic_has_access(intr) ? nir_intrinsic_access(intr) : 0; + + bool volatile_access = access & ACCESS_VOLATILE; + bool coherent_access = access & ACCESS_COHERENT; + + /* Bspec: Atomic instruction -> Cache section: + * + * Atomic messages are always forced to "un-cacheable" in the L1 + * cache. 
+ * + * Bspec: Overview of memory Access: + * + * If a read from a Null tile gets a cache-hit in a virtually-addressed + * GPU cache, then the read may not return zeroes. + * + * If a shader writes to a null tile and wants to be able to read it back + * as zero, it will use the 'volatile' decoration for the access, otherwise + * the compiler may choose to optimize things out, breaking the + * residencyNonResidentStrict guarantees. Due to the above, we need to make + * these operations uncached. + */ + unsigned cache = + urb ? LSC_CACHE(devinfo, STORE, L1UC_L3UC) : + lsc_opcode_is_atomic(op) ? + LSC_CACHE(devinfo, STORE, L1UC_L3WB) : + volatile_access ? + (devinfo->ver >= 20 ? + /* Xe2 has a better L3 that can deal with null tiles.*/ + (!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3WB) : + LSC_CACHE(devinfo, LOAD, L1UC_L3C)) : + /* On older platforms, all caches have to be bypassed. */ + (!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3UC) : + LSC_CACHE(devinfo, LOAD, L1UC_L3UC))) : + /* Skip L1 for coherent accesses */ + coherent_access ? (!has_dest ? LSC_CACHE(devinfo, STORE, L1UC_L3WB) : + LSC_CACHE(devinfo, LOAD, L1UC_L3C)) : + !has_dest ? LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS) : + LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS); + + unsigned max_imm_bits = brw_max_immediate_offset_bits(surf_type); + assert(base_offset >= u_intN_min(max_imm_bits)); + assert(base_offset <= u_intN_max(max_imm_bits)); + assert(base_offset == 0 || sfid != BRW_SFID_TGM); + + const unsigned base_offs_bits = + util_bitpack_sint(base_offset, 0, max_imm_bits - 1); + + unsigned nr = ndata->num_components; + uint64_t desc = + lsc_msg_desc(devinfo, op, surf_type, addr_size, + lsc_bits_to_data_size(ndata->bit_size), + cmask ? BITFIELD_MASK(nr) : nr, transpose, cache); + + jay_def tmp = dst; + + if (dst.file == UGPR) { + if (transpose) { + /* Transpose writes whole GRFs, so round up */ + tmp = jay_alloc_def(b, UGPR, + ALIGN_POT(jay_num_values(dst), + jay_ugpr_per_grf(b->shader))); + } else { + /* Without transpose we write at GRF granularity. Pad out. */ + tmp = jay_alloc_def(b, UGPR, + jay_ugpr_per_grf(b->shader) * jay_num_values(dst)); + } + } + + jay_def srcs[] = { offset, data }; + + /* Second data source immediately follows the first */ + if (op == LSC_OP_ATOMIC_CMPXCHG || op == LSC_OP_ATOMIC_FCMPXCHG) { + jay_def data2 = nj_src(*(data_src + 1)); + + if (!transpose) { + data2 = jay_as_gpr(b, data2); + } + + srcs[1] = jay_collect_two(b, data, data2); + } + + jay_def ex_desc = jay_null(); + uint32_t ex_desc_imm = 0; + if (scratch) { + ex_desc = jay_scratch_surface(nj); + + if (has_dest) { + b->shader->fills++; + } else { + b->shader->spills++; + } + } else if (surf_type == LSC_ADDR_SURFTYPE_FLAT) { + desc |= ((uint64_t) lsc_flat_ex_desc(devinfo, base_offs_bits) << 32); + } else if (jay_is_null(bti_indirect)) { + desc |= + ((uint64_t) lsc_bti_ex_desc(devinfo, bti_const, base_offs_bits) << 32); + } else if (!jay_is_null(bti_indirect)) { + ex_desc = bti_indirect; + + if (surf_type == LSC_ADDR_SURFTYPE_SS || + surf_type == LSC_ADDR_SURFTYPE_BSS) { + ex_desc_imm = SET_BITS(GET_BITS(base_offs_bits, 16, 4), 31, 19) | + SET_BITS(GET_BITS(base_offs_bits, 3, 0), 15, 12); + } else { + /* TODO: Move the SHL to NIR for CSE? */ + assert(surf_type == LSC_ADDR_SURFTYPE_BTI); + assert(base_offs_bits == 0); + ex_desc = jay_SHL_u32(b, bti_indirect, 24); + } + } + + enum jay_type data_type = jay_type(JAY_TYPE_U, MAX2(ndata->bit_size, 32)); + jay_SEND(b, .sfid = sfid, .msg_desc = desc, .srcs = srcs, + .nr_srcs = jay_is_null(data) ? 
1 : 2, .dst = tmp, .type = data_type, + .src_type = { offset_type, data_type }, .uniform = uniform, + .bindless = surf_type == LSC_ADDR_SURFTYPE_BSS, .ex_desc = ex_desc, + .ex_desc_imm = ex_desc_imm); + + if (has_dest && !jay_defs_equivalent(tmp, dst)) { + jay_copy_strided(b, dst, tmp, !transpose); + } +} + +static void +jay_emit_barycentric(struct nir_to_jay_state *nj, + nir_intrinsic_instr *intr, + enum intel_barycentric_mode mode) +{ + assert(nj->s->stage == MESA_SHADER_FRAGMENT); + enum glsl_interp_mode glsl_mode = nir_intrinsic_interp_mode(intr); + + if (glsl_mode == INTERP_MODE_NOPERSPECTIVE) { + mode += INTEL_BARYCENTRIC_NONPERSPECTIVE_PIXEL; + } else { + assert(glsl_mode == INTERP_MODE_SMOOTH); + } + + jay_copy(&nj->bld, nj_def(&intr->def), nj->payload.fs.bary[mode]); +} + +static void +jay_emit_intrinsic(struct nir_to_jay_state *nj, nir_intrinsic_instr *intr) +{ + jay_shader *s = nj->s; + jay_function *f = nj->f; + jay_builder *b = &nj->bld; + jay_cs_payload *cs = + mesa_shader_stage_is_compute(s->stage) ? &nj->payload.cs : NULL; + + const bool has_dest = nir_intrinsic_infos[intr->intrinsic].has_dest; + jay_def dst = has_dest ? nj_def(&intr->def) : jay_null(); + + switch (intr->intrinsic) { + case nir_intrinsic_resource_intel: + /* No code to generate here */ + break; + + case nir_intrinsic_global_atomic: + case nir_intrinsic_global_atomic_swap: + case nir_intrinsic_image_atomic: + case nir_intrinsic_image_atomic_swap: + case nir_intrinsic_image_load: + case nir_intrinsic_image_store: + case nir_intrinsic_load_global: + case nir_intrinsic_load_global_constant: + case nir_intrinsic_load_global_constant_uniform_block_intel: + case nir_intrinsic_load_scratch_intel: + case nir_intrinsic_load_shared: + case nir_intrinsic_load_shared_uniform_block_intel: + case nir_intrinsic_load_ssbo: + case nir_intrinsic_load_ssbo_intel: + case nir_intrinsic_load_ssbo_uniform_block_intel: + case nir_intrinsic_load_ubo: + case nir_intrinsic_load_ubo_uniform_block_intel: + case nir_intrinsic_shared_atomic: + case nir_intrinsic_shared_atomic_swap: + case nir_intrinsic_ssbo_atomic: + case nir_intrinsic_ssbo_atomic_swap: + case nir_intrinsic_store_global: + case nir_intrinsic_store_urb_lsc_intel: + case nir_intrinsic_store_scratch_intel: + case nir_intrinsic_store_shared: + case nir_intrinsic_store_ssbo: + case nir_intrinsic_store_ssbo_intel: + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_bindless_image_atomic: + case nir_intrinsic_bindless_image_atomic_swap: + jay_emit_mem_access(nj, intr); + break; + + case nir_intrinsic_load_push_data_intel: { + unsigned sz = intr->def.bit_size / 8; + unsigned base_offset = nir_intrinsic_base(intr); + assert(util_is_aligned(base_offset, sz)); + + if (nir_src_is_const(intr->src[0])) { + unsigned load_offset = nir_src_as_uint(intr->src[0]); + unsigned offs = base_offset + load_offset; + assert(util_is_aligned(load_offset, sz)); + + if (sz >= 4) { + jay_foreach_comp(dst, c) { + jay_MOV(b, jay_extract(dst, c), + nj->payload.push_data[(offs / 4) + c]); + } + } else { + jay_foreach_comp(dst, c) { + unsigned comp_offs = offs + c * sz; + if (util_is_aligned(comp_offs, 4)) { + jay_MOV(b, jay_extract(dst, c), + nj->payload.push_data[comp_offs / 4]); + } else { + jay_CVT(b, JAY_TYPE_U32, jay_extract(dst, c), + nj->payload.push_data[comp_offs / 4], + JAY_TYPE_U | intr->def.bit_size, JAY_ROUND, + (comp_offs % 4) / sz); + } + } + } + } else { + UNREACHABLE("todo: indirect push data"); + } + break; + } + + case 
nir_intrinsic_barrier: + if (nir_intrinsic_memory_scope(intr) != SCOPE_NONE) { + jay_emit_memory_barrier(nj, intr); + } + + if (cs) { + if (nir_intrinsic_execution_scope(intr) == SCOPE_WORKGROUP) { + if (jay_workgroup_is_one_subgroup(b, nj->nir)) { + // XXX: when we have a scheduler, jay_SCHEDULE_BARRIER(b); + } else { + jay_emit_signal_barrier(nj); + s->prog_data->cs.uses_barrier = true; + } + } + } else { + // XXX: when we have a scheduler, jay_SCHEDULE_BARRIER(b); + } + break; + + case nir_intrinsic_begin_invocation_interlock: + case nir_intrinsic_end_invocation_interlock: + UNREACHABLE("TODO"); + + case nir_intrinsic_load_reloc_const_intel: + jay_RELOC(b, dst, nir_intrinsic_param_idx(intr), + nir_intrinsic_base(intr)); + break; + + case nir_intrinsic_store_render_target_intel: + assert(nj->nir->info.stage == MESA_SHADER_FRAGMENT); + jay_emit_fb_write(b, intr); + break; + + case nir_intrinsic_shader_clock: + /* We must access the timestamp register atomically, but 64-bit + * instructions cannot read ARF. Instead use a 2x32-bit vectorized move. + */ + assert(dst.file == UGPR && "required for vectorization"); + jay_MOV(b, dst, jay_contiguous_def(J_ARF, JAY_ARF_TIMESTAMP, 2))->type = + JAY_TYPE_U32; + break; + + case nir_intrinsic_load_sample_mask_in: { + jay_def mask = jay_extract(nj->payload.u0, 15); + + if (nj->s->dispatch_width == 32) { + /* TODO: Optimize */ + jay_def hi = jay_extract(nj->payload.u1, 15); + mask = jay_BFI2_u32(b, 0xffff0000, hi, mask); + } + + jay_MOV(b, dst, mask); + break; + } + + case nir_intrinsic_load_subgroup_invocation: + /* TODO: Lower this in NIR? */ + jay_CVT(b, JAY_TYPE_U32, dst, nj->payload.lane_id, JAY_TYPE_U16, + JAY_ROUND, 0); + break; + + case nir_intrinsic_demote: + case nir_intrinsic_demote_if: + /* TODO: Already lowered, but need to implement for performance. */ + break; + + case nir_intrinsic_ddx: + case nir_intrinsic_ddx_coarse: + jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXXX, + JAY_QUAD_SWIZZLE_YYYY); + break; + case nir_intrinsic_ddx_fine: + jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXZZ, + JAY_QUAD_SWIZZLE_YYWW); + break; + + case nir_intrinsic_ddy: + case nir_intrinsic_ddy_coarse: + jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XXXX, + JAY_QUAD_SWIZZLE_ZZZZ); + break; + case nir_intrinsic_ddy_fine: + jay_emit_derivative(b, dst, intr, JAY_QUAD_SWIZZLE_XYXY, + JAY_QUAD_SWIZZLE_ZWZW); + break; + + case nir_intrinsic_first_invocation: + jay_MOV(b, dst, emit_active_lane(nj)); + break; + + case nir_intrinsic_read_first_invocation: + jay_MOV(b, dst, emit_uniformize(nj, nj_src(intr->src[0]))); + break; + + case nir_intrinsic_ballot: + case nir_intrinsic_ballot_relaxed: { + jay_def val = nj_src(intr->src[0]); + if (nir_src_is_const(intr->src[0]) && nir_src_as_bool(intr->src[0])) { + val = emit_active_lane_mask(nj); + } else if (val.file == UFLAG) { + /* Move to a FLAG temporary so we can ballot it. */ + val = jay_MOV(b, jay_alloc_def(b, FLAG, 1), val)->dst; + } else { + assert(val.file == FLAG); + } + + assert(intr->def.bit_size == b->shader->dispatch_width); + jay_MOV(b, dst, val); + break; + } + + /* We prefer to inverse_ballot by copying a UGPR to the flag. If we have a + * GPR input, we could uniformize (as behaviour is undefined for + * non-uniform inputs) but a lowered bit extract is cheaper than uniformize. 
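+ * Concretely, the GPR path below computes ((x >> lane_id) & 1) != 0 per lane and writes the result straight to the flag via a conditional mod.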
+ */ + case nir_intrinsic_inverse_ballot: { + assert(dst.file == FLAG); + jay_def x = nj_src(intr->src[0]); + if (x.file == GPR) { + jay_def shr = jay_SHR_u32(b, x, nj->payload.lane_id); + jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(), shr, 1); + jay_set_conditional_mod(b, and, dst, JAY_CONDITIONAL_NE); + } else { + jay_MOV(b, dst, x)->type = JAY_TYPE_U | b->shader->dispatch_width; + } + + break; + } + + case nir_intrinsic_load_local_invocation_id: + assert(cs); + UNREACHABLE("todo: implement me from payload"); + jay_copy(b, dst, cs->local_invocation_ids); + break; + + case nir_intrinsic_load_barycentric_pixel: + jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL); + break; + + case nir_intrinsic_load_barycentric_sample: + jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE); + break; + + case nir_intrinsic_load_barycentric_centroid: + jay_emit_barycentric(nj, intr, INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID); + break; + + case nir_intrinsic_load_pixel_coord_intel: + jay_MOV(b, dst, nj->payload.fs.coord.xy); + break; + + case nir_intrinsic_load_frag_coord_z: + jay_MOV(b, dst, nj->payload.fs.coord.z); + break; + + case nir_intrinsic_load_frag_coord_w_rcp: + jay_MOV(b, dst, nj->payload.fs.coord.w); + break; + + case nir_intrinsic_load_urb_output_handle_intel: + jay_MOV(b, dst, nj->payload.urb_handle); + break; + + case nir_intrinsic_load_layer_id: + jay_EXTRACT_LAYER(b, dst, jay_extract(nj->payload.u0, 9), + payload_u1(nj, 9, 1)); + break; + + case nir_intrinsic_load_front_face: { + /* Bit 11 is facingness for the first polygon. TODO: Multipolygon. */ + jay_inst *and = jay_AND(b, JAY_TYPE_U32, jay_null(), + jay_extract(nj->payload.u0, 9), BITFIELD_BIT(11)); + + /* The bit is actually backfacingness so check for equality with 0 */ + jay_set_conditional_mod(b, and, dst, JAY_CONDITIONAL_EQ); + break; + } + + /* Sample ID comes in as 4-bit numbers in g1.0: + * + * 15:12 Slot 3 SampleID + * 11:8 Slot 2 SampleID + * 7:4 Slot 1 SampleID + * 3:0 Slot 0 SampleID + * + * Each slot corresponds to four channels, so we want to replicate each + * half-byte value to 4 channels in a row: + * + * dst+0: .7 .6 .5 .4 .3 .2 .1 .0 + * 7:4 7:4 7:4 7:4 3:0 3:0 3:0 3:0 + * + * dst+1: .7 .6 .5 .4 .3 .2 .1 .0 + * 15:12 15:12 15:12 15:12 11:8 11:8 11:8 11:8 + * + * First, we read g1.0 with a <1,8,0>UB region, causing the first 8 + * channels to read the first byte (7:0), and the second group of 8 + * channels to read the second byte (15:8). Then, we shift right by + * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3 + * values into place. Finally, we AND with 0xf to keep the low nibble. + * + * According to the "PS Thread Payload for Normal Dispatch" + * pages on the BSpec, the sample ids are stored in R0.8/R1.8 + * on gfx20+ and in R1.0/R2.0 on gfx8+. 
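+ * Put differently, channel c across dst+0/dst+1 ends up with nibble c/4 of the packed halfword, i.e. (packed >> (4 * (c / 4))) & 0xf.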
+ */ + case nir_intrinsic_load_sample_id: { + jay_def x = jay_alloc_def(b, GPR, 1); + jay_EXTRACT_BYTE_PER_8LANES(b, x, jay_extract(nj->payload.u0, 8), + payload_u1(nj, 8, 1)); + jay_AND_U32_U16(b, dst, jay_SHR_ODD_SUBSPANS_BY_4_u16(b, x), 0xF); + break; + } + + case nir_intrinsic_load_input: + if (s->stage == MESA_SHADER_VERTEX) { + unsigned offs = nir_intrinsic_base(intr) * 4; + offs += nir_intrinsic_component(intr); + assert(intr->def.bit_size == 32 && "todo"); + + jay_copy(b, dst, + jay_collect_vectors(b, nj->payload.vs.attributes + offs, + intr->def.num_components)); + break; + } + + FALLTHROUGH; + case nir_intrinsic_load_fs_input_interp_deltas: { + assert(s->stage == MESA_SHADER_FRAGMENT); + unsigned location = nir_intrinsic_io_semantics(intr).location + + nir_src_as_uint(intr->src[0]); + unsigned i = (s->prog_data->fs.urb_setup[location] * 4) + + nir_intrinsic_component(intr); + + if (intr->intrinsic == nir_intrinsic_load_input) { + assert(intr->def.num_components == 1 && "should be scalarized"); + } + + /* Zeroth delta is the flat value */ + jay_copy(b, dst, nj->payload.fs.deltas[i]); + break; + } + + case nir_intrinsic_load_subgroup_id: + assert(cs && f->is_entrypoint && "todo: this needs ABI"); + /* Subgroup ID in Thread Group is u0.2 bits 7:0 */ + jay_AND(b, JAY_TYPE_U32, dst, jay_extract(nj->payload.u0, 2), 0xFF); + break; + + case nir_intrinsic_load_num_subgroups: + assert(cs && f->is_entrypoint && "todo: this needs ABI"); + /* Number of subgroups in Thread Group is u0.2 bits 31:24 */ + jay_SHR(b, JAY_TYPE_U32, dst, jay_extract(nj->payload.u0, 2), 24); + break; + + case nir_intrinsic_load_workgroup_id: + assert(cs && f->is_entrypoint && "todo: this needs ABI"); + jay_MOV(b, jay_extract(dst, 0), jay_extract(nj->payload.u0, 1)); + jay_MOV(b, jay_extract(dst, 1), jay_extract(nj->payload.u0, 6)); + jay_MOV(b, jay_extract(dst, 2), jay_extract(nj->payload.u0, 7)); + break; + + case nir_intrinsic_shuffle_intel: { + jay_def data = nj_src(intr->src[0]); + + if (nir_src_is_const(intr->src[1])) { + /* Broadcast takes a lane index, with only 32-bit registers */ + jay_BROADCAST_IMM(b, dst, data, nir_src_as_uint(intr->src[1]) / 4); + } else { + /* Shuffle takes a byte index */ + jay_SHUFFLE(b, dst, data, nj_src(intr->src[1])); + } + + break; + } + + case nir_intrinsic_quad_broadcast: + jay_QUAD_SWIZZLE(b, dst, nj_src(intr->src[0]), + JAY_QUAD_SWIZZLE_XXXX + nir_src_as_uint(intr->src[1])); + break; + + case nir_intrinsic_load_inline_data_intel: { + assert(cs && f->is_entrypoint && "todo: this needs ABI"); + b->shader->prog_data->cs.uses_inline_data = true; + + unsigned offset = nir_intrinsic_base(intr) / 4; + unsigned nr = jay_num_values(dst); + jay_copy(b, dst, jay_extract_range(nj->payload.inline_data, offset, nr)); + break; + } + + default: +#ifndef NDEBUG + assert(intr->intrinsic < nir_num_intrinsics); + fprintf(stdout, "intrinsic: %s\n", + nir_intrinsic_infos[intr->intrinsic].name); +#endif + UNREACHABLE("unknown intrinsic"); + } +} + +static bool +sampler_needs_header(enum brw_sampler_opcode op, + nir_texop nir_op, + const struct intel_device_info *devinfo) +{ + switch (op) { + case BRW_SAMPLER_OPCODE_SAMPLEINFO: + return true; + case BRW_SAMPLER_OPCODE_LD: + case BRW_SAMPLER_OPCODE_LD_LZ: + /* Xe3 HW does not seem to work unless we force a header. 
*/ + return devinfo->ver >= 30; + default: + return nir_op == nir_texop_tg4; + } +} + +static void +jay_emit_texture(struct nir_to_jay_state *nj, nir_tex_instr *tex) +{ + /* SKL PRMs: Volume 7: 3D-Media-GPGPU: + * + * "The Pixel Null Mask field, when enabled via the Pixel Null Mask + * Enable will be incorect for sample_c when applied to a surface with + * 64-bit per texel format such as R16G16BA16_UNORM. Pixel Null mask + * Enable may incorrectly report pixels as referencing a Null surface." + * + * We'll take care of this in NIR. + */ + assert(!tex->is_sparse || + nir_tex_instr_src_index(tex, nir_tex_src_comparator) == -1); + + jay_builder *b = &nj->bld; + jay_def dst = nj_def(&tex->def); + jay_def tmp = dst; + + const enum brw_sampler_opcode op = (enum brw_sampler_opcode)( + tex->backend_flags & ~BRW_TEX_INSTR_FUSED_EU_DISABLE); + const struct brw_sampler_payload_desc *payload_desc = + brw_get_sampler_payload_desc(op); + + /* First deal with surface & sampler */ + unsigned payload_type_bit_size = 0; + bool surface_bindless = false; + bool sampler_bindless = false; + jay_def surface, sampler, packed_offsets = jay_null(); + jay_def payload[JAY_MAX_SAMPLER_MESSAGE_SIZE]; + int i; + if ((i = nir_tex_instr_src_index(tex, nir_tex_src_texture_handle)) >= 0) { + unsigned x; + surface = + jay_resource_handle(b, &tex->src[i].src, &x, NULL, &surface_bindless); + if (jay_is_null(surface)) + surface = jay_imm(x); + assert(tex->texture_index == 0); + } else if ((i = nir_tex_instr_src_index(tex, nir_tex_src_texture_offset)) >= + 0) { + unsigned x; + surface = + jay_resource_handle(b, &tex->src[i].src, &x, NULL, &surface_bindless); + if (jay_is_null(surface)) + surface = jay_imm(x + tex->texture_index); + else if (tex->texture_index) + surface = jay_ADD_u32(b, surface, tex->texture_index); + } else { + surface = jay_imm(tex->texture_index); + } + + if ((i = nir_tex_instr_src_index(tex, nir_tex_src_sampler_handle)) >= 0) { + unsigned x; + sampler = + jay_resource_handle(b, &tex->src[i].src, &x, NULL, &sampler_bindless); + if (jay_is_null(sampler)) + surface = jay_imm(x); + assert(tex->sampler_index == 0); + } else if ((i = nir_tex_instr_src_index(tex, nir_tex_src_sampler_offset)) >= + 0) { + unsigned x; + sampler = + jay_resource_handle(b, &tex->src[i].src, &x, NULL, &sampler_bindless); + if (jay_is_null(sampler)) + sampler = jay_imm(x + tex->sampler_index); + else + sampler = jay_ADD_u32(b, sampler, tex->sampler_index); + } else { + sampler = jay_imm(tex->sampler_index); + } + + surface = emit_uniformize(nj, surface); + sampler = emit_uniformize(nj, sampler); + + /* Now the sampler payload */ + bool has_offset_in_payload = false; + bool payload_uniform = true; + uint32_t n_sources = TEX_LOGICAL_SRC_PAYLOAD0; + for (uint32_t i = 0; + payload_desc->sources[i].param != BRW_SAMPLER_PAYLOAD_PARAM_INVALID; + i++) { + nir_tex_src_type nir_source; + unsigned nir_comp; + +#define P(name) BRW_SAMPLER_PAYLOAD_PARAM_##name +#define S(name, component) \ + do { \ + nir_source = nir_tex_src_##name; \ + nir_comp = component; \ + } while (0) + + struct brw_sampler_payload_src sampler_src = payload_desc->sources[i]; + + switch (sampler_src.param) { + case P(U): + S(coord, 0); + break; + case P(V): + S(coord, 1); + break; + case P(R): + S(coord, 2); + break; + case P(AI): + S(coord, 3); + break; + case P(BIAS): + S(bias, 0); + break; + case P(LOD): + S(lod, 0); + break; + case P(MLOD): + S(min_lod, 0); + break; + case P(REF): + S(comparator, 0); + break; + case P(DUDX): + S(ddx, 0); + break; + case P(DUDY): + S(ddy, 0); 
+ break; + case P(DVDX): + S(ddx, 1); + break; + case P(DVDY): + S(ddy, 1); + break; + case P(DRDX): + S(ddx, 2); + break; + case P(DRDY): + S(ddy, 2); + break; + case P(SI): + S(ms_index, 0); + break; + case P(MCSL): + S(ms_mcs_intel, 0); + break; + case P(MCSH): + S(ms_mcs_intel, 1); + break; + case P(MCS0): + S(ms_mcs_intel, 0); + break; + case P(MCS1): + S(ms_mcs_intel, 1); + break; + case P(MCS2): + S(ms_mcs_intel, 2); + break; + case P(MCS3): + S(ms_mcs_intel, 3); + break; + + case P(OFFU): + S(offset, 0); + has_offset_in_payload = true; + break; + case P(OFFV): + S(offset, 1); + has_offset_in_payload = true; + break; + case P(OFFUV4): + case P(OFFUVR4): + case P(OFFUV6): + case P(OFFUVR6): + case P(BIAS_OFFUV6): + case P(BIAS_OFFUVR4): + case P(LOD_OFFUV6): + case P(LOD_OFFUVR4): + case P(OFFUV4_R): + case P(OFFUV6_R): + case P(OFFUVR4_R): + /* There is no payload with 2 packed entries, so backend1 is always + * the one payload parameter packed. */ + S(backend1, 0); + has_offset_in_payload = true; + break; + + case P(BIAS_AI): + case P(LOD_AI): + case P(MLOD_R): + /* There is no payload with 2 packed entries, so backend1 is always + * the one payload parameter packed. */ + S(backend1, 0); + break; + + default: + UNREACHABLE("unhandled sampler param"); + } + +#undef P +#undef S + + jay_def param_val = jay_null(); + + int j = nir_tex_instr_src_index(tex, nir_source); + if (j >= 0 && nir_comp < tex->src[j].src.ssa->num_components) { + param_val = jay_extract(nj_src(tex->src[j].src), nir_comp); + + unsigned bitsize = nir_src_bit_size(tex->src[j].src); + assert(payload_type_bit_size == 0 || payload_type_bit_size == bitsize); + payload_type_bit_size = bitsize; + } + + /* The hardware requires a LOD for buffer textures */ + if (tex->sampler_dim == GLSL_SAMPLER_DIM_BUF && + sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_LOD) { + sampler_src.optional = false; + } + + /* Wa_14012688258: + * + * Don't trim zeros at the end of payload for sample operations + * in cube and cube arrays. + * + * Compiler should send U,V,R parameters even if V,R are 0. + */ + if (tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE && + intel_needs_workaround(nj->devinfo, 14012688258) && + (sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_U || + sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_V || + sampler_src.param == BRW_SAMPLER_PAYLOAD_PARAM_R)) { + sampler_src.optional = false; + } + + /* The last source present in the payload dictates the number of + * sources, unless it's required. + * + * We can skip the last source if it's zero. + */ + if (!sampler_src.optional || !jay_is_null(param_val)) + n_sources = i + 1; + + if (jay_is_null(param_val)) { + param_val = jay_alloc_def(b, dst.file, 1); + jay_MOV(b, param_val, 0); + } + + payload[i] = param_val; + payload_uniform &= jay_is_uniform(payload[i]); + } + + i = nir_tex_instr_src_index(tex, nir_tex_src_backend2); + if (i >= 0) { + packed_offsets = nj_src(tex->src[i].src); + } + + /* Xe2+ should never used packed offsets since it has enough opcodes to + * handle any programmable offset. + */ + assert(jay_is_null(packed_offsets) || nj->devinfo->ver < 20); + + /* If the NIR instruction has an offset param but the sampler payload + * doesn't, we can put the offset into the header of the message. + * + * The restriction though is that it should be a constant value. 
+ */ + int offs_idx = nir_tex_instr_src_index(tex, nir_tex_src_offset); + bool has_const_offsets = offs_idx != -1 && !has_offset_in_payload; + + bool is_high_sampler = !jay_is_imm(sampler) || jay_as_uint(sampler) >= 16; + bool residency = tex->is_sparse; + unsigned null_mask_component = 0; + + const bool needs_header = sampler_needs_header(op, tex->op, nj->devinfo) || + has_const_offsets || + !jay_is_null(packed_offsets) || + sampler_bindless || + is_high_sampler || + residency; + + uint8_t component_mask; + if (tex->op == nir_texop_tg4) { + component_mask = WRITEMASK_XYZW; + } else if (residency) { + /* intel_nir_lower_sparse guarantees that texturing operations only + * read the data, or the sparse residency code, but not both at once. + * + * We need to use UGPRs for the residency result because the sampler + * returns the null pixel mask in lane 0, regardless of lanemasking. + * + * Unfortunately, the sampler doesn't allow us to writemask out all + * four colour channels, so we have to needlessly return red. This + * isn't uniform data, but we store it in an array of UGPRs anyway + * in order to have a consistent def file. The colour data will be + * immediately dead anyway. + */ + assert(tex->op == nir_texop_sparse_residency_intel || + tex->op == nir_texop_sparse_residency_txf_intel); + assert(nir_def_components_read(&tex->def) == WRITEMASK_Y); + component_mask = WRITEMASK_X; + unsigned red_grfs = payload_uniform ? 1 : jay_grf_per_gpr(b->shader); + unsigned grfs = red_grfs + 1; + tmp = jay_alloc_def(b, UGPR, grfs * jay_ugpr_per_grf(b->shader)); + null_mask_component = red_grfs * jay_ugpr_per_grf(b->shader); + } else { + component_mask = nir_def_components_read(&tex->def); + + /* We can reduce the return length of the message to drop unused + * trailing components, but shrinking with a discontiguous mask + * requires a message header. We only do that if we need a header + * for other reasons, as it's more expensive than writing extra data. + */ + if (!needs_header) { + component_mask = + (uint8_t) BITFIELD_MASK(util_last_bit(component_mask)); + } + + /* TODO: Shrink 16-bit textures too. Shrinking is problematic for some + * component masks due to 32-bit granularity of ISA registers. 
+ */ + if (tex->def.bit_size != 32 || (jay_debug & JAY_DBG_NOOPT)) + component_mask = nir_component_mask(tex->def.num_components); + + /* If we shrunk the destination, we need a temporary */ + if (component_mask != BITFIELD_MASK(tex->def.num_components)) { + tmp = jay_alloc_def(b, GPR, util_bitcount(component_mask)); + } + } + + /* SENDs always write entire GRFs so we need to pad out for uniform dests */ + if (dst.file == UGPR && !residency) { + unsigned nr = jay_ugpr_per_grf(b->shader) * jay_num_values(tmp); + tmp = jay_alloc_def(b, UGPR, nr); + } + + if (tex->op == nir_texop_texture_samples) { + assert(needs_header); + payload_type_bit_size = 32; + n_sources = 0; + } + + jay_def header = jay_null(); + if (needs_header) { + uint32_t header2; + if (tex->op == nir_texop_tg4) { + /* Gathers have a component but no write mask */ + header2 = (tex->component << 16); + } else { + /* If present, the header write mask are inverted compared to NIR */ + header2 = (~component_mask & 0xf) << 12; + } + + if (residency) + header2 |= 1 << 23; /* g0.2 bit 23: Pixel Null Mask Enable */ + + if (has_const_offsets) { + const unsigned num_components = nir_tex_instr_src_size(tex, offs_idx); + for (unsigned i = 0; i < num_components; i++) { + nir_scalar s = nir_get_scalar(tex->src[offs_idx].src.ssa, i); + s = nir_scalar_chase_movs(s); + assert(nir_scalar_is_const(s)); + int offset = nir_scalar_as_int(s); + + /* Offsets are 4-bits, reversed order */ + header2 |= (offset & 0xf) << ((2 - i) * 4); + } + } + + /* Vectorized zeroing of the header. TODO: This can be optimized more. */ + jay_def zeroes = jay_alloc_def(b, UGPR, jay_ugpr_per_grf(b->shader)); + jay_MOV(b, zeroes, 0); + + jay_def ugprs[JAY_MAX_DEF_LENGTH]; + jay_foreach_comp(zeroes, i) { + ugprs[i] = jay_extract(zeroes, i); + } + + /* Set the main immediate part of the header */ + if (header2 != 0) { + ugprs[2] = jay_MOV_u32(b, header2); + } + + if (sampler_bindless) { + /* Bindless sampler handles aren't relative to the sampler state + * pointer passed into the shader through SAMPLER_STATE_POINTERS_*. + * Instead, it's an absolute pointer relative to dynamic state base + * address. + * + * Sampler states are 16 bytes each and the pointer we give here has + * to be 32-byte aligned. In order to avoid more indirect messages + * than required, we assume that all bindless sampler states are + * 32-byte aligned. This sacrifices a bit of general state base + * address space but means we can do something more efficient in the + * shader. + */ + ugprs[3] = sampler; + } else { + /* Select the default dynamic state base address + offset */ + jay_def sampler_ptr = nj->payload.sampler_state_pointer; + + /* Gfx11+ sampler message headers include bits in 4:0 which conflict + * with the ones included in g0.3 bits 4:0. Mask them out. + */ + if (b->shader->devinfo->ver >= 11) { + sampler_ptr = jay_AND_u32(b, sampler_ptr, INTEL_MASK(31, 5)); + } + + /* TODO: We should probably lower this in NIR. 
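+ * Both paths below advance sampler_ptr to the 16-sampler group holding this sampler (SAMPLER_STATE entries are 16 bytes each), since the message descriptor itself only encodes sampler % 16.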
*/ + if (is_high_sampler) { + if (jay_is_imm(sampler)) { + unsigned s = jay_as_uint(sampler); + const int sampler_state_size_B = 16; + unsigned offs_B = ROUND_DOWN_TO(s, 16) * sampler_state_size_B; + assert(offs_B > 0 && "since s > 0"); + sampler_ptr = jay_ADD_u32(b, sampler_ptr, offs_B); + } else { + jay_def offs_B = + jay_SHL_u32(b, jay_AND_u32(b, sampler, 0xf0), 4); + sampler_ptr = jay_ADD_u32(b, sampler_ptr, offs_B); + } + } + + ugprs[3] = sampler_ptr; + } + /* Zip it all up into a vector of UGPRs which will RA to a single GRF */ + header = jay_collect_vectors(b, ugprs, jay_num_values(zeroes)); + } + + assert(payload_type_bit_size == 16 || payload_type_bit_size == 32); + unsigned simd_mode = 0; + unsigned simd_width = payload_uniform ? 1 : nj->s->dispatch_width; + if (nj->devinfo->ver < 20) { + if (payload_type_bit_size == 16) { + assert(nj->devinfo->ver >= 11); + simd_mode = simd_width <= 8 ? GFX10_SAMPLER_SIMD_MODE_SIMD8H : + GFX10_SAMPLER_SIMD_MODE_SIMD16H; + } else { + simd_mode = simd_width <= 8 ? BRW_SAMPLER_SIMD_MODE_SIMD8 : + BRW_SAMPLER_SIMD_MODE_SIMD16; + } + } else { + if (payload_type_bit_size == 16) { + simd_mode = simd_width <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16H : + XE2_SAMPLER_SIMD_MODE_SIMD32H; + } else { + simd_mode = simd_width <= 16 ? XE2_SAMPLER_SIMD_MODE_SIMD16 : + XE2_SAMPLER_SIMD_MODE_SIMD32; + } + } + + uint64_t desc = 0; + jay_def desc_src = jay_null(), desc_ex_src = jay_null(); + + unsigned sampler_imm = 0; + if (jay_is_imm(sampler) && !sampler_bindless) { + sampler_imm = jay_as_uint(sampler) % 16; + } + + const unsigned msg_type = brw_get_sampler_hw_opcode(op); + bool is_16 = false; /* TODO */ + unsigned ret_type = is_16 ? GFX8_SAMPLER_RETURN_FORMAT_16BITS : + GFX8_SAMPLER_RETURN_FORMAT_32BITS; + + if (!surface_bindless && + jay_is_imm(surface) && + (jay_is_imm(sampler) || sampler_bindless)) { + desc = brw_sampler_desc(nj->devinfo, jay_as_uint(surface), sampler_imm, + msg_type, simd_mode, ret_type); + } else if (surface_bindless) { + /* Bindless surface */ + desc = brw_sampler_desc(nj->devinfo, GFX9_BTI_BINDLESS, sampler_imm, + msg_type, simd_mode, ret_type); + + /* For bindless samplers, the entire address is included in the message + * header so we can leave the portion in the message descriptor 0. + */ + if (!sampler_bindless && !jay_is_imm(sampler)) { + desc_src = jay_SHL_u32(b, sampler, 8); + } + + /* We assume that the driver provided the handle in the top 20 bits so + * we can use the surface handle directly as the extended descriptor. + */ + desc_ex_src = jay_alloc_def(b, J_ADDRESS, 1); + jay_MOV(b, desc_ex_src, surface); + } else { + /* Immediate portion of the descriptor */ + desc = brw_sampler_desc(nj->devinfo, 0, 0, msg_type, simd_mode, ret_type); + + if (sampler_bindless) { + desc_src = surface; + } else if (!sampler_bindless && jay_is_imm(sampler)) { + desc_src = jay_OR_u32(b, surface, jay_as_uint(sampler) << 8); + } else { + desc_src = jay_OR_u32(b, jay_SHL_u32(b, sampler, 8), surface); + } + + desc_src = jay_AND_u32(b, desc_src, 0xfff); + } + + if (n_sources > 2 || !jay_is_null(header)) { + for (unsigned i = 0; i < n_sources; ++i) { + payload[i] = + jay_src_as_strided(b, payload[i], 1, payload_uniform ? 
UGPR : GPR); + } + } + + enum jay_type src_type = jay_type(JAY_TYPE_U, payload_type_bit_size); + jay_SEND(b, .sfid = BRW_SFID_SAMPLER, .msg_desc = desc, .desc = desc_src, + .ex_desc = desc_ex_src, .header = header, .srcs = payload, + .nr_srcs = n_sources, .type = JAY_TYPE_U32, + .src_type = { src_type }, .dst = tmp, .uniform = payload_uniform, + .bindless = surface_bindless); + + /* If we sampled into a temporary, copy out to the final */ + if (residency) { + jay_MOV(b, jay_extract(dst, 1), jay_extract(tmp, null_mask_component)); + } else if (!jay_defs_equivalent(dst, tmp)) { + unsigned i = 0; + unsigned tmp_stride = dst.file == UGPR ? jay_ugpr_per_grf(b->shader) : 1; + + u_foreach_bit(c, component_mask) { + jay_MOV(b, jay_extract(dst, c), jay_extract(tmp, (i++) * tmp_stride)); + } + } + + if (mesa_shader_stage_is_compute(b->shader->stage)) { + b->shader->prog_data->cs.uses_sampler |= !nir_tex_instr_is_query(tex); + } +} + +static void +jay_emit_jump(struct nir_to_jay_state *nj, nir_jump_instr *instr) +{ + switch (instr->type) { + case nir_jump_break: + jay_block_add_successor(nj->current_block, nj->break_block); + jay_BREAK(&nj->bld); + break; + case nir_jump_halt: + // TODO: Do we want a predicated EOT here, or a jump to the end? + assert(!"TODO: implement HALT"); + break; + case nir_jump_return: + /* Should be lowered */ + default: + UNREACHABLE("unknown jump"); + } +} + +static void +jay_emit_instr(struct nir_to_jay_state *nj, jay_block *block, nir_instr *instr) +{ + switch (instr->type) { + case nir_instr_type_alu: + jay_emit_alu(nj, nir_instr_as_alu(instr)); + break; + + case nir_instr_type_intrinsic: + jay_emit_intrinsic(nj, nir_instr_as_intrinsic(instr)); + break; + + case nir_instr_type_tex: + jay_emit_texture(nj, nir_instr_as_tex(instr)); + break; + + case nir_instr_type_load_const: + jay_emit_load_const(nj, nir_instr_as_load_const(instr)); + break; + + case nir_instr_type_phi: + case nir_instr_type_undef: { + jay_def def = nj_def(nir_instr_def(instr)); + + jay_foreach_comp(def, c) { + if (instr->type == nir_instr_type_phi) { + jay_PHI_DST(&nj->bld, jay_extract(def, c)); + } else { + jay_INDETERMINATE(&nj->bld, jay_extract(def, c)); + } + } + + break; + } + + case nir_instr_type_jump: + jay_emit_jump(nj, nir_instr_as_jump(instr)); + break; + + case nir_instr_type_deref: + UNREACHABLE("All derefs should've been lowered"); + + default: + UNREACHABLE("unknown instruction type"); + } +} + +static jay_block * +jay_create_block(struct nir_to_jay_state *nj) +{ + jay_block *block = jay_new_block(nj->f); + block->indent = nj->indent; + return block; +} + +static jay_inst * +jay_block_ending_unconditional_jump(jay_block *block) +{ + jay_inst *jump = jay_block_ending_jump(block); + return jump && !jump->predication ? 
jump : NULL; +} + +static void +jay_emit_if(struct nir_to_jay_state *nj, nir_if *nif) +{ + jay_builder *b = &nj->bld; + jay_def condition = nj_src(nif->condition); + + jay_block *before_block = nj->current_block; + jay_block *after_block = jay_create_block(nj); + + /* Push */ + ++nj->indent; + + jay_block *else_first = jay_create_block(nj); + + jay_block *then_first = jay_emit_cf_list(nj, &nif->then_list); + jay_block *then_last = nj->current_block; + + nj->after_block = else_first; + + jay_block *else_first_2 = jay_emit_cf_list(nj, &nif->else_list); + jay_block *else_last = nj->current_block; + assert(else_first == else_first_2); + + /* Pop */ + --nj->indent; + + jay_block_add_successor(before_block, then_first); + jay_block_add_successor(before_block, else_first); + + if (!jay_block_ending_unconditional_jump(then_last)) + jay_block_add_successor(then_last, after_block); + + if (!jay_block_ending_unconditional_jump(else_last)) + jay_block_add_successor(else_last, after_block); + + nj->after_block = after_block; + + /* Emit the if-else-endif sequence */ + b->cursor = jay_after_block(before_block); + jay_add_predicate(b, jay_IF(b), condition); + + b->cursor = jay_before_block(else_first); + jay_ELSE(b); + + b->cursor = jay_after_block(else_last); + jay_ENDIF(b); +} + +static void +jay_emit_loop(struct nir_to_jay_state *nj, nir_loop *nloop) +{ + assert(!nir_loop_has_continue_construct(nloop)); + + jay_builder *b = &nj->bld; + jay_block *saved_break = nj->break_block; + + /* Make the block that will be after the loop exit */ + nj->break_block = jay_create_block(nj); + ++nj->indent; + + /* Make a block for the loop body, which is also the loop header */ + jay_block *loop_header = jay_create_block(nj); + loop_header->loop_header = true; + + /* The current block falls through to the start of the loop */ + jay_block_add_successor(nj->current_block, loop_header); + + /* Emit the loop body */ + nj->after_block = loop_header; + jay_emit_cf_list(nj, &nloop->body); + + /* Emit the backedge */ + jay_inst *jump = jay_block_ending_jump(nj->current_block); + if (jump && jump->op == JAY_OPCODE_BREAK) { + jump->op = JAY_OPCODE_LOOP_ONCE; + } else { + jay_block_add_successor(nj->current_block, loop_header); + jay_WHILE(b); + } + + /* Pop */ + --nj->indent; + nj->after_block = nj->break_block; + nj->break_block = saved_break; + + b->cursor = jay_after_block(nj->after_block); +} + +static jay_block * +jay_emit_block(struct nir_to_jay_state *nj, nir_block *nb) +{ + jay_builder *b = &nj->bld; + + if (nj->after_block) { + nj->current_block = nj->after_block; + nj->after_block = NULL; + } else { + nj->current_block = jay_create_block(nj); + } + + jay_block *block = nj->current_block; + block->uniform = !nb->divergent; + list_addtail(&block->link, &nj->f->blocks); + + b->cursor = jay_after_block(block); + + /* Emit the contents of the block */ + nir_foreach_instr(instr, nb) { + jay_emit_instr(nj, block, instr); + } + + /* Look in the current NIR block's successors for any phis. Each of them + * should have a source corresponding to a value coming from our current + * block. Create PHI_SRC opcodes in the current block for those values. + * The corresponding PHI_DST may not have been emitted yet, but that's ok. 
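+ * Each PHI_SRC is tagged with the NIR def index (plus component) of the phi it feeds, so the two halves can be paired up later.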
+ */ + for (unsigned bs = 0; bs < ARRAY_SIZE(nb->successors); ++bs) { + nir_block *nb_successor = nb->successors[bs]; + if (!nb_successor) + continue; + + nir_foreach_phi(nphi, nb_successor) { + jay_def val = nj_src(nir_phi_get_src_from_block(nphi, nb)->src); + + /* The phi def might be nonuniform but have uniform source (like a + * constant). Move to the correct file in the the source block and + * reference that in PHI_SRC. + */ + if (jay_file_for_def(&nphi->def) != val.file) { + b->cursor = jay_after_block_logical(block); + jay_def tmp = val; + val = jay_alloc_def(b, jay_file_for_def(&nphi->def), + jay_num_values(val)); + jay_copy(b, val, tmp); + } + + jay_foreach_comp(val, c) { + b->cursor = jay_before_jump(block); + jay_PHI_SRC(b, JAY_TYPE_U32, jay_extract(val, c), + nphi->def.index + c); + } + } + } + + b->cursor = jay_after_block(block); + nj->active_lane_mask = jay_null(); + nj->active_lane = jay_null(); + nj->active_lane_x4 = jay_null(); + + return block; +} + +static jay_block * +jay_emit_cf_list(struct nir_to_jay_state *nj, struct exec_list *list) +{ + jay_block *start_block = NULL; + + foreach_list_typed(nir_cf_node, node, node, list) { + switch (node->type) { + case nir_cf_node_block: { + jay_block *block = jay_emit_block(nj, nir_cf_node_as_block(node)); + + if (!start_block) + start_block = block; + break; + } + + case nir_cf_node_if: + jay_emit_if(nj, nir_cf_node_as_if(node)); + break; + + case nir_cf_node_loop: + jay_emit_loop(nj, nir_cf_node_as_loop(node)); + break; + + default: + UNREACHABLE("Unknown NIR control flow node"); + } + } + + return start_block; +} + +static void +jay_emit_eot(struct nir_to_jay_state *nj) +{ + jay_builder *b = &nj->bld; + + if (mesa_shader_stage_is_compute(nj->nir->info.stage)) { + /* Vectorized copy into the EOT register. Not necessary for correctness + * but keeps RA from inserting 16 scalar copies instead. + */ + jay_def copy = jay_alloc_def(b, UGPR, jay_ugpr_per_grf(b->shader)); + jay_MOV(b, copy, nj->payload.u0); + + jay_SEND(b, .sfid = BRW_SFID_MESSAGE_GATEWAY, .eot = true, .msg_desc = 0, + .srcs = ©, .nr_srcs = 1, .type = JAY_TYPE_U32, + .uniform = true); + } else if (nj->nir->info.stage == MESA_SHADER_VERTEX) { + jay_block *block = jay_last_block(nj->f); + jay_inst *I = jay_last_inst(block); + + /* TODO: What if this isn't the case? Do we need a no-op store...? */ + assert(I && I->op == JAY_OPCODE_SEND && jay_send_sfid(I) == BRW_SFID_URB); + jay_set_send_eot(I, true); + } +} + +static void +set_cr0(jay_function *f, jay_cursor cursor, uint32_t *cr0, uint32_t desired) +{ + /* Only touch cr0 if we are changing bits */ + if ((*cr0) != desired) { + jay_builder b = jay_init_builder(f, cursor); + jay_XOR(&b, JAY_TYPE_U32, jay_control(), jay_control(), (*cr0) ^ desired); + *cr0 = desired; + } +} + +static void +jay_insert_fp_mode(jay_shader *shader, uint32_t api, uint32_t float_sizes) +{ + /* First, work out the global float control mode for the shader */ + uint32_t global = 0x0; + + /* Initially fp16 denorms are flushed-to-zero, handle preserve. */ + if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP16) && (float_sizes & 16)) { + global |= BRW_CR0_FP16_DENORM_PRESERVE; + } + + /* Initially fp32 denorms are flushed-to-zero, handle preserve. + * + * TODO: Optimize this, we have a dispatch bit. + */ + if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP32) && (float_sizes & 32)) { + global |= BRW_CR0_FP32_DENORM_PRESERVE; + } + + /* Initially fp64 denorms are flushed to zero, handle preserve. 
*/ + if ((api & FLOAT_CONTROLS_DENORM_PRESERVE_FP64) && (float_sizes & 64)) { + global |= BRW_CR0_FP64_DENORM_PRESERVE; + } + + /* By default, we are in round-to-even mode. Note we do not permit setting + * round mode separately by bitsize but this is ok for current APIs. The + * Vulkan driver sets roundingModeIndependence = NONE. + * + * TODO: Optimize this, there is a command buffer bit for it. + */ + if (((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP16) && (float_sizes & 16)) || + ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP32) && (float_sizes & 32)) || + ((api & FLOAT_CONTROLS_ROUNDING_MODE_RTZ_FP64) && (float_sizes & 64))) { + global |= (BRW_RND_MODE_RTZ << BRW_CR0_RND_MODE_SHIFT); + } + + uint32_t cr0 = 0; + jay_function *entrypoint = jay_shader_get_entrypoint(shader); + set_cr0(entrypoint, jay_before_function(entrypoint), &cr0, global); + + /* Now handle per-instruction deltas to the global mode */ + jay_foreach_function(shader, func) { + jay_foreach_block(func, block) { + uint32_t current = cr0; + + jay_foreach_inst_in_block(block, I) { + uint32_t required = cr0; + enum jay_rounding_mode round = + (I->op == JAY_OPCODE_CVT) ? jay_cvt_rounding_mode(I) : JAY_ROUND; + + if (round != JAY_ROUND) { + required &= ~BRW_CR0_RND_MODE_MASK; + required |= ((round - JAY_RNE) << BRW_CR0_RND_MODE_SHIFT); + } + + if (jay_type_is_any_float(I->type)) { + set_cr0(func, jay_before_inst(I), ¤t, required); + } + } + + /* Restore to global state on block boundaries */ + if (jay_num_successors(block) > 0) { + set_cr0(func, jay_after_block(block), ¤t, cr0); + } + } + } +} + +struct payload_builder { + jay_builder *b; + unsigned offsets[JAY_NUM_SSA_FILES]; + jay_def vecs[JAY_NUM_SSA_FILES]; +}; + +static jay_def +read_payload(struct payload_builder *b, enum jay_file file) +{ + unsigned granularity = file == UGPR ? 16 : 1; + unsigned channel = b->offsets[file] % granularity; + + if (channel == 0) { + b->vecs[file] = jay_alloc_def(b->b, file, granularity); + jay_PRELOAD(b->b, b->vecs[file], b->offsets[file]); + } + + b->offsets[file]++; + return jay_extract(b->vecs[file], channel); +} + +static jay_def +read_vector_payload(struct payload_builder *b, enum jay_file file, unsigned len) +{ + jay_def defs[JAY_MAX_DEF_LENGTH]; + assert(len < ARRAY_SIZE(defs)); + + for (unsigned i = 0; i < len; ++i) { + defs[i] = read_payload(b, file); + } + + return jay_collect_vectors(b->b, defs, len); +} + +static void +setup_payload_push(struct nir_to_jay_state *nj, struct payload_builder *p) +{ + unsigned push_size_B = 0; + for (int i = 0; i < ARRAY_SIZE(nj->s->prog_data->base.push_sizes); i++) { + push_size_B += nj->s->prog_data->base.push_sizes[i]; + } + + assert(util_is_aligned(push_size_B, 32)); + for (unsigned i = 0; i < (push_size_B / 4); ++i) { + nj->payload.push_data[i] = read_payload(p, UGPR); + } + + nj->s->push_grfs = push_size_B / (4 * jay_ugpr_per_grf(nj->s)); +} + +static void +setup_vertex_payload(struct nir_to_jay_state *nj, struct payload_builder *p) +{ + nj->payload.urb_handle = read_payload(p, GPR); + + /* XXX: This is a hack to line up with the partition chosen in RA. This whole + * thing needs an overhaul. Need to think harder about partitioning. 
+ */ + p->offsets[GPR] += 7; + + for (unsigned i = 0; i < (8 * nj->s->prog_data->vue.urb_read_length); ++i) { + assert(i < ARRAY_SIZE(nj->payload.vs.attributes)); + nj->payload.vs.attributes[i] = read_payload(p, GPR); + } + + setup_payload_push(nj, p); +} + +static void +setup_compute_payload(struct nir_to_jay_state *nj, struct payload_builder *p) +{ + assert(!nj->s->prog_data->cs.generate_local_id); + assert(!nj->s->prog_data->cs.uses_btd_stack_ids); + + nj->payload.inline_data = + read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s)); +} + +static inline enum intel_barycentric_mode +brw_barycentric_mode(const struct brw_fs_prog_key *key, + nir_intrinsic_instr *intr) +{ + const enum glsl_interp_mode mode = nir_intrinsic_interp_mode(intr); + + /* Barycentric modes don't make sense for flat inputs. */ + assert(mode != INTERP_MODE_FLAT); + + unsigned bary; + switch (intr->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_at_offset: + /* When per sample interpolation is dynamic, assume sample interpolation. + * We'll dynamically remap things so that the FS payload is not affected. + */ + bary = key->persample_interp == INTEL_SOMETIMES ? + INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE : + INTEL_BARYCENTRIC_PERSPECTIVE_PIXEL; + break; + case nir_intrinsic_load_barycentric_centroid: + bary = INTEL_BARYCENTRIC_PERSPECTIVE_CENTROID; + break; + case nir_intrinsic_load_barycentric_sample: + case nir_intrinsic_load_barycentric_at_sample: + bary = INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE; + break; + default: + UNREACHABLE("invalid intrinsic"); + } + + if (mode == INTERP_MODE_NOPERSPECTIVE) + bary += 3; + + return (enum intel_barycentric_mode) bary; +} + +struct fs_info_ctx { + const struct brw_fs_prog_key *key; + struct brw_fs_prog_data *prog_data; + const struct intel_device_info *devinfo; +}; + +static bool +gather_fs_info(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + struct fs_info_ctx *ctx = data; + struct brw_fs_prog_data *prog_data = ctx->prog_data; + + switch (intr->intrinsic) { + case nir_intrinsic_load_barycentric_pixel: + case nir_intrinsic_load_barycentric_centroid: + case nir_intrinsic_load_barycentric_sample: + prog_data->barycentric_interp_modes |= + 1 << brw_barycentric_mode(ctx->key, intr); + break; + + case nir_intrinsic_load_barycentric_at_sample: + case nir_intrinsic_load_barycentric_at_offset: { + unsigned mode = brw_barycentric_mode(ctx->key, intr); + prog_data->barycentric_interp_modes |= 1 << mode; + prog_data->uses_sample_offsets |= + mode == INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE || + mode == INTEL_BARYCENTRIC_NONPERSPECTIVE_SAMPLE; + + if ((1 << mode) & INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) + prog_data->uses_npc_bary_coefficients = true; + else + prog_data->uses_pc_bary_coefficients = true; + break; + } + + case nir_intrinsic_load_frag_coord_z: + prog_data->uses_src_depth = true; + break; + + case nir_intrinsic_load_frag_coord_w_rcp: + prog_data->uses_src_w = true; + break; + + case nir_intrinsic_load_sample_mask_in: + /* TODO: Sample masks are broken and discards are broken and simd32 + * layouts are broken too. XXX. 
+ */ + // prog_data->uses_sample_mask = true; + break; + + case nir_intrinsic_load_pixel_coord_intel: + BITSET_SET(b->shader->info.system_values_read, SYSTEM_VALUE_FRAG_COORD); + break; + + default: + break; + } + + return false; +} + +static void +brw_compute_flat_inputs(struct brw_fs_prog_data *prog_data, + const nir_shader *shader) +{ + prog_data->flat_inputs = 0; + + nir_foreach_shader_in_variable(var, shader) { + if (var->data.interpolation != INTERP_MODE_FLAT || + var->data.per_primitive) + continue; + + unsigned slots = glsl_count_attribute_slots(var->type, false); + for (unsigned s = 0; s < slots; s++) { + int input_index = prog_data->urb_setup[var->data.location + s]; + + if (input_index >= 0) + prog_data->flat_inputs |= 1 << input_index; + } + } +} + +static uint8_t +computed_depth_mode(const nir_shader *shader) +{ + if (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) { + switch (shader->info.fs.depth_layout) { + case FRAG_DEPTH_LAYOUT_NONE: + case FRAG_DEPTH_LAYOUT_ANY: + return BRW_PSCDEPTH_ON; + case FRAG_DEPTH_LAYOUT_GREATER: + return BRW_PSCDEPTH_ON_GE; + case FRAG_DEPTH_LAYOUT_LESS: + return BRW_PSCDEPTH_ON_LE; + case FRAG_DEPTH_LAYOUT_UNCHANGED: + /* We initially set this to OFF, but having the shader write the + * depth means we allocate register space in the SEND message. The + * difference between the SEND register count and the OFF state + * programming makes the HW hang. + * + * Removing the depth writes also leads to test failures. So use + * LesserThanOrEqual, which fits writing the same value + * (unchanged/equal). + * + */ + return BRW_PSCDEPTH_ON_LE; + } + } + return BRW_PSCDEPTH_OFF; +} + +/* + * Build up an array of indices into the urb_setup array that + * references the active entries of the urb_setup array. + * Used to accelerate walking the active entries of the urb_setup array + * on each upload. + */ +static void +brw_compute_urb_setup_index(struct brw_fs_prog_data *fs_prog_data) +{ + /* TODO(mesh): Review usage of this in the context of Mesh, we may want to + * skip per-primitive attributes here. + */ + + /* Make sure uint8_t is sufficient */ + static_assert(VARYING_SLOT_MAX <= 0xff); + uint8_t index = 0; + for (uint8_t attr = 0; attr < VARYING_SLOT_MAX; attr++) { + if (fs_prog_data->urb_setup[attr] >= 0) { + fs_prog_data->urb_setup_attribs[index++] = attr; + } + } + fs_prog_data->urb_setup_attribs_count = index; +} + +static void +calculate_urb_setup(const struct intel_device_info *devinfo, + const struct brw_fs_prog_key *key, + struct brw_fs_prog_data *prog_data, + nir_shader *nir, + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) +{ + memset(prog_data->urb_setup, -1, sizeof(prog_data->urb_setup)); + int urb_next = 0; /* in vec4s */ + + /* Figure out where the PrimitiveID lives, either in the per-vertex block + * or in the per-primitive block or both. + */ + const uint64_t per_vert_primitive_id = + key->mesh_input == INTEL_ALWAYS ? 0 : VARYING_BIT_PRIMITIVE_ID; + const uint64_t per_prim_primitive_id = + key->mesh_input == INTEL_NEVER ? 
0 : VARYING_BIT_PRIMITIVE_ID; + const uint64_t inputs_read = + nir->info.inputs_read & + (~nir->info.per_primitive_inputs | per_vert_primitive_id); + const uint64_t per_primitive_header_bits = + VARYING_BIT_PRIMITIVE_SHADING_RATE | + VARYING_BIT_LAYER | + VARYING_BIT_VIEWPORT | + VARYING_BIT_CULL_PRIMITIVE; + const uint64_t per_primitive_inputs = + nir->info.inputs_read & + (nir->info.per_primitive_inputs | per_prim_primitive_id) & + ~per_primitive_header_bits; + struct intel_vue_map vue_map; + uint32_t per_primitive_stride = 0, first_read_offset = UINT32_MAX; + + if (mue_map != NULL) { + memcpy(&vue_map, &mue_map->vue_map, sizeof(vue_map)); + memcpy(per_primitive_offsets, mue_map->per_primitive_offsets, + sizeof(mue_map->per_primitive_offsets)); + + if (!mue_map->wa_18019110168_active) { + u_foreach_bit64(location, per_primitive_inputs) { + assert(per_primitive_offsets[location] != -1); + + first_read_offset = + MIN2(first_read_offset, + (uint32_t) per_primitive_offsets[location]); + per_primitive_stride = + MAX2((uint32_t) per_primitive_offsets[location] + 16, + per_primitive_stride); + } + } else { + first_read_offset = per_primitive_stride = 0; + } + } else { + brw_compute_vue_map(devinfo, &vue_map, inputs_read, key->base.vue_layout, + 1 /* pos_slots, TODO */); + brw_compute_per_primitive_map(per_primitive_offsets, + &per_primitive_stride, &first_read_offset, + 0, nir, nir_var_shader_in, + per_primitive_inputs, + true /* separate_shader */); + } + + if (per_primitive_stride > first_read_offset) { + first_read_offset = ROUND_DOWN_TO(first_read_offset, 32); + + /* Remove the first few unused registers */ + for (uint32_t i = 0; i < VARYING_SLOT_MAX; i++) { + if (per_primitive_offsets[i] == -1) + continue; + per_primitive_offsets[i] -= first_read_offset; + } + + prog_data->num_per_primitive_inputs = + 2 * DIV_ROUND_UP(per_primitive_stride - first_read_offset, 32); + } else { + prog_data->num_per_primitive_inputs = 0; + } + + /* Now do the per-vertex stuff (what used to be legacy pipeline) */ + + /* If Mesh is involved, we cannot do any packing. Documentation doesn't say + * anything about this but 3DSTATE_SBE_SWIZ does not appear to work when + * using Mesh. + */ + if (util_bitcount64(inputs_read) <= 16 && key->mesh_input == INTEL_NEVER) { + /* When not in Mesh pipeline mode, the SF/SBE pipeline stage can do + * arbitrary rearrangement of the first 16 varying inputs, so we can put + * them wherever we want. Just put them in order. + * + * This is useful because it means that (a) inputs not used by the + * fragment shader won't take up valuable register space, and (b) we + * won't have to recompile the fragment shader if it gets paired with a + * different vertex (or geometry) shader. + */ + for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) { + if (inputs_read & BITFIELD64_BIT(i)) { + prog_data->urb_setup[i] = urb_next++; + } + } + } else { + /* We have enough input varyings that the SF/SBE pipeline stage can't + * arbitrarily rearrange them to suit our whim; we have to put them in + * an order that matches the output of the previous pipeline stage + * (geometry or vertex shader). 
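As a rough illustration of the slot matching handled by the loops just below (numbers are made up, not from a real shader): if the first varying the fragment shader reads sits at VUE slot 5, reads start at the 2-aligned slot 4, a varying at slot 7 lands at urb_setup entry 3, and a 12-slot map yields 8 varying inputs.

// Worked example of the first_slot / urb_setup arithmetic below; DEMO_* names
// are made up for the sketch.
#include <assert.h>

#define DEMO_ROUND_DOWN_TO(x, y) ((x) / (y) * (y))

int
main(void)
{
   unsigned first_read_slot = 5;   // first VUE slot the FS actually reads
   unsigned num_slots = 12;        // size of the upstream VUE map

   unsigned first_slot = DEMO_ROUND_DOWN_TO(first_read_slot, 2);   // 4
   unsigned urb_setup_for_slot_7 = 7 - first_slot;                 // 3
   unsigned num_varying_inputs = num_slots - first_slot;           // 8

   assert(first_slot == 4 && urb_setup_for_slot_7 == 3 &&
          num_varying_inputs == 8);
   return 0;
}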
+ */ + int first_slot = 0; + for (int i = 0; i < vue_map.num_slots; i++) { + int varying = vue_map.slot_to_varying[i]; + if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying)) != 0) { + first_slot = ROUND_DOWN_TO(i, 2); + break; + } + } + + for (int slot = first_slot; slot < vue_map.num_slots; slot++) { + int varying = vue_map.slot_to_varying[slot]; + if (varying > 0 && (inputs_read & BITFIELD64_BIT(varying))) { + prog_data->urb_setup[varying] = slot - first_slot; + } + } + urb_next = vue_map.num_slots - first_slot; + } + + prog_data->num_varying_inputs = urb_next; + prog_data->inputs = inputs_read; + prog_data->per_primitive_inputs = per_primitive_inputs; + + brw_compute_urb_setup_index(prog_data); +} + +static void +populate_fs_prog_data(nir_shader *shader, + const struct intel_device_info *devinfo, + const struct brw_fs_prog_key *key, + struct brw_fs_prog_data *prog_data, + const struct brw_mue_map *mue_map, + int *per_primitive_offsets) +{ + struct fs_info_ctx ctx = { + .key = key, + .prog_data = prog_data, + .devinfo = devinfo, + }; + nir_shader_intrinsics_pass(shader, gather_fs_info, nir_metadata_all, &ctx); + + prog_data->uses_kill = shader->info.fs.uses_discard; + prog_data->uses_omask = + !key->ignore_sample_mask_out && + (shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)); + prog_data->max_polygons = 1; + prog_data->computed_depth_mode = computed_depth_mode(shader); + prog_data->computed_stencil = + shader->info.outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL); + + prog_data->sample_shading = shader->info.fs.uses_sample_shading; + prog_data->api_sample_shading = key->api_sample_shading; + prog_data->min_sample_shading = key->min_sample_shading; + + assert(key->multisample_fbo != INTEL_NEVER || + key->persample_interp == INTEL_NEVER); + + prog_data->persample_dispatch = key->persample_interp; + if (prog_data->sample_shading) + prog_data->persample_dispatch = INTEL_ALWAYS; + + /* We can only persample dispatch if we have a multisample FBO */ + prog_data->persample_dispatch = + MIN2(prog_data->persample_dispatch, key->multisample_fbo); + + /* Currently only the Vulkan API allows alpha_to_coverage to be dynamic. If + * persample_dispatch & multisample_fbo are not dynamic, Anv should be able + * to definitively tell whether alpha_to_coverage is on or off. + */ + prog_data->alpha_to_coverage = key->alpha_to_coverage; + + assert(devinfo->verx10 >= 125 || key->mesh_input == INTEL_NEVER); + prog_data->mesh_input = key->mesh_input; + + assert(devinfo->verx10 >= 200 || key->provoking_vertex_last == INTEL_NEVER); + prog_data->provoking_vertex_last = key->provoking_vertex_last; + + /* From the Ivy Bridge PRM documentation for 3DSTATE_PS: + * + * "MSDISPMODE_PERSAMPLE is required in order to select + * POSOFFSET_SAMPLE" + * + * So we can only really get sample positions if we are doing real + * per-sample dispatch. If we need gl_SamplePosition and we don't have + * persample dispatch, we hard-code it to 0.5. 
+ */ + prog_data->uses_pos_offset = + prog_data->persample_dispatch != INTEL_NEVER && + (BITSET_TEST(shader->info.system_values_read, SYSTEM_VALUE_SAMPLE_POS) || + BITSET_TEST(shader->info.system_values_read, + SYSTEM_VALUE_SAMPLE_POS_OR_CENTER)); + + prog_data->early_fragment_tests = shader->info.fs.early_fragment_tests; + prog_data->post_depth_coverage = shader->info.fs.post_depth_coverage; + prog_data->inner_coverage = shader->info.fs.inner_coverage; + + /* From the BDW PRM documentation for 3DSTATE_WM: + * + * "MSDISPMODE_PERSAMPLE is required in order to select Perspective + * Sample or Non- perspective Sample barycentric coordinates." + * + * So cleanup any potentially set sample barycentric mode when not in per + * sample dispatch. + */ + if (prog_data->persample_dispatch == INTEL_NEVER) { + prog_data->barycentric_interp_modes &= + ~BITFIELD_BIT(INTEL_BARYCENTRIC_PERSPECTIVE_SAMPLE); + } + + if (devinfo->ver >= 20) { + prog_data->vertex_attributes_bypass = + brw_needs_vertex_attributes_bypass(shader); + } + + prog_data->uses_nonperspective_interp_modes = + (prog_data->barycentric_interp_modes & + INTEL_BARYCENTRIC_NONPERSPECTIVE_BITS) || + prog_data->uses_npc_bary_coefficients; + + /* The current VK_EXT_graphics_pipeline_library specification requires + * coarse to specified at compile time. But per sample interpolation can be + * dynamic. So we should never be in a situation where coarse & + * persample_interp are both respectively true & INTEL_ALWAYS. + * + * Coarse will dynamically turned off when persample_interp is active. + */ + assert(!key->coarse_pixel || key->persample_interp != INTEL_ALWAYS); + + prog_data->coarse_pixel_dispatch = + intel_sometimes_invert(prog_data->persample_dispatch); + if (!key->coarse_pixel || + /* DG2 should support this, but Wa_22012766191 says there are issues + * with CPS 1x1 + MSAA + FS writing to oMask. + */ + (devinfo->verx10 < 200 && + (prog_data->uses_omask || prog_data->uses_sample_mask)) || + prog_data->sample_shading || + (prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) || + prog_data->computed_stencil || + devinfo->ver < 11) { + prog_data->coarse_pixel_dispatch = INTEL_NEVER; + } + + /* ICL PRMs, Volume 9: Render Engine, Shared Functions Pixel Interpolater, + * Message Descriptor : + * + * "Message Type. Specifies the type of message being sent when + * pixel-rate evaluation is requested : + * + * Format = U2 + * 0: Per Message Offset (eval_snapped with immediate offset) + * 1: Sample Position Offset (eval_sindex) + * 2: Centroid Position Offset (eval_centroid) + * 3: Per Slot Offset (eval_snapped with register offset) + * + * Message Type. Specifies the type of message being sent when + * coarse-rate evaluation is requested : + * + * Format = U2 + * 0: Coarse to Pixel Mapping Message (internal message) + * 1: Reserved + * 2: Coarse Centroid Position (eval_centroid) + * 3: Per Slot Coarse Pixel Offset (eval_snapped with register offset)" + * + * The Sample Position Offset is marked as reserved for coarse rate + * evaluation and leads to hangs if we try to use it. So disable coarse + * pixel shading if we have any intrinsic that will result in a pixel + * interpolater message at sample. + */ + if (intel_nir_pulls_at_sample(shader)) + prog_data->coarse_pixel_dispatch = INTEL_NEVER; + + /* We choose to always enable VMask prior to XeHP, as it would cause + * us to lose out on the eliminate_find_live_channel() optimization. 
+ */ + prog_data->uses_vmask = + devinfo->verx10 < 125 || + shader->info.fs.needs_coarse_quad_helper_invocations || + shader->info.uses_wide_subgroup_intrinsics || + prog_data->coarse_pixel_dispatch != INTEL_NEVER; + + prog_data->uses_depth_w_coefficients = prog_data->uses_pc_bary_coefficients; + + if (prog_data->coarse_pixel_dispatch != INTEL_NEVER) { + prog_data->uses_depth_w_coefficients |= prog_data->uses_src_depth; + prog_data->uses_src_depth = false; + } + + calculate_urb_setup(devinfo, key, prog_data, shader, mue_map, + per_primitive_offsets); + brw_compute_flat_inputs(prog_data, shader); +} + +static void +populate_vs_prog_data(nir_shader *nir, + const struct intel_device_info *devinfo, + const struct brw_vs_prog_key *key, + struct brw_vs_prog_data *prog_data, + unsigned nr_packed_regs, + bool debug) +{ + unsigned nr_attribute_slots = util_bitcount64(prog_data->inputs_read); + BITSET_WORD *sysvals = nir->info.system_values_read; + + /* gl_VertexID and gl_InstanceID are system values, but arrive via an + * incoming vertex attribute. So, add an extra slot. + */ + if (BITSET_TEST(sysvals, SYSTEM_VALUE_FIRST_VERTEX) || + BITSET_TEST(sysvals, SYSTEM_VALUE_BASE_INSTANCE) || + BITSET_TEST(sysvals, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) || + BITSET_TEST(sysvals, SYSTEM_VALUE_INSTANCE_ID)) { + nr_attribute_slots++; + } + + /* gl_DrawID and IsIndexedDraw share its very own vec4 */ + if (BITSET_TEST(sysvals, SYSTEM_VALUE_DRAW_ID) || + BITSET_TEST(sysvals, SYSTEM_VALUE_IS_INDEXED_DRAW)) { + nr_attribute_slots++; + } + + const struct { + bool *data; + gl_system_value val; + } bool_sysvals[] = { + { &prog_data->uses_is_indexed_draw, SYSTEM_VALUE_IS_INDEXED_DRAW }, + { &prog_data->uses_firstvertex, SYSTEM_VALUE_FIRST_VERTEX }, + { &prog_data->uses_baseinstance, SYSTEM_VALUE_BASE_INSTANCE }, + { &prog_data->uses_vertexid, SYSTEM_VALUE_VERTEX_ID_ZERO_BASE }, + { &prog_data->uses_instanceid, SYSTEM_VALUE_INSTANCE_ID }, + { &prog_data->uses_drawid, SYSTEM_VALUE_DRAW_ID }, + }; + + for (unsigned i = 0; i < ARRAY_SIZE(bool_sysvals); ++i) { + *bool_sysvals[i].data = BITSET_TEST(sysvals, bool_sysvals[i].val); + } + + unsigned nr_attribute_regs; + if (key->vf_component_packing) { + prog_data->base.urb_read_length = DIV_ROUND_UP(nr_packed_regs, 8); + nr_attribute_regs = nr_packed_regs; + } else { + prog_data->base.urb_read_length = DIV_ROUND_UP(nr_attribute_slots, 2); + nr_attribute_regs = 4 * nr_attribute_slots; + } + + /* Since vertex shaders reuse the same VUE entry for inputs and outputs + * (overwriting the original contents), we need to make sure the size is + * the larger of the two. 
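For a rough sense of the sizing computed just below (numbers are illustrative): five input attribute slots with no VF packing occupy five vec4s of input, and if the output VUE map has nine slots the shared entry must cover nine vec4s, giving an urb_entry_size of 3. A standalone check of that arithmetic, with DEMO_* macros standing in for the real DIV_ROUND_UP/MAX2:

// Worked example of the VUE entry sizing: entry size is driven by whichever
// of the input block or the output VUE map is larger.
#include <assert.h>

#define DEMO_DIV_ROUND_UP(x, y) (((x) + (y) - 1) / (y))
#define DEMO_MAX2(a, b)         ((a) > (b) ? (a) : (b))

int
main(void)
{
   unsigned nr_attribute_slots = 5;
   unsigned nr_attribute_regs = 4 * nr_attribute_slots;             // 20 dwords
   unsigned output_vue_slots = 9;                                   // VUE map size

   unsigned input_vec4s = DEMO_DIV_ROUND_UP(nr_attribute_regs, 4);  // 5
   unsigned vue_entries = DEMO_MAX2(input_vec4s, output_vue_slots); // 9
   unsigned urb_entry_size = DEMO_DIV_ROUND_UP(vue_entries, 4);     // 3

   assert(urb_entry_size == 3);
   return 0;
}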
+ */ + const unsigned vue_entries = MAX2(DIV_ROUND_UP(nr_attribute_regs, 4), + prog_data->base.vue_map.num_slots); + prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4); + prog_data->base.dispatch_mode = INTEL_DISPATCH_MODE_SIMD8; + + if (unlikely(debug)) { + fprintf(stderr, "VS Output "); + brw_print_vue_map(stderr, &prog_data->base.vue_map, MESA_SHADER_VERTEX); + } +} + +static void +setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p) +{ + jay_fs_payload *fs = &nj->payload.fs; + + if (nj->s->dispatch_width == 32) { + nj->payload.u1 = read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s)); + } + + setup_payload_push(nj, p); + + u_foreach_bit(i, nj->s->prog_data->fs.barycentric_interp_modes) { + fs->bary[i] = read_vector_payload(p, GPR, 2); + } + + if (nj->s->prog_data->fs.uses_src_depth) { + fs->coord.z = read_payload(p, GPR); + } + + if (nj->s->prog_data->fs.uses_src_w) { + fs->coord.w = read_payload(p, GPR); + } + + unsigned nr_attribs = 16 * 4; /* TODO */ + for (unsigned i = 0; i < nr_attribs; ++i) { + jay_def comps[] = { read_payload(p, UGPR), read_payload(p, UGPR), + read_payload(p, UGPR) }; + + /* The .yz components are swizzled in the hardware compared to NIR. */ + SWAP(comps[1], comps[2]); + fs->deltas[i] = jay_collect_vectors(&nj->bld, comps, ARRAY_SIZE(comps)); + + /* Padding */ + if ((i % 5) == 4) { + read_payload(p, UGPR); + } + } + + /* XXX: I do not love this */ + if (BITSET_TEST(nj->nir->info.system_values_read, SYSTEM_VALUE_FRAG_COORD)) { + jay_def t = jay_alloc_def(&nj->bld, GPR, 1); + jay_def lo = jay_extract_range(nj->payload.u0, 10, 4); + jay_EXPAND_QUAD(&nj->bld, t, lo, payload_u1(nj, 10, 4)); + fs->coord.xy = jay_OFFSET_PACKED_PIXEL_COORDS_u32(&nj->bld, t); + } + + /* Due to complexities of the physical payload, the logical payload is split + * into even/odd halves. Fix up the offsets and insert copies. + */ + if (nj->s->dispatch_width == 32) { + jay_builder *b = &nj->bld; + jay_foreach_inst_in_block(nj->after_block, I) { + if (I->op == JAY_OPCODE_PRELOAD && I->dst.file == GPR) { + unsigned base = (jay_preload_reg(I) % 2) ? p->offsets[GPR] : 0; + jay_set_preload_reg(I, base + (jay_preload_reg(I) / 2)); + } + } + + b->cursor = jay_before_block(nj->after_block); + unsigned size = p->offsets[GPR]; + + /* Odd: copy both halves to contiguous pair after payload */ + for (unsigned i = 1; i < size; i += 2) { + jay_DESWIZZLE_16(b, size + size + i + 1, 2 + i); + jay_DESWIZZLE_16(b, size + size + i + 2, 2 + i + size); + } + + /* Even: leave the bottom half in place, copy top half. If size=1 (rare + * but possible), this would be a no-op move so skip it. 
+ */ + if (size > 1) { + for (unsigned i = 0; i < size; i += 2) { + jay_inst *I = jay_DESWIZZLE_16(b, 2 + i + 1, 2 + size + i); + + /* Stall in between to avoid a write-after-read hazard */ + if (i == 0) { + I->dep = (struct tgl_swsb) { 1, TGL_PIPE_INT }; + } + } + } + } +} + +static void +jay_setup_payload(struct nir_to_jay_state *nj) +{ + jay_shader *s = nj->s; + jay_builder *b = &nj->bld; + nj->after_block = jay_create_block(nj); + b->cursor = jay_after_block(nj->after_block); + + struct payload_builder p = { .b = &nj->bld }; + nj->payload.u0 = read_vector_payload(&p, UGPR, jay_ugpr_per_grf(s)); + nj->payload.sampler_state_pointer = jay_extract(nj->payload.u0, 3); + + switch (s->stage) { + case MESA_SHADER_VERTEX: + setup_vertex_payload(nj, &p); + break; + case MESA_SHADER_FRAGMENT: + setup_fragment_payload(nj, &p); + break; + case MESA_SHADER_COMPUTE: + case MESA_SHADER_KERNEL: + setup_compute_payload(nj, &p); + break; + default: + UNREACHABLE("unimplemented shader stages"); + } + + /* Lane ID calculations require &W and therefore are calculated in + * uniform control flow to sidestep RA problems. The easy solution is + * calculating the lane ID in the first block. + * + * XXX: This doesn't work for multi-function. Reconsider. + */ + nj->payload.lane_id = jay_LANE_ID_8_u16(b); + + for (unsigned i = 8; i < s->dispatch_width; i *= 2) { + nj->payload.lane_id = jay_LANE_ID_EXPAND_u16(b, nj->payload.lane_id, i); + } +} + +/* + * NIR sometimes contains unreachable blocks (e.g. due to infinite loops). These + * blocks have no predecessors, but do have successors and can contribute to + * phis. They are dead and violate the IR invariant: + * + * Live-in sources are live-out in all predecessors. + * + * ...which RA (validation) depends on. The simplest solution is to simply + * delete these dead blocks. Fortunately, because they are unreachable, this + * does not have any ill effects. Notably, this cannot introduce critical edges. + * + * Deleting a block may cause a successor to become unreachable, so we use a + * fixed-point algorithm to converge. + */ +static void +jay_remove_unreachable_blocks(jay_function *func) +{ + bool progress; + do { + progress = false; + + jay_foreach_block(func, pred) { + if (pred != jay_first_block(func) && + jay_num_predecessors(pred) == 0 && + jay_num_successors(pred) > 0) { + + jay_foreach_successor(pred, succ) { + util_dynarray_delete_unordered(&succ->predecessors, jay_block *, + pred); + } + + pred->successors[0] = NULL; + pred->successors[1] = NULL; + progress = true; + } + } + } while (progress); +} + +static void +jay_from_nir_function(const struct intel_device_info *devinfo, + nir_shader *nir, + jay_shader *s, + nir_function_impl *impl) +{ + jay_function *f = jay_new_function(s); + f->is_entrypoint = impl->function->is_entrypoint; + + struct nir_to_jay_state nj = { + .s = s, + .f = f, + .nir = nir, + .devinfo = devinfo, + .bld = (jay_builder) { .shader = s, .func = f }, + }; + + /* Jay indices match NIR indices. Therefore the first impl->ssa_alloc + * indices are reserved. Our own temporaries go after. 
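The indexing scheme relied on here is the one established by nj_index_ssa_defs()/index_ssa_def_cb() further down: every 32-bit word of a NIR def gets its own Jay index, and numbering starts at 1 because index 0 is the null def. A small standalone model of how many indices a def consumes (demo_* names are made up):

// One index per 32-bit word; sub-32-bit values round up to a full word.
#include <assert.h>

#define DEMO_DIV_ROUND_UP(x, y) (((x) + (y) - 1) / (y))
#define DEMO_MAX2(a, b)         ((a) > (b) ? (a) : (b))

static unsigned
demo_num_indices(unsigned num_components, unsigned bit_size)
{
   return DEMO_DIV_ROUND_UP(num_components * DEMO_MAX2(bit_size, 32), 32);
}

int
main(void)
{
   unsigned index = 1;                 // index 0 is reserved for the null def

   index += demo_num_indices(1, 16);   // 16-bit scalar: 1 index
   index += demo_num_indices(4, 32);   // 32-bit vec4:   4 indices
   index += demo_num_indices(2, 64);   // 64-bit vec2:   4 indices

   assert(index == 1 + 1 + 4 + 4);
   return 0;
}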
+ */ + f->ssa_alloc = impl->ssa_alloc; + + if (f->is_entrypoint) { + jay_setup_payload(&nj); + } + + jay_emit_cf_list(&nj, &impl->body); + jay_emit_eot(&nj); + jay_remove_unreachable_blocks(f); +} + +static void +jay_gather_stats(const jay_shader *s, struct genisa_stats *stats) +{ + jay_foreach_inst_in_shader(s, f, I) { + stats->instrs += I->op != JAY_OPCODE_SYNC; + stats->loops += I->op == JAY_OPCODE_WHILE; + stats->sends += I->op == JAY_OPCODE_SEND; + + /* XXX: Write a real cycle model */ + stats->cycles++; + + /* Calculate register usage */ + if (I->dst.file == GPR) + stats->grf_registers = + MAX2(stats->grf_registers, I->dst.reg + jay_num_values(I->dst)); + } + + stats->spills = s->spills; + stats->fills = s->fills; + stats->sends -= (s->spills + s->fills); +} + +/* + * Jay-to-NIR relies on a careful indexing of defs: every 32-bit word has + * its own index. Vectors/64-bit use contiguous indices. We therefore run a + * modified version of nir_index_ssa_defs right before translating NIR->Jay. + */ +static bool +index_ssa_def_cb(nir_def *def, void *state) +{ + unsigned *index = (unsigned *) state; + def->index = *index; + *index += DIV_ROUND_UP(def->num_components * MAX2(def->bit_size, 32), 32); + return true; +} + +static void +nj_index_ssa_defs(nir_shader *nir) +{ + nir_foreach_function_impl(impl, nir) { + /* The zero index means null in Jay, so start SSA indices at 1 */ + unsigned index = 1; + + nir_foreach_block_unstructured(block, impl) { + nir_foreach_instr(instr, block) + nir_foreach_def(instr, index_ssa_def_cb, &index); + } + + impl->ssa_alloc = index; + } +} + +static bool +lower_helper_invocation(nir_builder *b, nir_intrinsic_instr *intr, void *_) +{ + if (intr->intrinsic != nir_intrinsic_load_helper_invocation) + return false; + + /* TODO: Is this right for multisampling? */ + b->cursor = nir_before_instr(&intr->instr); + nir_def *active = + nir_inot(b, nir_inverse_ballot(b, nir_load_sample_mask_in(b))); + + nir_def_replace(&intr->def, active); + return true; +} + +static bool +lower_frag_coord(nir_builder *b, nir_intrinsic_instr *intr, void *simd_) +{ + if (intr->intrinsic != nir_intrinsic_load_frag_coord && + intr->intrinsic != nir_intrinsic_load_pixel_coord) + return false; + + b->cursor = nir_before_instr(&intr->instr); + nir_def *c = nir_unpack_32_2x16(b, nir_load_pixel_coord_intel(b)); + + if (intr->intrinsic == nir_intrinsic_load_frag_coord) { + c = nir_vec4(b, nir_u2f32(b, nir_channel(b, c, 0)), + nir_u2f32(b, nir_channel(b, c, 1)), nir_load_frag_coord_z(b), + nir_frcp(b, nir_load_frag_coord_w_rcp(b))); + } + + nir_def_replace(&intr->def, c); + return true; +} + +static bool +jay_nir_lower_simd(nir_builder *b, nir_intrinsic_instr *intr, void *simd_) +{ + b->cursor = nir_after_instr(&intr->instr); + unsigned *simd_width = simd_; + + /* mask & -mask isolates the lowest set bit in the mask. 
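A quick standalone check of that bit trick, with an illustrative mask value:

// mask & -mask keeps only the lowest set bit, i.e. the first active lane.
#include <assert.h>
#include <stdint.h>

int
main(void)
{
   uint32_t mask = 0x68;              // lanes 3, 5 and 6 active
   uint32_t first = mask & -mask;     // two's complement isolates bit 3

   assert(first == 0x8);
   assert((mask & (first - 1)) == 0); // no active lane below it
   return 0;
}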
*/ + if (intr->intrinsic == nir_intrinsic_elect) { + nir_def *mask = nir_ballot(b, 1, *simd_width, nir_imm_true(b)); + mask = nir_iand(b, mask, nir_ineg(b, mask)); + nir_def_replace(&intr->def, nir_inverse_ballot(b, mask)); + return true; + } + + /* Ballots must match the SIMD size */ + if (intr->intrinsic == nir_intrinsic_ballot || + intr->intrinsic == nir_intrinsic_ballot_relaxed) { + unsigned old_bitsize = intr->def.bit_size; + intr->def.bit_size = *simd_width; + nir_def *u2uN = nir_u2uN(b, &intr->def, old_bitsize); + nir_def_rewrite_uses_after(&intr->def, u2uN); + return true; + } + + /* Note: we don't treat read_invocation specially because there's little + * benefit but doing so would require expensive uniformizing in some cases. + */ + if (intr->intrinsic != nir_intrinsic_shuffle && + intr->intrinsic != nir_intrinsic_read_invocation) + return false; + + nir_def *data = intr->src[0].ssa; + assert(data->num_components == 1 && data->bit_size <= 32 && "scalarized"); + + nir_def *offset_B = nir_imul_imm(b, intr->src[1].ssa, 4); + nir_def_replace(&intr->def, nir_shuffle_intel(b, 1, data, offset_B)); + return true; +} + +struct frag_out_ctx { + nir_def *colour[8], *depth, *stencil, *sample_mask; +}; + +static bool +collect_fragment_output(nir_builder *b, nir_intrinsic_instr *intr, void *ctx_) +{ + struct frag_out_ctx *ctx = ctx_; + if (intr->intrinsic != nir_intrinsic_store_output) + return false; + + unsigned wrmask = nir_intrinsic_write_mask(intr); + assert(nir_intrinsic_component(intr) == 0 && "component should be lowered"); + assert(util_is_power_of_two_nonzero(wrmask + 1) && + "complex writemasks should be lowered"); + + /* TODO: Optimize with write mask? */ + + gl_frag_result loc = nir_intrinsic_io_semantics(intr).location; + assert(!nir_intrinsic_io_semantics(intr).dual_source_blend_index && "todo"); + nir_def **out; + if (loc == FRAG_RESULT_COLOR) { + out = &ctx->colour[0]; + } else if (loc >= FRAG_RESULT_DATA0 && loc <= FRAG_RESULT_DATA7) { + out = &ctx->colour[loc - FRAG_RESULT_DATA0]; + } else if (loc == FRAG_RESULT_DEPTH) { + out = &ctx->depth; + } else if (loc == FRAG_RESULT_STENCIL) { + UNREACHABLE("todo"); + out = &ctx->stencil; + } else if (loc == FRAG_RESULT_SAMPLE_MASK) { + UNREACHABLE("todo"); + out = &ctx->sample_mask; + } else { + UNREACHABLE("invalid location"); + } + + assert((*out) == NULL && "each location written exactly once"); + *out = intr->src[0].ssa; + + nir_instr_remove(&intr->instr); + return true; +} + +static void +append_payload(nir_builder *b, + nir_def **payload, + unsigned *len, + unsigned max_len, + nir_def *value) +{ + if (value != NULL) { + for (unsigned i = 0; i < value->num_components; ++i) { + payload[*len] = nir_channel(b, value, i); + (*len)++; + assert((*len) <= max_len); + } + } +} + +static void +insert_rt_store(nir_builder *b, + const struct intel_device_info *devinfo, + signed target, + bool last, + nir_def *colour, + nir_def *src0_alpha, + nir_def *depth, + nir_def *stencil, + nir_def *sample_mask, + unsigned dispatch_width) +{ + bool null_rt = target < 0; + target = MAX2(target, 0); + + if (!colour) { + colour = nir_undef(b, 4, 32); + } + + colour = nir_pad_vec4(b, colour); + + if (null_rt) { + /* Even if we don't write a RT, we still need to write alpha for + * alpha-to-coverage and alpha testing. Optimize the other channels out. + */ + colour = nir_vector_insert_imm(b, nir_undef(b, 4, 32), + nir_channel(b, colour, 3), 3); + } + + /* TODO: Not sure I like this. We'll see what 2src looks like. */ + unsigned op = dispatch_width == 32 ? 
+ XE2_DATAPORT_RENDER_TARGET_WRITE_SIMD32_SINGLE_SOURCE : + BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE; + uint64_t desc = + brw_fb_write_desc(devinfo, target, op, last, false /* coarse write */); + + uint64_t ex_desc = 0; + if (devinfo->ver >= 20) { + ex_desc = target << 21 | + null_rt << 20 | + (src0_alpha ? (1 << 15) : 0) | + (stencil ? (1 << 14) : 0) | + (depth ? (1 << 13) : 0) | + (sample_mask ? (1 << 12) : 0); + } else if (devinfo->ver >= 11) { + /* Set the "Render Target Index" and "Src0 Alpha Present" fields + * in the extended message descriptor, in lieu of using a header. + */ + ex_desc = target << 12 | null_rt << 20 | (src0_alpha ? (1 << 15) : 0); + } + + /* Build the payload */ + nir_def *payload[8] = { NULL }; + unsigned len = 0; + append_payload(b, payload, &len, ARRAY_SIZE(payload), colour); + append_payload(b, payload, &len, ARRAY_SIZE(payload), depth); + /* TODO */ + + nir_def *disable = b->shader->info.fs.uses_discard ? + nir_is_helper_invocation(b, 1) : + nir_imm_false(b); + + nir_store_render_target_intel(b, nir_vec(b, payload, len), + nir_imm_ivec2(b, desc, ex_desc), disable, + .eot = last); +} + +static void +lower_fragment_outputs(nir_function_impl *impl, + const struct intel_device_info *devinfo, + unsigned nr_color_regions, + unsigned dispatch_width) +{ + struct frag_out_ctx ctx = { { NULL } }; + nir_function_intrinsics_pass(impl, collect_fragment_output, + nir_metadata_control_flow, &ctx); + nir_builder b_ = nir_builder_at(nir_after_impl(impl)); + nir_builder *b = &b_; + assert(nr_color_regions <= ARRAY_SIZE(ctx.colour)); + + signed first = -1; + for (unsigned i = 0; i < ARRAY_SIZE(ctx.colour); ++i) { + if (ctx.colour[i]) { + first = i; + break; + } + } + + /* Do the later render targets first */ + for (unsigned i = first + 1; i < nr_color_regions; ++i) { + if (ctx.colour[i]) { + insert_rt_store(b, devinfo, i, false, ctx.colour[i], NULL, NULL, NULL, + NULL, dispatch_width); + } + } + + /* Finally do render target zero attaching all the sideband things and + * setting the LastRT bit. This needs to exist even if nothing is written + * since it also signals end-of-thread. + */ + insert_rt_store(b, devinfo, first < nr_color_regions ? first : -1, true, + first >= 0 ? ctx.colour[first] : NULL, NULL, ctx.depth, + ctx.stencil, ctx.sample_mask, dispatch_width); +} + +struct jay_shader_bin * +jay_compile(const struct intel_device_info *devinfo, + void *mem_ctx, + nir_shader *nir, + union brw_any_prog_data *prog_data, + union brw_any_prog_key *key) +{ + jay_debug = debug_get_option_jay_debug(); + enum mesa_shader_stage stage = nir->info.stage; + bool debug = INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage)); + struct brw_compiler compiler = { .devinfo = devinfo }; + unsigned nr_packed_regs = 0; + + brw_pass_tracker pt_ = { + .nir = nir, + .key = &key->base, + .dispatch_width = 0, + .compiler = &compiler, + .archiver = NULL, //params->base.archiver, + }, *pt = &pt_; + + BRW_NIR_SNAPSHOT("first"); + + prog_data->base.ray_queries = nir->info.ray_queries; + prog_data->base.stage = stage; + // TODO: Make the driver do this? + // prog_data->base.source_hash = params->source_hash; + prog_data->base.total_shared = nir->info.shared_size; + + /* TODO: Real heuristic */ + bool do_simd32 = INTEL_SIMD(FS, 32); + do_simd32 &= stage == MESA_SHADER_COMPUTE || stage == MESA_SHADER_FRAGMENT; + unsigned simd_width = do_simd32 ? 
(nir->info.api_subgroup_size ?: 32) : 16; + + if (stage == MESA_SHADER_VERTEX) { + /* We only expect slot compaction to be disabled when using device + * generated commands, to provide an independent 3DSTATE_VERTEX_ELEMENTS + * programming. This should always be enabled together with VF component + * packing to minimize the size of the payload. + */ + assert(!key->vs.no_vf_slot_compaction || key->vs.vf_component_packing); + + /* When using Primitive Replication for multiview, each view gets its own + * position slot. + */ + const uint32_t pos_slots = + (nir->info.per_view_outputs & VARYING_BIT_POS) ? + MAX2(1, util_bitcount(key->base.view_mask)) : + 1; + + /* Only position is allowed to be per-view */ + assert(!(nir->info.per_view_outputs & ~VARYING_BIT_POS)); + + brw_compute_vue_map(devinfo, &prog_data->vue.vue_map, + nir->info.outputs_written, key->base.vue_layout, + pos_slots); + + brw_nir_apply_key(pt, &key->base, simd_width); + + prog_data->vs.inputs_read = nir->info.inputs_read; + prog_data->vs.double_inputs_read = nir->info.vs.double_inputs; + prog_data->vs.no_vf_slot_compaction = key->vs.no_vf_slot_compaction; + + brw_nir_lower_vs_inputs(nir); + brw_nir_lower_vue_outputs(nir); + BRW_NIR_SNAPSHOT("after_lower_io"); + + memset(prog_data->vs.vf_component_packing, 0, + sizeof(prog_data->vs.vf_component_packing)); + if (key->vs.vf_component_packing) { + nr_packed_regs = brw_nir_pack_vs_input(nir, &prog_data->vs); + } + + /* Get constant offsets out of the way for proper clip/cull handling */ + BRW_NIR_PASS(nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); + BRW_NIR_PASS(nir_opt_constant_folding); + BRW_NIR_PASS(brw_nir_lower_deferred_urb_writes, devinfo, + &prog_data->vue.vue_map, 0, 0); + } else if (stage == MESA_SHADER_FRAGMENT) { + assert(key->fs.mesh_input == INTEL_NEVER && "todo"); + assert(!key->fs.force_dual_color_blend && "todo"); + brw_nir_apply_key(pt, &key->base, 32); + brw_nir_lower_fs_inputs(nir, devinfo, &key->fs); + brw_nir_lower_fs_outputs(nir); + NIR_PASS(_, nir, nir_lower_io_to_scalar, nir_var_shader_in, NULL, NULL); + + if (!brw_can_coherent_fb_fetch(devinfo)) + NIR_PASS(_, nir, brw_nir_lower_fs_load_output, &key->fs); + + NIR_PASS(_, nir, nir_opt_frag_coord_to_pixel_coord); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_frag_coord, + nir_metadata_control_flow, NULL); + NIR_PASS(_, nir, nir_opt_barycentric, true); + + lower_fragment_outputs(nir_shader_get_entrypoint(nir), devinfo, + key->fs.nr_color_regions, simd_width); + NIR_PASS(_, nir, nir_lower_helper_writes, true); + NIR_PASS(_, nir, nir_lower_is_helper_invocation); + NIR_PASS(_, nir, nir_shader_intrinsics_pass, lower_helper_invocation, + nir_metadata_control_flow, NULL); + + if (key->fs.alpha_to_coverage != INTEL_NEVER) { + /* Run constant fold optimization in order to get the correct source + * offset to determine render target 0 store instruction in + * emit_alpha_to_coverage pass. 
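A tiny worked example of the per-view position-slot count computed above (the view mask is illustrative):

// With primitive replication, each enabled view gets its own position slot.
#include <assert.h>

int
main(void)
{
   unsigned view_mask = 0xb;                      // views 0, 1 and 3 enabled
   unsigned pos_slots = __builtin_popcount(view_mask);
   if (pos_slots < 1)
      pos_slots = 1;                              // MAX2(1, ...) in the driver

   assert(pos_slots == 3);
   return 0;
}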
+ */ + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, brw_nir_lower_alpha_to_coverage); + } + + // TODO + // NIR_PASS(_, nir, brw_nir_move_interpolation_to_top); + + if (!brw_fs_prog_key_is_dynamic(&key->fs)) { + uint32_t f = 0; + + if (key->fs.multisample_fbo == INTEL_ALWAYS) + f |= INTEL_FS_CONFIG_MULTISAMPLE_FBO; + + if (key->fs.alpha_to_coverage == INTEL_ALWAYS) + f |= INTEL_FS_CONFIG_ALPHA_TO_COVERAGE; + + if (key->fs.provoking_vertex_last == INTEL_ALWAYS) + f |= INTEL_FS_CONFIG_PROVOKING_VERTEX_LAST; + + if (key->fs.persample_interp == INTEL_ALWAYS) { + f |= INTEL_FS_CONFIG_PERSAMPLE_DISPATCH | + INTEL_FS_CONFIG_PERSAMPLE_INTERP; + } + + NIR_PASS(_, nir, nir_inline_sysval, nir_intrinsic_load_fs_config_intel, + f); + } + } else { + brw_nir_apply_key(pt, &key->base, simd_width); + } + + brw_postprocess_nir_opts(pt); + + NIR_PASS(_, nir, nir_shader_intrinsics_pass, jay_nir_lower_simd, + nir_metadata_control_flow, &simd_width); + NIR_PASS(_, nir, nir_opt_algebraic_late); + NIR_PASS(_, nir, intel_nir_opt_peephole_imul32x16); + + /* Late postprocess while remaining in SSA */ + /* Run fsign lowering again after the last time brw_nir_optimize is called. + * As is the case with conversion lowering (below), brw_nir_optimize can + * create additional fsign instructions. + */ + NIR_PASS(_, nir, jay_nir_lower_fsign); + NIR_PASS(_, nir, jay_nir_lower_bool); + NIR_PASS(_, nir, nir_opt_cse); + NIR_PASS(_, nir, nir_opt_dce); + NIR_PASS(_, nir, jay_nir_opt_sel_zero); + + /* Run nir_split_conversions only after the last tiem + * brw_nir_optimize is called. Various optimizations invoked there can + * rematerialize the conversions that the lowering pass eliminates. + */ + const nir_split_conversions_options split_conv_opts = { + .callback = intel_nir_split_conversions_cb, + }; + NIR_PASS(_, nir, nir_split_conversions, &split_conv_opts); + + /* Do this only after the last opt_gcm. GCM will undo this lowering. */ + if (stage == MESA_SHADER_FRAGMENT) { + NIR_PASS(_, nir, intel_nir_lower_non_uniform_barycentric_at_sample); + } + + NIR_PASS(_, nir, nir_opt_constant_folding); + NIR_PASS(_, nir, nir_lower_load_const_to_scalar); + NIR_PASS(_, nir, nir_lower_all_phis_to_scalar); + NIR_PASS(_, nir, nir_opt_copy_prop); + NIR_PASS(_, nir, nir_opt_dce); + + /* Run divergence analysis at the end */ + nir_sweep(nir); + nj_index_ssa_defs(nir); + nir_divergence_analysis(nir); + + if (debug) { + /* We can't use nir_print_shader since it reindexes SSA defs. */ + fprintf(stdout, "NIR right before from_nir:\n\n"); + nir_print_shader_annotated(nir, stdout, NULL); + fflush(stdout); + } + + if (stage == MESA_SHADER_VERTEX) { + populate_vs_prog_data(nir, devinfo, &key->vs, &prog_data->vs, + nr_packed_regs, debug); + } else if (stage == MESA_SHADER_FRAGMENT) { + int per_primitive_offsets[VARYING_SLOT_MAX]; + memset(per_primitive_offsets, -1, sizeof(per_primitive_offsets)); + + populate_fs_prog_data(nir, devinfo, &key->fs, &prog_data->fs, + NULL /* TODO: mue_map */, per_primitive_offsets); + } + + jay_shader *s = jay_new_shader(NULL, stage); + s->dispatch_width = simd_width; + s->scratch_size = align(nir->scratch_size, 4) * s->dispatch_width; + s->devinfo = devinfo; + s->prog_data = prog_data; + + nir_foreach_function_impl(impl, nir) { + jay_from_nir_function(devinfo, nir, s, impl); + } + + /* Re-number block indices to be sequential and match the NIR. This ensures + * block indices are ordered with respect to the control flow graph which is + * a convenient IR invariant. 
+ */ + jay_foreach_function(s, f) { + unsigned index = 0; + + jay_foreach_block(f, b) { + b->index = index++; + } + } + + jay_validate(s, "NIR->Jay translation"); + + if (!(jay_debug & JAY_DBG_NOOPT)) { + JAY_PASS(s, jay_opt_propagate_forwards); + JAY_PASS(s, jay_opt_propagate_backwards); + JAY_PASS(s, jay_opt_dead_code); + } + + if (debug) { + fprintf(stdout, "Jay shader:\n\n"); + jay_print(stdout, s); + } + + JAY_PASS(s, jay_assign_flags); + if (!(jay_debug & JAY_DBG_NOOPT)) { + JAY_PASS(s, jay_opt_dead_code); + } + + JAY_PASS(s, jay_lower_pre_ra); + JAY_PASS(s, jay_partition_grf); + JAY_PASS(s, jay_register_allocate); + JAY_PASS(s, jay_lower_post_ra); + JAY_PASS(s, jay_insert_fp_mode, nir->info.float_controls_execution_mode, + nir->info.bit_sizes_float); + + if (!(jay_debug & JAY_DBG_NOOPT)) { + JAY_PASS(s, jay_opt_control_flow); + } + + JAY_PASS(s, jay_lower_scoreboard); + + if (debug) { + fprintf(stdout, "Jay shader (post-RA):\n\n"); + jay_print(stdout, s); + } + + struct jay_shader_bin *bin = + jay_to_binary(s, nir->constant_data, nir->constant_data_size); + assert(bin->kernel); + ralloc_steal(mem_ctx, bin); + + jay_gather_stats(s, &bin->stats); + bin->stats.code_size = bin->size; + + if (INTEL_DEBUG(intel_debug_flag_for_shader_stage(stage))) { + if (nir->info.label) { + printf("%s - ", nir->info.label); + } + + const char *shader_name = + ralloc_asprintf(s, "%s SIMD%u", _mesa_shader_stage_to_abbrev(stage), + s->dispatch_width); + genisa_stats_fprintf(stdout, shader_name, &bin->stats); + } + + bin->stats.workgroup_memory_size = nir->info.shared_size; + bin->stats.dispatch_width = simd_width; + + if (stage == MESA_SHADER_FRAGMENT) { + if (simd_width == 8) { + prog_data->fs.dispatch_8 = true; + } else if (simd_width == 16) { + prog_data->fs.dispatch_16 = true; + prog_data->fs.prog_offset_16 = 0; + } else if (simd_width == 32) { + prog_data->fs.dispatch_32 = true; + prog_data->fs.prog_offset_32 = 0; + } + + prog_data->fs.has_side_effects = nir->info.writes_memory; + } else if (mesa_shader_stage_is_compute(stage)) { + unsigned i = simd_width == 8 ? 0 : simd_width == 16 ? 1 : 2; + prog_data->cs.prog_offset[i] = 0; + prog_data->cs.prog_mask = BITFIELD_BIT(i); + prog_data->cs.uses_inline_push_addr = key->base.uses_inline_push_addr; + prog_data->cs.uses_inline_data |= key->base.uses_inline_push_addr; + prog_data->cs.prog_spilled = s->scratch_size > 0; /* XXX */ + } + + prog_data->base.program_size = bin->size; + + if (s->scratch_size > 0) { + /* We currently only support up to 2MB of scratch space. If we + * need to support more eventually, the documentation suggests + * that we could allocate a larger buffer, and partition it out + * ourselves. We'd just have to undo the hardware's address + * calculation by subtracting (FFTID * Per Thread Scratch Space) + * and then add FFTID * (Larger Per Thread Scratch Space). + * + * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline > + * Thread Group Tracking > Local Memory/Scratch Space. + */ + assert(s->scratch_size <= devinfo->max_scratch_size_per_thread && + "maximum scratch size"); + + /* Take the max of any previously compiled variant of the shader. In the + * case of bindless shaders with return parts, this will also take the + * max of all parts. 
+ */ + prog_data->base.total_scratch = + MAX2(prog_data->base.total_scratch, + util_next_power_of_two(s->scratch_size)); + } + + if (stage == MESA_SHADER_VERTEX || + stage == MESA_SHADER_TESS_EVAL || + stage == MESA_SHADER_GEOMETRY || + stage == MESA_SHADER_MESH) { + + uint32_t clip_mask = BITFIELD_MASK(nir->info.clip_distance_array_size); + uint32_t cull_mask = BITFIELD_RANGE(nir->info.clip_distance_array_size, + nir->info.cull_distance_array_size); + + if (stage == MESA_SHADER_MESH) { + prog_data->mesh.clip_distance_mask = clip_mask; + prog_data->mesh.cull_distance_mask = cull_mask; + } else { + prog_data->vue.clip_distance_mask = clip_mask; + prog_data->vue.cull_distance_mask = cull_mask; + } + } + + /* Scratch is allocated in 1KiB increments. */ + prog_data->base.total_scratch = align(prog_data->base.total_scratch, 1024); + + ralloc_free(s); + return bin; +} diff --git a/src/intel/compiler/jay/jay_ir.h b/src/intel/compiler/jay/jay_ir.h new file mode 100644 index 00000000000..37d0b722319 --- /dev/null +++ b/src/intel/compiler/jay/jay_ir.h @@ -0,0 +1,1408 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "compiler/brw/brw_compiler.h" +#include "compiler/brw/brw_eu.h" +#include "compiler/brw/brw_eu_defines.h" +#include "compiler/shader_enums.h" +#include "util/bitset.h" +#include "util/list.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "util/sparse_bitset.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "jay_opcodes.h" + +/* TODO: switch to brw_conditional_mod */ +enum PACKED jay_conditional_mod { + JAY_CONDITIONAL_EQ = 1, /**< Equal to zero */ + JAY_CONDITIONAL_NE = 2, /**< Not equal to zero */ + JAY_CONDITIONAL_GT = 3, /**< Greater than zero */ + JAY_CONDITIONAL_LT = 5, /**< Less than zero */ + JAY_CONDITIONAL_GE = 4, /**< Greater than or equal to zero */ + JAY_CONDITIONAL_LE = 6, /**< Less than or equal to zero */ + JAY_CONDITIONAL_OV = 8, /**< Overflow has occurred */ + JAY_CONDITIONAL_NAN = 9, /**< Result is NaN */ +}; + +static inline enum jay_conditional_mod +jay_conditional_mod_swap_sources(enum jay_conditional_mod mod) +{ + /* clang-format off */ + switch (mod) { + case JAY_CONDITIONAL_GT: return JAY_CONDITIONAL_LT; + case JAY_CONDITIONAL_LT: return JAY_CONDITIONAL_GT; + case JAY_CONDITIONAL_GE: return JAY_CONDITIONAL_LE; + case JAY_CONDITIONAL_LE: return JAY_CONDITIONAL_GE; + default: return mod; + } + /* clang-format on */ +} + +enum PACKED jay_arf { + JAY_ARF_NULL = 0, + JAY_ARF_MASK = BRW_ARF_MASK, + JAY_ARF_CONTROL = BRW_ARF_CONTROL, + JAY_ARF_TIMESTAMP = BRW_ARF_TIMESTAMP, +}; + +enum PACKED jay_file { + /** Non-uniform general purpose registers: 32-bits per SIMT lane. */ + GPR, + + /** Uniform general purpose registers: 32-bit uniform values */ + UGPR, + + /** Memory registers representing spilled values: 32-bits per SIMT lane. 
*/ + MEM, + + /** Memory registers representing spilled values: 32-bits uniform values */ + UMEM, + + /** Non-uniform flags (predicates): 1-bit per SIMT lane */ + FLAG, + + /** Uniform flags (predicates): 1-bit uniform value */ + UFLAG, + + /** Address registers */ + J_ADDRESS, + + /* Non-SSA files below: */ + + /** Accumulators: 32-bits per SIMT lane */ + ACCUM, + + /** Uniform accumulators: 32-bit uniform value */ + UACCUM, + + /** Architecture registers: direct access scalar */ + J_ARF, + + /** Inputs within Jay unit tests */ + TEST_FILE, + + /* Immediate value */ + J_IMM, + + JAY_FILE_LAST = J_IMM, + JAY_NUM_SSA_FILES = J_ADDRESS + 1, + + /* Set of files that the main RA (and not eg flag RA) allocates. */ + JAY_NUM_RA_FILES = UMEM + 1, + JAY_NUM_GRF_FILES = UGPR + 1, +}; +static_assert(JAY_FILE_LAST <= 0b1111, "must fit in 4 bits (see jay_def)"); + +#define jay_foreach_ssa_file(file) \ + for (enum jay_file file = 0; file < JAY_NUM_SSA_FILES; ++file) + +/* Value stuffed into the index field of instructions post-RA that are not + * null (0) but do not have an associated SSA index (as they are post-RA). + */ +#define JAY_SENTINEL (0xffffffffu) + +/* Maximum number of words in an jay_def */ +#define JAY_MAX_DEF_LENGTH (128) + +/* Maximum number of sources/destinations other than for phis */ +#define JAY_MAX_SRCS (16) +#define JAY_MAX_DESTS (2) +#define JAY_MAX_OPERANDS (JAY_MAX_SRCS + JAY_MAX_DESTS) +#define JAY_MAX_FLAGS (8) +#define JAY_MAX_SAMPLER_MESSAGE_SIZE (11) +#define JAY_NUM_LAST_USE_BITS (32) +#define JAY_NUM_PHYS_GRF (128) +#define JAY_NUM_UGPR (1024) +#define JAY_REG_BITS (17) + +/* + * An jay_def represents a contiguous array of registers or a 32-bit immediate. + * It is used for sources or (in restricted form) for destinations. + */ +typedef struct jay_def { + /* Mode-dependent payload. + * + * File = IMMEDIATE: Immediate. + * Collect = false: Base SSA index. + * Collect = true: Pointer to SSA indices. + * + * SSA indices must be unique even across register files, so that we can + * easily track them all in e.g. a bitfield without needing to have + * separate data structures for each file. + * + * Each index represents a single 32-bit (or 1-bit if a predicate) value in + * the specified register file. 64-bit or vec4 values use multiple indices. + * + * Index 0 is reserved as the null value. + */ + uint32_t _payload; + + /* After register allocation, the register assigned to this def. + * + * Also used for additional pointer bits for collect pre-RA, which is why + * this is as large as it is. Could be shrunk with more pointer compression. + */ + unsigned reg:JAY_REG_BITS; + + /* (Post-RA) only, access only the top half of the indexed 32-bit register */ + bool hi:1; + + /** The associated file (must be < JAY_NUM_SSA_FILES for SSA) */ + enum jay_file file:4; + + /* Represents either a negation or a bitwise inversion (depending on the + * instruction type.) + */ + bool negate:1; + + /* Represents absolute value (on floating point sources) */ + bool abs:1; + + /* Number of values minus 1 */ + unsigned num_values_m1:7; + + /* If true, collects many discontiguous SSA indices into a single def. + * Requires file = GPR or file = UGPR. Cannot be used post-RA. + * + * Canonical form is required: the indices pointed to by the payload must NOT + * be contiguous. Also, the payload is not owned by the def: the def may be + * cheaply copied around, but mutating the payload requires copy-on-write and + * maintaining the canonical form. 
+ */ + bool collect:1; +} jay_def; +static_assert(sizeof(jay_def) == 8, "packed"); + +/* + * Construct an jay_def representing a bare register with no associated SSA + * index, for use post-RA only. + */ +static inline jay_def +jay_bare_reg(enum jay_file file, uint16_t reg) +{ + return (jay_def) { ._payload = JAY_SENTINEL, .reg = reg, .file = file }; +} + +/* + * Set the register for a def (called by RA only). This drops the collect + * indices since we do not have space to encode both simultaneously. + */ +static inline void +jay_set_reg(jay_def *d, unsigned r) +{ + if (d->collect) { + d->collect = false; + d->_payload = JAY_SENTINEL; + } + + d->reg = r; +} + +static inline uint32_t +jay_base_index(jay_def d) +{ + assert(d.file != J_IMM && !d.collect); + return d._payload; +} + +/** + * True if the value is null. + */ +static inline bool +jay_is_null(jay_def d) +{ + return d._payload == 0 && d.file != J_IMM; +} + +static inline bool +jay_is_imm(jay_def d) +{ + return d.file == J_IMM; +} + +/** + * True if the def is a 1-bit flag regardless of whether it is uniform. + */ +static inline bool +jay_is_flag(jay_def d) +{ + return d.file == FLAG || d.file == UFLAG; +} + +/** + * Return the number of SSA indices referenced by an jay_def. + */ +static inline unsigned +jay_num_values(jay_def d) +{ + return jay_is_imm(d) || jay_is_null(d) ? 0 : (d.num_values_m1 + 1); +} + +/** + * True if the def is an SSA def (and not, say, an arch register). + */ +static inline bool +jay_is_ssa(jay_def d) +{ + return d.file < JAY_NUM_SSA_FILES; +} + +#define jay_foreach_comp(def, c) \ + for (unsigned c = 0; c < jay_num_values(def); ++c) + +#define jay_foreach_comp_rev(def, c) \ + for (signed c = jay_num_values(def) - 1; c >= 0; --c) + +/* + * Alias for jay_base_index for use with scalar defs. + */ +static inline uint32_t +jay_index(jay_def d) +{ + assert(jay_num_values(d) == 1); + return jay_base_index(d); +} + +/** + * Return a reference to the array of indices of a collect source. + */ +static inline uint32_t * +_jay_collect_indices(jay_def d) +{ + assert(d.collect); + + /* reg has upper bits of the pointer */ + uint64_t payload = (((uint64_t) d.reg) << 32) | d._payload; + return (uint32_t *) (uintptr_t) payload; +} + +/** + * Return the n'th channel of an SSA def. + * + * Note: this is specifically read-only. To mutate, use jay_set_channel. + */ +static inline uint32_t +jay_channel(jay_def d, unsigned c) +{ + assert(d.file != J_IMM); + assert(c <= d.num_values_m1); + + if (likely(!d.collect)) { + return jay_base_index(d) + c; + } else { + return _jay_collect_indices(d)[c]; + } +} + +/** + * Build a contiguous jay_def. + */ +static inline jay_def +jay_contiguous_def(enum jay_file file, uint32_t index, unsigned count) +{ + assert(count > 0 && count <= (1 << 7) && "max def width"); + + return (jay_def) { + ._payload = index, + .file = file, + .num_values_m1 = count - 1, + }; +} + +/* + * Replaces a source, preserving the negate/abs if present. + */ +static inline void +jay_replace_src(jay_def *old, jay_def replacement) +{ + replacement.negate = old->negate; + replacement.abs = old->abs; + *old = replacement; +} + +static inline jay_def +jay_scalar(enum jay_file file, uint32_t index) +{ + return jay_contiguous_def(file, index, 1); +} + +static inline jay_def +jay_null() +{ + return jay_scalar(J_ARF, 0); +} + +/** + * Return a contiguous subrange inside an SSA def. 
+ */ +static inline jay_def +jay_extract_range(jay_def def, unsigned chan, unsigned count) +{ + assert(!jay_is_imm(def)); + assert((count == 1 || !def.collect) && "slicing collects unsupported"); + assert(chan + count <= jay_num_values(def)); + + uint32_t base = jay_channel(def, chan); + jay_replace_src(&def, jay_contiguous_def(def.file, base, count)); + return def; +} + +/** + * Return a scalar SSA def equal to a single channel from an SSA def. + */ +static inline jay_def +jay_extract(jay_def def, unsigned chan) +{ + return jay_extract_range(def, chan, 1); +} + +/** + * Like jay_extract but working on bare registers. This could be unified to + * preserve indices and such but meh. + */ +static inline jay_def +jay_extract_post_ra(jay_def def, unsigned chan) +{ + return jay_bare_reg(def.file, def.reg + chan); +} + +/** + * Construct an immediate source from a raw 32-bit data pattern. + */ +static inline jay_def +jay_imm(uint32_t imm) +{ + return (jay_def) { ._payload = imm, .file = J_IMM }; +} + +/** + * True if both jay_defs are equivalent up to source modifiers. + */ +static inline bool +jay_defs_equivalent(jay_def a, jay_def b) +{ + if (a.file != b.file || + a.num_values_m1 != b.num_values_m1 || + a.collect != b.collect) + return false; + + if (likely(!a.collect)) { + /* Contiguous or immediate */ + return a._payload == b._payload && a.reg == b.reg; + } else { + /* Collect. Component-wise compare. */ + return !memcmp(_jay_collect_indices(a), _jay_collect_indices(b), + sizeof(uint32_t) * jay_num_values(a)); + } +} + +/** + * True if both registers are equal (for use post-RA). + */ +static inline bool +jay_regs_equal(jay_def a, jay_def b) +{ + return a.file == b.file && + a.num_values_m1 == b.num_values_m1 && + a.reg == b.reg; +} + +/** + * Return a reference to the execution mask (mask0) architecture register. + */ +static inline jay_def +jay_exec_mask(void) +{ + return jay_scalar(J_ARF, JAY_ARF_MASK); +} + +/** + * Return a reference to the control (cr0) architecture register. + */ +static inline jay_def +jay_control(void) +{ + return jay_scalar(J_ARF, JAY_ARF_CONTROL); +} + +/** + * Construct an immediate from a floating point constant. + */ +static inline jay_def +jay_imm_f(float imm) +{ + return jay_imm(fui(imm)); +} + +/** + * Return the negation of a source. + */ +static inline jay_def +jay_negate(jay_def src) +{ + src.negate = !src.negate; + return src; +} + +/** + * Return the absolute value of a source. + */ +static inline jay_def +jay_abs(jay_def src) +{ + src.negate = false; + src.abs = true; + return src; +} + +/** + * Returns true if the given source reads the same value in all lanes. + */ +static inline bool +jay_is_uniform(jay_def d) +{ + return d.file == UGPR || + d.file == UFLAG || + d.file == UACCUM || + jay_is_imm(d); +} + +/** + * Returns true if the given definition represents a spilled variable. 
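A rough usage sketch of the def helpers defined above, in the spirit of the unit tests under test/ (the indices and values are illustrative):

// Contiguous vec4 GPR def, channel extraction, source modifiers, immediates.
#include <assert.h>
#include "jay_ir.h"

static void
demo_def_helpers(void)
{
   // Four contiguous 32-bit values starting at SSA index 10: indices 10..13.
   jay_def vec = jay_contiguous_def(GPR, 10, 4);
   assert(jay_num_values(vec) == 4);
   assert(jay_channel(vec, 2) == 12);

   // A scalar view of channel 1 (index 11), then negated as a source.
   jay_def chan = jay_extract(vec, 1);
   assert(jay_index(chan) == 11);
   jay_def neg = jay_negate(chan);
   assert(neg.negate);

   // Immediates live in their own file and carry the raw bit pattern.
   jay_def half = jay_imm_f(0.5f);
   assert(jay_is_imm(half) && !jay_is_ssa(half));
}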
+ */ +static inline bool +jay_is_mem(jay_def x) +{ + return x.file == MEM || x.file == UMEM; +} + +static inline uint32_t +jay_as_uint(jay_def src) +{ + assert(jay_is_imm(src)); + return src._payload; +} + +static inline bool +jay_is_zero(jay_def src) +{ + return jay_is_imm(src) && jay_as_uint(src) == 0; +} + +/* Chosen so that sized type is the unsized type OR the number bits */ +#define JAY_TYPE_BASE_MASK (128 | 2 | 4) + +enum PACKED jay_type { + JAY_TYPE_UNTYPED = 0, + JAY_TYPE_U = 2, + JAY_TYPE_S = 4, + JAY_TYPE_F = 6, + JAY_TYPE_BF = 128, + + /** Unsigned integers */ + JAY_TYPE_U64 = JAY_TYPE_U | 64, + JAY_TYPE_U32 = JAY_TYPE_U | 32, + JAY_TYPE_U16 = JAY_TYPE_U | 16, + JAY_TYPE_U8 = JAY_TYPE_U | 8, + JAY_TYPE_U1 = JAY_TYPE_U | 1, + + /** Signed integers */ + JAY_TYPE_S64 = JAY_TYPE_S | 64, + JAY_TYPE_S32 = JAY_TYPE_S | 32, + JAY_TYPE_S16 = JAY_TYPE_S | 16, + JAY_TYPE_S8 = JAY_TYPE_S | 8, + JAY_TYPE_S1 = JAY_TYPE_S | 1, + + /** IEEE floating point */ + JAY_TYPE_F64 = JAY_TYPE_F | 64, + JAY_TYPE_F32 = JAY_TYPE_F | 32, + JAY_TYPE_F16 = JAY_TYPE_F | 16, + + /** Other floating point variants */ + JAY_TYPE_BF16 = JAY_TYPE_BF | 16, +}; +static_assert(sizeof(enum jay_type) == 1); + +static inline enum jay_type +jay_type(enum jay_type base, unsigned bits) +{ + /* Normalize booleans */ + if (bits == 1) { + base = JAY_TYPE_U; + } + + return (enum jay_type)(base | bits); +} + +static inline enum jay_type +jay_base_type(enum jay_type t) +{ + return (enum jay_type)(t & JAY_TYPE_BASE_MASK); +} + +static inline unsigned +jay_type_size_bits(enum jay_type t) +{ + return t & ~JAY_TYPE_BASE_MASK; +} + +static inline enum jay_type +jay_type_rebase(enum jay_type t, enum jay_type new_base) +{ + return jay_type(new_base, jay_type_size_bits(t)); +} + +static inline enum jay_type +jay_type_resize(enum jay_type t, unsigned bits) +{ + return jay_type(jay_base_type(t), bits); +} + +/** + * Returns the number of 32-bit values needed to hold a type t. + */ +static inline unsigned +jay_type_vector_length(enum jay_type t) +{ + return jay_type_size_bits(t) == 64 ? 2 : 1; +} + +static inline bool +jay_type_is_any_float(enum jay_type t) +{ + return jay_base_type(t) == JAY_TYPE_F || jay_base_type(t) == JAY_TYPE_BF; +} + +enum jay_predication : uint8_t { + /** No predication. */ + JAY_NOT_PREDICATED = 0, + + /** + * Predicated with no default value. Used post-RA and for instructions that + * do not write a destination. + */ + JAY_PREDICATED = 1, + + /** Predicated with 1 default value. Used pre-RA. */ + JAY_PREDICATED_DEFAULT = 2, +}; + +/** + * Representation of a shader instruction in the Jay IR. + */ +typedef struct jay_inst { + struct list_head link; + + /** + * Metadata calculated by liveness analysis: bit i is set if the i'th + * non-null SSA index read by the instruction is killed by that read. + */ + BITSET_DECLARE(last_use, JAY_NUM_LAST_USE_BITS); + + enum jay_opcode op; + enum jay_type type; /**< execution type of the instruction */ + + /** Software scoreboarding dependencies (for non-SYNC instructions) */ + struct tgl_swsb dep; + + /** Number of sources */ + uint8_t num_srcs; + + /** + * Indicates an instruction reading only uniform sources but writing a FLAG + * and no GPR/UGPR that expects the flag to replicate for all SIMD lanes. + * This is okay in our data model but cannot be inferred from the files, so + * we have a secondary bit to express this. 
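A few compile-time checks, as they might appear in a unit test, make the jay_type encoding earlier in this header concrete: a sized type is its base tag OR'd with the bit count, so base and size split back out with the mask.

// Sanity checks on the type encoding (assumes jay_ir.h is included).
#include "jay_ir.h"

static_assert(JAY_TYPE_U32 == (JAY_TYPE_U | 32), "sized = base | bits");
static_assert((JAY_TYPE_U32 & JAY_TYPE_BASE_MASK) == JAY_TYPE_U, "base");
static_assert((JAY_TYPE_U32 & ~JAY_TYPE_BASE_MASK) == 32, "size");
static_assert((JAY_TYPE_BF16 & JAY_TYPE_BASE_MASK) == JAY_TYPE_BF, "bf base");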
+ */ + bool broadcast_flag:1; + bool saturate :1; + + /** + * In a SIMD split instruction, whether the regdist dependency is replicated + * to each physical instruction. If false, only the first instruction waits. + * + * If decrement_dep is also set, the regdist is decremented by the macro + * length for each instruction (modelling cross-pipe dependencies). + */ + bool replicate_dep:1; + bool decrement_dep:1; + unsigned padding :12; + + enum jay_predication predication; + enum jay_conditional_mod conditional_mod; + + jay_def cond_flag; /**< conditional flag */ + jay_def dst; + + jay_def src[]; +} jay_inst; + +static_assert(sizeof(jay_inst) == 32 + (sizeof(uintptr_t) * 2), "packed"); + +/* + * Return the number of instruction set defined sources, ignoring implicit + * predication and accumulator sources. + */ +static inline unsigned +jay_num_isa_srcs(const jay_inst *I) +{ + return I->num_srcs - I->predication - (I->op == JAY_OPCODE_SEL); +} + +static inline bool +jay_uses_flag(const jay_inst *I) +{ + return I->predication || + !jay_is_null(I->cond_flag) || + I->op == JAY_OPCODE_SEL; +} + +static inline void +jay_remove_instruction(jay_inst *inst) +{ + list_del(&inst->link); +} + +static inline bool +jay_has_src_mods(jay_inst *I, unsigned s) +{ + return jay_opcode_infos[I->op].src_mods & BITFIELD_BIT(s); +} + +static inline bool +jay_inst_has_default(jay_inst *I) +{ + return I->predication >= JAY_PREDICATED_DEFAULT; +} + +static inline jay_def * +jay_inst_get_predicate(jay_inst *I) +{ + assert(I->predication); + return &I->src[I->num_srcs - I->predication]; +} + +static inline jay_def * +jay_inst_get_default(jay_inst *I) +{ + assert(jay_inst_has_default(I)); + return &I->src[I->num_srcs - 1]; +} + +/* Must be included late since it depends on jay_inst but the rest of this file + * depends on the inline functions it defines. + */ +#include "jay_extra_info.h" + +static inline enum jay_type +jay_src_type(const jay_inst *I, unsigned s) +{ + /* Predicates */ + if (s == (unsigned) (I->num_srcs - I->predication) || + (I->op == JAY_OPCODE_SEL && s == 2) || + (I->op == JAY_OPCODE_PHI_SRC && jay_is_flag(I->src[s]))) + return JAY_TYPE_U1; + + /* Conversions have an explicit source type, use that. */ + if (I->op == JAY_OPCODE_CVT) + return jay_cvt_src_type(I); + + /* 16-bit operand */ + if (I->op == JAY_OPCODE_MUL_32X16 && s == 1) + return jay_type_resize(I->type, jay_type_size_bits(I->type) / 2); + + if (I->op == JAY_OPCODE_SEND) { + if (s < 2) + return JAY_TYPE_U32; + else if (s < 4) + return s == 3 ? jay_send_type_1(I) : jay_send_type_0(I); + } + + if (I->op == JAY_OPCODE_CAST_CANONICAL_TO_FLAG) + return JAY_TYPE_U32; + + /* Shifts are always small even with 64-bit destinations */ + if ((I->op == JAY_OPCODE_SHL || + I->op == JAY_OPCODE_SHR || + I->op == JAY_OPCODE_ASR) && + s == 1) + return JAY_TYPE_U16; + + /* TODO: Do we want to allow zero-extension generally? */ + if (I->op == JAY_OPCODE_AND_U32_U16) + return JAY_TYPE_U16; + + /* Mixed-signedness integer dot product opcode */ + if (I->op == JAY_OPCODE_DP4A_SU && s == 2) + return JAY_TYPE_U32; + + /* Shuffle lane index distinct from data type */ + if (I->op == JAY_OPCODE_SHUFFLE && s == 1) + return JAY_TYPE_U32; + + /* Other instructions inherit the destination type. 
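+    * For example, an ADD with I->type == JAY_TYPE_F32 reads both of its
+    * ALU sources as f32.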
*/ + return I->type; +} + +enum jay_stride { + JAY_STRIDE_2 = 0, + JAY_STRIDE_4, + JAY_STRIDE_8, + JAY_NUM_STRIDES, +}; + +static inline unsigned +jay_stride_to_bits(enum jay_stride s) +{ + assert(s <= JAY_STRIDE_8); + return 16 << s; +} + +#define JAY_PARTITION_BLOCKS (3) + +struct jay_register_block { + uint16_t start, len; +}; + +struct jay_partition { + /** Consecutive ranges of GRFs in GPR/UGPRs. */ + struct jay_register_block blocks[JAY_NUM_GRF_FILES][JAY_PARTITION_BLOCKS]; + + /** Number of GPR/UGPRs per GRF, times 16. For example, 16 encodes SIMD16 + * 32-bit GPRs on Xe2 (1 GRF = 1 GPR). 256 encodes UGPRs (1 GRF = 16 UGPRs). + * 8 encodes SIMD32 32-bit GPRs on Xe2 (2 GRF = 1 GPR). + */ + unsigned units_x16[JAY_NUM_GRF_FILES]; + + /** Base GPR for each stride. The file is partitioned (4, 8, 2, 4=EOT). */ + unsigned base8, base2, base_eot; + + /** Region of the UGPR partition suitable for large UGPR vectors */ + struct jay_register_block large_ugpr_block; +}; + +static inline enum jay_stride +jay_gpr_to_stride(struct jay_partition *p, unsigned reg) +{ + return (reg < p->base8 || reg >= p->base_eot) ? JAY_STRIDE_4 : + reg >= p->base2 ? JAY_STRIDE_2 : + JAY_STRIDE_8; +} + +/** + * Representation of a shader in the Jay IR. + */ +typedef struct jay_shader { + mesa_shader_stage stage; + struct list_head functions; + const struct intel_device_info *devinfo; + union brw_any_prog_data *prog_data; + unsigned spills, fills; + unsigned scratch_size; + unsigned push_grfs; + + /** + * Ralloc linear context. Since we don't typically free as we go, + * most allocations should go through this context for efficiency. + */ + struct linear_ctx *lin_ctx; + + /* Dispatch width of the current compile: 8, 16, or 32. */ + unsigned dispatch_width; + + /** + * Number of GPR/UGPRs used across all functions in the shader. This is the + * limit that must be allocated for the shader. + */ + unsigned num_regs[JAY_NUM_RA_FILES]; + + /** + * Register file partition chosen for the whole shader. + */ + struct jay_partition partition; + + /** Current compilation phase (for printing & validation) */ + bool post_ra; +} jay_shader; + +static inline jay_shader * +jay_new_shader(void *memctx, mesa_shader_stage stage) +{ + jay_shader *s = rzalloc(NULL, jay_shader); + s->stage = stage; + s->lin_ctx = linear_context(s); + list_inithead(&s->functions); + return s; +} + +static inline unsigned +jay_ugpr_per_grf(jay_shader *s) +{ + unsigned B_per_unit = 32 /* see reg_unit */; + unsigned B_per_ugpr = 4; + + return reg_unit(s->devinfo) * (B_per_unit / B_per_ugpr); +} + +static inline unsigned +jay_grf_per_gpr(jay_shader *s) +{ + assert(reg_unit(s->devinfo) == 1 || reg_unit(s->devinfo) == 2); + return reg_unit(s->devinfo) == 2 ? (s->dispatch_width / 16) : + (s->dispatch_width / 8); +} + +static inline unsigned +jay_phys_flag_per_virt(jay_shader *s) +{ + /* TODO: Check if this holds on older platforms */ + return jay_grf_per_gpr(s); +} + +/* + * Returns whether an instruction will lower to a SEND post-RA: either a SEND or + * a spill/fill that has not yet been lowered. + */ +static inline bool +jay_is_send_like(const jay_inst *I) +{ + if (I->op == JAY_OPCODE_MOV) + return jay_is_mem(I->dst) || jay_is_mem(I->src[0]); + else + return I->op == JAY_OPCODE_SEND; +} + +/* + * Returns whether an instruction contains cross-lane access. 
+ */ +static inline bool +jay_is_shuffle_like(const jay_inst *I) +{ + return I->op == JAY_OPCODE_SHUFFLE || + I->op == JAY_OPCODE_QUAD_SWIZZLE || + I->op == JAY_OPCODE_BROADCAST_IMM; +} + +/* + * Return the required alignment for the register assigned to a given source. + */ +static inline unsigned +jay_src_alignment(jay_shader *shader, const jay_inst *I, unsigned s) +{ + /* SENDs operate on entire GRFs at a time, so align UGPRs to GRFs. This + * includes UGPR->UMEM moves which lower to SENDs. + */ + if ((I->op == JAY_OPCODE_SEND && I->src[s].file == UGPR) || + (I->dst.file == UMEM)) { + return jay_ugpr_per_grf(shader); + } + + /* If the destination is 64-bit, we need the sources to be aligned. Along + * with a suitable partitioning, this ensures only the aligned low half of + * a strided register is used, preventing invalid assembly like: + * + * mov.s64 g40, g42.1<2>:s32 + * + * ..which would violate the rule: + * + * Register Regioning patterns where register data bit location of the LSB + * of the channels are changed between source and destination are not + * supported except for broadcast of a scalar. + */ + return jay_type_vector_length(I->type); +} + +/* + * Return the required alignment for the register assigned to a destination. + */ +static inline unsigned +jay_dst_alignment(jay_shader *shader, const jay_inst *I) +{ + /* SENDs write entire GRFs, so align UGPRs to GRFs. Similarly for any + * instructions involving accumulators: + * + * Register Regioning patterns where register data bit locations are + * changed between source and destination are not supported when an + * accumulator is used as an implicit source or an explicit source in an + * instruction. (TODO) + */ + if (I->dst.file == UGPR && + (I->op == JAY_OPCODE_SEND || + (I->op == JAY_OPCODE_MOV && I->src[0].file == UMEM) || + I->op == JAY_OPCODE_MUL_32)) { + + return jay_ugpr_per_grf(shader); + } + + /* If any source is 64-bit, align the destination to 64-bit too. As above. */ + return jay_type_vector_length(jay_src_type(I, 0)); +} + +static inline bool +jay_inst_is_uniform(const jay_inst *I) +{ + if (I->op == JAY_OPCODE_SEND) + return jay_send_uniform(I); + + return jay_is_uniform(I->dst) || + (I->dst.file == J_ADDRESS && jay_is_uniform(I->src[0])) || + I->cond_flag.file == UFLAG || + I->op == JAY_OPCODE_SYNC || + I->dst.file == FLAG || + (I->dst.file == J_ARF && !jay_is_null(I->dst)); +} + +unsigned jay_simd_split(jay_shader *s, const jay_inst *I); + +static inline unsigned +jay_simd_width_logical(jay_shader *s, const jay_inst *I) +{ + unsigned base = jay_inst_is_uniform(I) ? 1 : s->dispatch_width; + + /* Handle vectors-of-UGPR operations with special care for 64-bit */ + unsigned vec_per_channel = jay_type_vector_length(I->type); + unsigned dst_size = jay_num_values(I->dst); + assert(util_is_aligned(dst_size, vec_per_channel)); + + if (base == 1 && dst_size > vec_per_channel && I->op != JAY_OPCODE_SEND) { + assert(util_is_power_of_two_nonzero(dst_size) && vec_per_channel == 1); + base = dst_size; + } + + return base; +} + +static inline unsigned +jay_simd_width_physical(jay_shader *s, const jay_inst *I) +{ + return jay_simd_width_logical(s, I) >> jay_simd_split(s, I); +} + +/* + * Returns the number of physical instructions emitted for each logical + * instruction not accounting for SIMD split. That is, the number of + * instructions that macros will expand to in jay_to_binary or 1 for non-macros. 
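+ *
+ * For example, MUL_32 is a macro expanding to two physical instructions, so
+ * its macro length is 2, while a plain ADD has macro length 1. Together with
+ * a SIMD split of N, a logical instruction emits macro_length << N physical
+ * instructions (as counted by the scoreboard pass).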
+ */ +static inline unsigned +jay_macro_length(const jay_inst *I) +{ + bool macro = (I->op == JAY_OPCODE_MUL_32 || + I->op == JAY_OPCODE_SHUFFLE || + I->op == JAY_OPCODE_LOOP_ONCE); + return macro ? 2 : 1; +} + +static inline bool +jay_is_no_mask(const jay_inst *I) +{ + return jay_inst_is_uniform(I) || + I->broadcast_flag || + I->op == JAY_OPCODE_QUAD_SWIZZLE || + I->op == JAY_OPCODE_DESWIZZLE_16 || + I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS || + I->op == JAY_OPCODE_LANE_ID_8 || + I->op == JAY_OPCODE_LANE_ID_EXPAND; +} + +/** + * Representation of an (implemented) function in the Jay IR. This corresponds + * to nir_function_impl in NIR. + */ +typedef struct jay_function { + struct list_head link; + + /* Parent pointer for convenience */ + struct jay_shader *shader; + + /* Set of SSA indices of defs that are dead immediately after being written + * (because they are never read but cannot be DCE'd). + */ + BITSET_WORD *dead_defs; + + /* Register demand metadata calculated & used in RA */ + unsigned demand[JAY_NUM_SSA_FILES]; + + unsigned num_blocks; + struct list_head blocks; + bool is_entrypoint; + + uint32_t ssa_alloc; +} jay_function; + +static inline jay_function * +jay_new_function(jay_shader *s) +{ + jay_function *f = rzalloc(s, jay_function); + list_inithead(&f->blocks); + + f->shader = s; + f->ssa_alloc = 1; /* skip null */ + + list_add(&f->link, &s->functions); + return f; +} + +static inline jay_function * +jay_shader_get_entrypoint(jay_shader *s) +{ + /* TODO: Multifunction shaders */ + assert(list_is_singular(&s->functions)); + return list_first_entry(&s->functions, jay_function, link); +} + +static inline unsigned +jay_num_regs(jay_shader *shader, enum jay_file file) +{ + assert(file < JAY_NUM_SSA_FILES); + + if (file < JAY_NUM_RA_FILES) + return shader->num_regs[file]; + else if (file == FLAG) + return shader->dispatch_width == 32 ? 4 : 8; + else if (file == UFLAG) + return 0; + else + return 1 /* TODO: We don't have address or accumulator RA yet */; +} + +static inline enum jay_stride +jay_def_stride(jay_shader *shader, jay_def x) +{ + assert(x.file == GPR); + return jay_gpr_to_stride(&shader->partition, x.reg); +} + +/* Represents an allocated register number with file in the top 3 bits. */ +typedef uint16_t jay_reg; + +/** Represents a set of registers that may be clobbered for lowering swaps */ +struct jay_temp_regs { + jay_reg gpr, gpr2, ugpr, ugpr2; +}; + +/** + * A basic block representation + */ +typedef struct jay_block { + struct list_head link; + struct list_head instructions; + + /** Control flow graph */ + struct jay_block *successors[2]; + struct util_dynarray predecessors; + + /** Index of the block in source order */ + unsigned index; + + /** Liveness analysis results */ + struct u_sparse_bitset live_in; + struct u_sparse_bitset live_out; + + /** + * After register allocation but before going out-of-SSA, registers that + * are free at the logical end of the block (before phi_src). These will + * be clobbered by the out-of-SSA pass. + */ + struct jay_temp_regs temps_out; + + /** + * Is this block a loop header? If not, all of its predecessors precede it + * in source order. 
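+    *
+    * For example, the block at the top of a while loop has a predecessor
+    * later in source order (the back edge from the loop latch), so it is a
+    * loop header; every other block only has earlier predecessors.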
+ */ + bool loop_header; + + /** True if all non-exited lanes execute this block together */ + bool uniform; + + /** Pretty printing based on original structured control flow */ + uint8_t indent; +} jay_block; + +static inline jay_block * +jay_new_block(jay_function *f) +{ + jay_block *block = rzalloc(f, jay_block); + + util_dynarray_init(&block->predecessors, block); + list_inithead(&block->instructions); + + block->index = f->num_blocks++; + return block; +} + +static inline bool +jay_op_is_control_flow(enum jay_opcode op) +{ + return op >= JAY_OPCODE_BRD && op <= JAY_OPCODE_LOOP_ONCE; +} + +/** + * Returns the control flow instruction at the end of a block or NULL. + */ +static inline jay_inst * +jay_block_ending_jump(jay_block *block) +{ + jay_inst *last = list_is_empty(&block->instructions) ? + NULL : + list_last_entry(&block->instructions, jay_inst, link); + return last && jay_op_is_control_flow(last->op) ? last : NULL; +} + +static inline unsigned +jay_num_predecessors(jay_block *block) +{ + return util_dynarray_num_elements(&block->predecessors, jay_block *); +} + +static inline unsigned +jay_num_successors(jay_block *block) +{ + static_assert(ARRAY_SIZE(block->successors) == 2); + return !!block->successors[0] + !!block->successors[1]; +} + +static inline jay_block * +jay_first_predecessor(jay_block *block) +{ + if (jay_num_predecessors(block) == 0) + return NULL; + + return *util_dynarray_element(&block->predecessors, struct jay_block *, 0); +} + +/* Block worklist helpers */ + +#define jay_worklist_push_head(w, block) u_worklist_push_head(w, block, index) +#define jay_worklist_push_tail(w, block) u_worklist_push_tail(w, block, index) +#define jay_worklist_peek_head(w) u_worklist_peek_head(w, jay_block, index) +#define jay_worklist_pop_head(w) u_worklist_pop_head(w, jay_block, index) +#define jay_worklist_peek_tail(w) u_worklist_peek_tail(w, jay_block, index) +#define jay_worklist_pop_tail(w) u_worklist_pop_tail(w, jay_block, index) + +/* Iterators */ + +#define jay_foreach_function(s, v) \ + list_for_each_entry(jay_function, v, &s->functions, link) + +#define jay_foreach_block(f, v) \ + list_for_each_entry(jay_block, v, &f->blocks, link) + +#define jay_foreach_block_safe(f, v) \ + list_for_each_entry_safe(jay_block, v, &f->blocks, link) + +#define jay_foreach_block_rev(f, v) \ + list_for_each_entry_rev(jay_block, v, &f->blocks, link) + +#define jay_foreach_block_from(f, from, v) \ + list_for_each_entry_from(jay_block, v, from, &f->blocks, link) + +#define jay_foreach_block_from_rev(f, from, v) \ + list_for_each_entry_from_rev(jay_block, v, from, &f->blocks, link) + +#define jay_foreach_inst_in_block(block, v) \ + list_for_each_entry(jay_inst, v, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_rev(block, v) \ + list_for_each_entry_rev(jay_inst, v, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_safe(block, v) \ + list_for_each_entry_safe(jay_inst, v, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_safe_rev(block, v) \ + list_for_each_entry_safe_rev(jay_inst, v, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_from(block, v, from) \ + list_for_each_entry_from(jay_inst, v, from, &(block)->instructions, link) + +#define jay_foreach_inst_in_block_from_rev(block, v, from) \ + list_for_each_entry_from_rev(jay_inst, v, from, &(block)->instructions, link) + +#define jay_foreach_inst_in_func(func, block, v) \ + jay_foreach_block(func, block) \ + jay_foreach_inst_in_block(block, v) + +#define 
jay_foreach_inst_in_func_rev(func, block, v) \ + jay_foreach_block_rev(func, block) \ + jay_foreach_inst_in_block_rev(block, v) + +#define jay_foreach_inst_in_func_safe(func, block, v) \ + jay_foreach_block(func, block) \ + jay_foreach_inst_in_block_safe(block, v) + +#define jay_foreach_inst_in_func_safe_rev(func, block, v) \ + jay_foreach_block_rev(func, block) \ + jay_foreach_inst_in_block_safe_rev(block, v) + +#define jay_foreach_inst_in_shader(s, func, inst) \ + jay_foreach_function(s, func) \ + jay_foreach_inst_in_func(func, v_block, inst) + +#define jay_foreach_inst_in_shader_safe(s, func, inst) \ + jay_foreach_function(s, func) \ + jay_foreach_inst_in_func_safe(func, v_block, inst) + +#define jay_foreach_successor(blk, v) \ + jay_block *v; \ + jay_block **_v; \ + for (_v = (jay_block **) &blk->successors[0], v = *_v; \ + v != NULL && _v < (jay_block **) &blk->successors[2]; _v++, v = *_v) + +#define jay_foreach_predecessor(blk, v) \ + util_dynarray_foreach(&blk->predecessors, jay_block *, v) + +#define jay_foreach_src(inst, s) for (unsigned s = 0; s < inst->num_srcs; ++s) + +#define jay_foreach_src_rev(inst, s) \ + for (signed s = inst->num_srcs - 1; s >= 0; --s) + +#define jay_foreach_ssa_src(I, s) \ + jay_foreach_src(I, s) \ + if (jay_is_ssa(I->src[s]) && !jay_is_null(I->src[s])) + +#define jay_foreach_ssa_src_rev(I, s) \ + jay_foreach_src_rev(I, s) \ + if (jay_is_ssa(I->src[s]) && !jay_is_null(I->src[s])) + +#define jay_foreach_index(def, c, idx) \ + jay_foreach_comp(def, c) \ + for (uint32_t idx = jay_channel(def, c); idx != 0; idx = 0) + +#define jay_foreach_index_rev(def, c, idx) \ + jay_foreach_comp_rev(def, c) \ + for (uint32_t idx = jay_channel(def, c); idx != 0; idx = 0) + +#define jay_foreach_src_index(I, s, c, i) \ + jay_foreach_ssa_src(I, s) \ + jay_foreach_index(I->src[s], c, i) + +#define jay_foreach_src_index_rev(I, s, c, i) \ + jay_foreach_ssa_src_rev(I, s) \ + jay_foreach_index_rev(I->src[s], c, i) + +#define jay_foreach_dst(I, d) \ + for (unsigned _d = 0; _d < 2; ++_d) \ + for (jay_def d = (_d ? I->cond_flag : I->dst); !jay_is_null(d); \ + d = jay_null()) + +#define jay_foreach_dst_index(I, d, i) \ + jay_foreach_dst(I, d) \ + jay_foreach_index(d, _c, i) + +/* + * Phi iterators take advantage of the known position of phis in the block. 
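+ * PHI_DST instructions are grouped at the start of a block and PHI_SRC
+ * instructions sit at the end, just before any trailing control flow, so the
+ * iterators below can stop at the first non-phi instruction instead of
+ * scanning the whole block.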
+ */ +#define jay_foreach_phi_src_in_block(block, phi) \ + jay_foreach_inst_in_block_safe_rev(block, phi) \ + if (jay_op_is_control_flow(phi->op)) \ + continue; \ + else if (phi->op != JAY_OPCODE_PHI_SRC) \ + break; \ + else + +#define jay_foreach_phi_dst_in_block(block, phi) \ + jay_foreach_inst_in_block(block, phi) \ + if (phi->op != JAY_OPCODE_PHI_DST) \ + break; \ + else + +#define jay_foreach_preload(func, preload) \ + jay_foreach_inst_in_block_safe(jay_first_block(func), preload) \ + if (I->op != JAY_OPCODE_PRELOAD) \ + break; \ + else + +static inline jay_block * +jay_first_block(jay_function *f) +{ + assert(!list_is_empty(&f->blocks)); + jay_block *first_block = list_first_entry(&f->blocks, jay_block, link); + assert(first_block->index == 0); + return first_block; +} + +static inline jay_inst * +jay_first_inst(jay_block *block) +{ + if (list_is_empty(&block->instructions)) + return NULL; + else + return list_first_entry(&block->instructions, jay_inst, link); +} + +static inline jay_block * +jay_last_block(jay_function *f) +{ + if (list_is_empty(&f->blocks)) + return NULL; + else + return list_last_entry(&f->blocks, jay_block, link); +} + +static inline jay_inst * +jay_last_inst(jay_block *block) +{ + if (list_is_empty(&block->instructions)) + return NULL; + else + return list_last_entry(&block->instructions, jay_inst, link); +} + +static inline jay_block * +jay_next_block(jay_block *block) +{ + return list_first_entry(&(block->link), jay_block, link); +} + +static inline void +jay_block_add_successor(jay_block *block, jay_block *succ) +{ + unsigned i = block->successors[0] ? 1 : 0; + + assert(succ && block->successors[0] != succ && block->successors[1] != succ); + assert(block->successors[i] == NULL && "at most 2 successors"); + + block->successors[i] = succ; + util_dynarray_append(&(succ->predecessors), block); +} + +static inline unsigned +jay_source_last_use_bit(const jay_def *srcs, unsigned src_idx) +{ + assert(jay_is_ssa(srcs[src_idx]) && "precondition"); + unsigned i = 0; + + for (unsigned s = 0; s < src_idx; ++s) { + jay_foreach_index(srcs[s], c, idx) { + i++; + } + } + + return i; +} + +#define jay_foreach_killed(I, s, c) \ + for (unsigned _kill_idx = 0; _kill_idx == 0; _kill_idx = 1) \ + jay_foreach_src_index(I, s, c, idx) \ + for (unsigned _k = _kill_idx++; _k != ~0; _k = ~0) \ + if (BITSET_TEST(I->last_use, _k)) + +/* Helper to run a pass */ +#define JAY_PASS(shader, pass, ...) \ + do { \ + pass(shader, ##__VA_ARGS__); \ + jay_validate(shader, #pass); \ + } while (0) + +#define JAY_DEFINE_FUNCTION_PASS(name, per_func) \ + void name(jay_shader *s) \ + { \ + jay_foreach_function(s, f) { \ + per_func(f); \ + } \ + } diff --git a/src/intel/compiler/jay/jay_liveness.c b/src/intel/compiler/jay/jay_liveness.c new file mode 100644 index 00000000000..ebe89f7504f --- /dev/null +++ b/src/intel/compiler/jay/jay_liveness.c @@ -0,0 +1,203 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/bitset.h" +#include "util/macros.h" +#include "util/sparse_bitset.h" +#include "util/u_math.h" +#include "util/u_worklist.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* LiveIn = GEN + (LiveOut - KILL) */ +static void +update_liveness_for_inst(BITSET_WORD *dead_defs, + struct u_sparse_bitset *live_in, + jay_inst *I) +{ + /* No destination is live-in before the instruction, but any destination not + * live-in after is immediately dead. 
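+    *
+    * On entry, live_in holds the liveness just after this instruction (the
+    * block is walked bottom-up); it is updated in place to the liveness just
+    * before it: live = (live - DEF) + USE.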
+ */ + jay_foreach_dst_index(I, _, def) { + if (u_sparse_bitset_test(live_in, def)) { + u_sparse_bitset_clear(live_in, def); + } else { + BITSET_SET(dead_defs, def); + } + } + + if (I->op == JAY_OPCODE_PHI_SRC) { + /* Phi sources do not require last-use bits. */ + jay_foreach_src_index(I, src_idx, comp, index) { + u_sparse_bitset_set(live_in, index); + } + } else { + BITSET_ZERO(I->last_use); + unsigned last_use_i = 0; + + jay_foreach_src_index(I, s, comp, index) { + /* If the source is not live after this instruction, but becomes + * live at this instruction, this is the last use. + */ + if (!u_sparse_bitset_test(live_in, index)) { + assert(last_use_i < JAY_NUM_LAST_USE_BITS); + BITSET_SET(I->last_use, last_use_i); + } + + u_sparse_bitset_set(live_in, index); + ++last_use_i; + } + } +} + +/** + * Calculate liveness information for SSA values. + * + * This populates the jay_block::live_in/live_out bitsets and last_use flags. + */ +void +jay_compute_liveness(jay_function *f) +{ + u_worklist worklist; + u_worklist_init(&worklist, f->num_blocks, NULL); + + ralloc_free(f->dead_defs); + f->dead_defs = BITSET_RZALLOC(f, f->ssa_alloc); + + jay_foreach_block(f, block) { + u_sparse_bitset_free(&block->live_in); + u_sparse_bitset_free(&block->live_out); + + u_sparse_bitset_init(&block->live_in, f->ssa_alloc, block); + u_sparse_bitset_init(&block->live_out, f->ssa_alloc, block); + + jay_worklist_push_head(&worklist, block); + } + + while (!u_worklist_is_empty(&worklist)) { + /* Pop in reverse order since liveness is a backwards pass */ + jay_block *block = jay_worklist_pop_head(&worklist); + + /* Update its liveness information: + * 1. Assume everything liveout from this block was live_in + * 2. Clear live_in for anything defined in this block + */ + u_sparse_bitset_dup(&block->live_in, &block->live_out); + + jay_foreach_inst_in_block_rev(block, inst) { + update_liveness_for_inst(f->dead_defs, &block->live_in, inst); + } + + /* Propagate block->live_in[] to the live_out[] of predecessors. Since + * phis are split, they are handled naturally without special cases. + */ + jay_foreach_predecessor(block, p) { + if (u_sparse_bitset_merge(&(*p)->live_out, &block->live_in)) { + jay_worklist_push_tail(&worklist, *p); + } + } + } + +#ifndef NDEBUG + jay_block *first_block = jay_first_block(f); + jay_block *last_block = list_last_entry(&f->blocks, jay_block, link); + + assert(u_sparse_bitset_count(&first_block->live_in) == 0 && "invariant"); + assert(u_sparse_bitset_count(&last_block->live_out) == 0 && "invariant"); +#endif + + u_worklist_fini(&worklist); +} + +/* + * Calculate the register demand for each SSA file using the previously + * calculated liveness analysis. SSA makes this exact in linear-time. + */ +void +jay_calculate_register_demands(jay_function *func) +{ + enum jay_file *files = calloc(func->ssa_alloc, sizeof(enum jay_file)); + BITSET_WORD *killed = BITSET_CALLOC(func->ssa_alloc); + unsigned *max_demand = func->demand; + memset(max_demand, 0, sizeof(func->demand)); + + jay_foreach_inst_in_func(func, block, I) { + jay_foreach_dst_index(I, def, index) { + files[index] = def.file; + } + } + + jay_foreach_block(func, block) { + unsigned demands[JAY_NUM_SSA_FILES] = {}; + + /* Everything live-in. 
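+       * Each value live into the block contributes one register of demand in
+       * its file before we walk the instructions; the per-instruction
+       * adjustments below then keep the count exact as values are defined
+       * and killed.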
*/ + U_SPARSE_BITSET_FOREACH_SET(&block->live_in, i) { + ++demands[files[i]]; + } + + jay_foreach_ssa_file(f) { + max_demand[f] = MAX2(demands[f], max_demand[f]); + } + + jay_foreach_inst_in_block(block, I) { + /* We must have enough register file space for the register payload */ + if (I->op == JAY_OPCODE_PRELOAD) { + uint32_t max = jay_preload_reg(I) + jay_num_values(I->dst); + max_demand[I->dst.file] = MAX2(max_demand[I->dst.file], max); + } + + /* Collect source values to kill */ + jay_foreach_killed(I, s, c) { + BITSET_SET(killed, jay_channel(I->src[s], c)); + } + + /* Make destinations live */ + jay_foreach_dst(I, d) { + demands[d.file] += util_next_power_of_two(jay_num_values(d)); + } + + /* Update maximum demands */ + jay_foreach_ssa_file(f) { + max_demand[f] = MAX2(demands[f], max_demand[f]); + } + + /* Dead destinations are those written by the instruction but killed + * immediately after the instruction finishes. + */ + jay_foreach_dst_index(I, d, index) { + if (BITSET_TEST(func->dead_defs, index)) { + assert(demands[d.file] > 0); + --demands[d.file]; + } + } + + jay_foreach_dst(I, d) { + unsigned n = jay_num_values(d); + demands[d.file] -= util_next_power_of_two(n) - n; + } + + /* Late-kill sources */ + jay_foreach_killed(I, s, c) { + uint32_t index = jay_channel(I->src[s], c); + + if (BITSET_TEST(killed, index)) { + BITSET_CLEAR(killed, index); + + assert(demands[I->src[s].file] > 0); + --demands[I->src[s].file]; + } + } + + if (jay_debug & JAY_DBG_PRINTDEMAND) { + printf("(LA) [G:%u\tU:%u] ", demands[GPR], demands[UGPR]); + jay_print_inst(stdout, I); + } + } + } + + free(files); + free(killed); +} diff --git a/src/intel/compiler/jay/jay_lower_post_ra.c b/src/intel/compiler/jay/jay_lower_post_ra.c new file mode 100644 index 00000000000..db8661b011d --- /dev/null +++ b/src/intel/compiler/jay/jay_lower_post_ra.c @@ -0,0 +1,153 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/macros.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* + * If default != dest, we need to lower. Predicated moves generalize as SEL, + * with default in src0 to allow for immediates. + * + * For anything else, we have to insert a copy. + */ +static void +lower_non_tied_default(jay_builder *b, jay_inst *I, jay_def default_) +{ + jay_def not_pred = jay_negate(*jay_inst_get_predicate(I)); + assert(default_.file != FLAG && "we don't support this"); + + if (I->op == JAY_OPCODE_MOV) { + jay_SEL(b, I->type, I->dst, default_, I->src[0], not_pred); + jay_remove_instruction(I); + } else { + jay_foreach_comp(I->dst, c) { + jay_def dst = jay_extract_post_ra(I->dst, c); + jay_def src = jay_extract_post_ra(default_, c); + + jay_add_predicate(b, jay_MOV(b, dst, src), not_pred); + } + } +} + +static inline jay_def +hi(jay_def x) +{ + x.hi = true; + return x; +} + +static bool +lower(jay_builder *b, jay_inst *I) +{ + switch (I->op) { + case JAY_OPCODE_PRELOAD: + case JAY_OPCODE_PHI_DST: + case JAY_OPCODE_INDETERMINATE: + /* Delete instructions that only exist for RA. Uninitialized register + * contents is a perfectly cromulent indeterminate value. 
+ */ + return true; + + case JAY_OPCODE_MOV: { + /* Delete trivial moves */ + if (jay_regs_equal(I->dst, I->src[0]) && !I->predication) + return true; + + if (I->dst.file == GPR && I->src[0].file == GPR) { + jay_def dst = I->dst, src = I->src[0], tmp4 = jay_bare_reg(GPR, 0); + enum jay_stride dst_stride = jay_def_stride(b->shader, dst); + enum jay_stride src_stride = jay_def_stride(b->shader, src); + assert(jay_def_stride(b->shader, tmp4) == JAY_STRIDE_4 && "ABI"); + + if (dst_stride == JAY_STRIDE_8 && src_stride == JAY_STRIDE_2) { + jay_MOV(b, dst, tmp4); + jay_MOV(b, tmp4, src)->type = JAY_TYPE_U16; + jay_MOV(b, hi(tmp4), hi(src))->type = JAY_TYPE_U16; + + jay_XOR(b, JAY_TYPE_U32, dst, dst, tmp4); + jay_XOR(b, JAY_TYPE_U32, tmp4, dst, tmp4); + jay_XOR(b, JAY_TYPE_U32, dst, dst, tmp4); + return true; + } else if (dst_stride == JAY_STRIDE_2 && src_stride == JAY_STRIDE_8) { + jay_MOV(b, dst, tmp4)->type = JAY_TYPE_U16; + jay_MOV(b, hi(dst), hi(tmp4))->type = JAY_TYPE_U16; + jay_MOV(b, tmp4, src); + + for (unsigned i = 0; i < 3; ++i) { + jay_XOR(b, JAY_TYPE_U16, i == 1 ? tmp4 : dst, dst, tmp4); + jay_XOR(b, JAY_TYPE_U16, i == 1 ? hi(tmp4) : hi(dst), hi(dst), + hi(tmp4)); + } + + return true; + } + + /* Lower 4B<-->2B copies. To pack the register file, RA + * sometimes inserts 32-bit copies involving 16-bit strided sources like + * "mov.u32 r4 <32-bit>, r50 <16-bit>". This cannot be implemented in a + * single hardware instruction, so we split into two 16-bit copies. + */ + enum jay_stride min_stride = MIN2(dst_stride, src_stride); + unsigned stride_sz = jay_stride_to_bits(min_stride); + unsigned type_sz = jay_type_size_bits(I->type); + + if (stride_sz < type_sz) { + assert(stride_sz == 16 && type_sz == 32 && "no other case hit"); + I->type = JAY_TYPE_U16; + jay_MOV(b, hi(dst), hi(src))->type = JAY_TYPE_U16; + } + } + + return false; + } + + case JAY_OPCODE_SWAP: { + jay_def x = I->src[0], y = I->src[1]; + /* TODO: Need stride-aware lowering here too like MOV. Same ideas. */ + if (jay_def_stride(b->shader, x) != jay_def_stride(b->shader, y)) + UNREACHABLE("todo"); + + jay_XOR(b, JAY_TYPE_U32, x, y, x); + jay_XOR(b, JAY_TYPE_U32, y, x, y); + jay_XOR(b, JAY_TYPE_U32, x, y, x); + return true; + } + + case JAY_OPCODE_ZERO_FLAG: { + jay_MOV(b, jay_bare_reg(FLAG, jay_zero_flag_reg(I)), 0)->type = + JAY_TYPE_U32; + return true; + } + + default: + return false; + } +} + +void +jay_lower_post_ra(jay_shader *s) +{ + jay_foreach_inst_in_shader_safe(s, func, I) { + jay_builder b = jay_init_builder(func, jay_before_inst(I)); + + if (jay_inst_has_default(I)) { + if (!jay_regs_equal(I->dst, *jay_inst_get_default(I))) { + lower_non_tied_default(&b, I, *jay_inst_get_default(I)); + } + + /* Now just drop the default source */ + jay_shrink_sources(I, I->num_srcs - 1); + I->predication = JAY_PREDICATED; + } + + if (lower(&b, I)) { + jay_remove_instruction(I); + } + } +} diff --git a/src/intel/compiler/jay/jay_lower_pre_ra.c b/src/intel/compiler/jay/jay_lower_pre_ra.c new file mode 100644 index 00000000000..d71ea7c3711 --- /dev/null +++ b/src/intel/compiler/jay/jay_lower_pre_ra.c @@ -0,0 +1,200 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/bitscan.h" +#include "util/hash_table.h" +#include "util/lut.h" +#include "util/macros.h" +#include "util/u_math.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* + * Register allocation operates only on power-of-two vectors. 
Pad out
+ * non-power-of-two vectors with null values to simplify RA.
+ */
+static jay_def
+lower_npot_vector(jay_builder *b, jay_def x)
+{
+   unsigned n = jay_num_values(x);
+
+   if (!util_is_power_of_two_or_zero(n)) {
+      uint32_t indices[JAY_MAX_DEF_LENGTH] = { 0 };
+
+      for (unsigned i = 0; i < n; ++i) {
+         indices[i] = jay_channel(x, i);
+      }
+
+      x = jay_collect(b, x.file, indices, util_next_power_of_two(n));
+   }
+
+   assert(util_is_power_of_two_or_zero(jay_num_values(x)) && "post-cond");
+   return x;
+}
+
+/**
+ * Vectors need to be allocated to contiguous registers. Furthermore, we
+ * require power-of-two sizes in certain cases; that is handled here too.
+ *
+ * This means that a value cannot appear in multiple channels of an
+ * instruction, as register allocation would need to assign the same value to
+ * two different locations. Scalars don't have this restriction, except for
+ * SENDs because the hardware bans repeated sources.
+ *
+ * If a value appears in multiple positions, we emit copies so that each
+ * occurrence can be register allocated in the correct position.
+ */
+static void
+lower_contiguous_sources(jay_builder *b, jay_inst *I)
+{
+   b->cursor = jay_before_inst(I);
+   uint32_t seen[JAY_MAX_DEF_LENGTH], nr_seen = 0;
+
+   jay_foreach_src(I, s) {
+      if (jay_num_values(I->src[s]) > 1 || I->op == JAY_OPCODE_SEND) {
+         jay_foreach_index(I->src[s], c, index) {
+            /* Search for the index */
+            unsigned i;
+            for (i = 0; i < nr_seen && seen[i] != index; ++i) {
+            }
+
+            if (i == nr_seen) {
+               /* Record a new index */
+               assert(nr_seen < ARRAY_SIZE(seen));
+               seen[nr_seen++] = index;
+            } else {
+               /* Insert a copy to access a duplicated index */
+               jay_def copy = jay_alloc_def(b, I->src[s].file, 1);
+               jay_MOV(b, copy, jay_extract(I->src[s], c));
+               jay_insert_channel(b, &I->src[s], c, copy);
+            }
+         }
+
+         jay_replace_src(&I->src[s], lower_npot_vector(b, I->src[s]));
+      }
+   }
+}
+
+static jay_def
+lower_imm_to_ugpr(jay_builder *b,
+                  jay_inst *I,
+                  unsigned s,
+                  struct hash_table_u64 *constants)
+{
+   /* Although only 32-bit constants are supported, 64-bit constants are
+    * separate in the key since they must be zero-extended. We could optimize
+    * this but it doesn't really matter.
+    */
+   uint32_t imm = jay_as_uint(I->src[s]);
+   bool is_64bit = jay_type_size_bits(jay_src_type(I, s)) == 64;
+   uint64_t key = imm | (is_64bit ? BITFIELD64_BIT(32) : 0);
+
+   jay_inst *mov = _mesa_hash_table_u64_search(constants, key);
+   if (mov)
+      return mov->dst;
+
+   /* Try to use source modifiers to reuse a constant if we can */
+   if (jay_src_type(I, s) == JAY_TYPE_F32 && jay_has_src_mods(I, s)) {
+      mov = _mesa_hash_table_u64_search(constants, fui(-uif(imm)));
+      if (mov)
+         return jay_negate(mov->dst);
+   }
+
+   /* If this is a new constant, insert a move and cache it. Currently, we pool
+    * constants per-function. Inserting everything at the start guarantees that
+    * these moves dominate all their uses, although it hurts register pressure.
+    * The spiller should rematerialize constants where necessary to ensure we
+    * don't lose the wave, but we could still probably optimize this.
+    */
+   jay_def x = jay_alloc_def(b, UGPR, is_64bit ? 
2 : 1); + b->cursor = jay_before_function(b->func); + _mesa_hash_table_u64_insert(constants, key, jay_MOV(b, x, imm)); + return x; +} + +static bool +try_swap_src01(jay_inst *I) +{ + if (I->op == JAY_OPCODE_SEL) { + /* sel(a, b, p) = sel(b, a, !p) */ + I->src[2].negate ^= true; + } else if (I->op == JAY_OPCODE_CMP) { + I->conditional_mod = jay_conditional_mod_swap_sources(I->conditional_mod); + } else if (I->op == JAY_OPCODE_BFN) { + jay_set_bfn_ctrl(I, util_lut3_swap_sources(jay_bfn_ctrl(I), 0, 1)); + } else if (!jay_opcode_infos[I->op]._2src_commutative) { + /* Nothing to do for commutative, but otherwise we give up */ + return false; + } + + SWAP(I->src[0], I->src[1]); + return true; +} + +/* + * Instructions can only encode immediates in certain positions. Lower + * immediates to moves where necessary. + */ +static void +lower_immediates(jay_builder *b, jay_inst *I, struct hash_table_u64 *constants) +{ + /* Canonicalize compare-with-zero to increase freedom */ + if (I->op == JAY_OPCODE_CMP && + jay_is_zero(I->src[1]) && + jay_is_null(I->dst) && + I->type == JAY_TYPE_U32) { + + assert(!jay_is_null(I->cond_flag) && !I->predication); + I->op = JAY_OPCODE_MOV; + jay_shrink_sources(I, 1); + } + + /* One source supports immediates but the other does not, so swap. */ + unsigned other = I->op == JAY_OPCODE_BFN ? 1 : 0; + if (jay_is_imm(I->src[other]) && + !_mesa_hash_table_u64_search(constants, jay_as_uint(I->src[other]))) { + + try_swap_src01(I); + } + + /* Immediates allowed only in certain cases, lower the rest */ + jay_foreach_src(I, s) { + if (jay_is_imm(I->src[s])) { + uint32_t imm = jay_as_uint(I->src[s]); + + bool last = s == (jay_num_isa_srcs(I) - 1); + bool allowed = s < 2 && (last || I->op == JAY_OPCODE_SEND); + allowed |= (I->op == JAY_OPCODE_BFN && s == 0 && imm < UINT16_MAX); + + if (!allowed) { + I->src[s] = lower_imm_to_ugpr(b, I, s, constants); + } + } + } +} + +void +jay_lower_pre_ra(jay_shader *s) +{ + struct hash_table_u64 *constants = _mesa_hash_table_u64_create(NULL); + + jay_foreach_function(s, f) { + /* Pool constants per function. */ + _mesa_hash_table_u64_clear(constants); + + jay_foreach_inst_in_func(f, block, I) { + jay_builder b = { .shader = s, .func = f }; + + /* lower_immediates must be last since it consumes I */ + lower_contiguous_sources(&b, I); + lower_immediates(&b, I, constants); + } + } + + _mesa_hash_table_u64_destroy(constants); +} diff --git a/src/intel/compiler/jay/jay_lower_scoreboard.c b/src/intel/compiler/jay/jay_lower_scoreboard.c new file mode 100644 index 00000000000..305dfff57ba --- /dev/null +++ b/src/intel/compiler/jay/jay_lower_scoreboard.c @@ -0,0 +1,376 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include +#include "compiler/brw/brw_eu_defines.h" +#include "util/bitset.h" +#include "util/macros.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* TODO: Shrink */ +#define MAX_KEYS (2 * JAY_NUM_UGPR) +#define NUM_TOKENS (16) + +/** SEND scoreboarding */ +struct gpr_range { + unsigned base, width; +}; + +static inline struct gpr_range +def_to_gpr(jay_function *func, jay_inst *I, jay_def x) +{ + if (x.file == GPR || x.file == UGPR) { + unsigned base = x.file == UGPR ? 
func->shader->num_regs[GPR] : 0; + return (struct gpr_range) { base + x.reg, jay_num_values(x) }; + } else { + return (struct gpr_range) { 0, 0 }; + } +} + +static inline void +sync_sbid(jay_function *func, jay_inst *I, uint32_t *busy, unsigned sbid) +{ + jay_builder b = jay_init_builder(func, jay_before_inst(I)); + jay_SYNC(&b, TGL_SYNC_NOP)->dep = tgl_swsb_sbid(TGL_SBID_DST, sbid); + *busy &= ~BITFIELD_BIT(sbid); +} + +static void +lower_send_local(jay_function *func, jay_block *block) +{ + struct { + BITSET_DECLARE(reading, MAX_KEYS); + BITSET_DECLARE(writing, MAX_KEYS); + } tokens[NUM_TOKENS]; + + uint32_t busy = 0; + unsigned roundrobin = 0; + + jay_foreach_inst_in_block_safe(block, I) { + /* Read-after-write */ + jay_foreach_src(I, s) { + struct gpr_range src = def_to_gpr(func, I, I->src[s]); + + u_foreach_bit(sbid, busy) { + if (BITSET_TEST_COUNT(tokens[sbid].writing, src.base, src.width)) { + sync_sbid(func, I, &busy, sbid); + } + } + } + + /* Write-after-write & write-after-read */ + jay_foreach_dst(I, d) { + struct gpr_range dst = def_to_gpr(func, I, I->dst); + + u_foreach_bit(sbid, busy) { + if (BITSET_TEST_COUNT(tokens[sbid].reading, dst.base, dst.width) || + BITSET_TEST_COUNT(tokens[sbid].writing, dst.base, dst.width)) { + sync_sbid(func, I, &busy, sbid); + } + } + } + + if (I->op == JAY_OPCODE_SEND && !jay_send_eot(I)) { + unsigned sbid = (roundrobin++) % NUM_TOKENS; + jay_set_send_sbid(I, sbid); + + if (!(busy & BITSET_BIT(sbid))) { + busy |= BITSET_BIT(sbid); + BITSET_ZERO(tokens[sbid].writing); + BITSET_ZERO(tokens[sbid].reading); + } + + struct gpr_range dst = def_to_gpr(func, I, I->dst); + BITSET_SET_COUNT(tokens[sbid].writing, dst.base, dst.width); + + jay_foreach_src(I, s) { + struct gpr_range src = def_to_gpr(func, I, I->src[s]); + BITSET_SET_COUNT(tokens[sbid].reading, src.base, src.width); + } + } + } + + /* Sync on block boundaries. */ + if (block != jay_last_block(func)) { + jay_builder b = jay_init_builder(func, jay_before_jump(block)); + + u_foreach_bit(sbid, busy) { + jay_SYNC(&b, TGL_SYNC_NOP)->dep = tgl_swsb_sbid(TGL_SBID_DST, sbid); + } + } +} + +/** + * Regdist scoreboarding + * + * Register access is tracked per pipe, with 0 (NONE) having data on the writer + * packed into a u32 with the following macros. + */ +#define make_writer(pipe, ip) (((uint32_t) ip << 3) | (uint32_t) (pipe)) +#define writer_ip(writer) (writer >> 3) +#define writer_pipe(writer) (enum tgl_pipe)(writer & BITFIELD_MASK(3)) + +#define TGL_NUM_PIPES (TGL_PIPE_ALL) +typedef uint32_t u32_per_pipe[TGL_NUM_PIPES]; + +struct swsb_state { + unsigned ip[TGL_NUM_PIPES]; + unsigned last_shape[TGL_NUM_PIPES]; + + /* finished_ip[X][Y] = ip means from the perspective of pipe X, ip on pipe Y + * has already been waited on. + */ + unsigned finished_ip[TGL_NUM_PIPES][TGL_NUM_PIPES]; + u32_per_pipe *access; +}; + +static enum tgl_pipe +inst_exec_pipe(const struct intel_device_info *devinfo, jay_inst *I) +{ + if (I->op == JAY_OPCODE_SEND || jay_op_is_control_flow(I->op) /* XXX*/) { + return TGL_PIPE_NONE; + } else if (I->op == JAY_OPCODE_MATH) { + return TGL_PIPE_MATH; + } else if (I->type == JAY_TYPE_F64) { + return TGL_PIPE_LONG; + } else if (jay_type_is_any_float(I->type)) { + return TGL_PIPE_FLOAT; + } else { + return TGL_PIPE_INT; + } +} + +/** + * Return the RegDist pipeline the hardware will synchronize with if no + * pipeline information is provided in the SWSB annotation of an + * instruction (e.g. when TGL_PIPE_NONE is specified in tgl_swsb). 
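+ *
+ * For example, an instruction reading only f32 sources infers the FLOAT pipe,
+ * one reading an integer source infers INT (on Gfx12.5+ where source types
+ * are considered), and SENDs infer no pipe at all.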
+ */ +static enum tgl_pipe +inferred_sync_pipe(const struct intel_device_info *devinfo, const jay_inst *I) +{ + bool has_int_src = false, has_long_src = false; + + if (devinfo->verx10 >= 125) { + jay_foreach_src(I, s) { + has_int_src |= !jay_type_is_any_float(jay_src_type(I, s)); + has_long_src |= jay_src_type(I, s) == JAY_TYPE_F64; + } + + /* Avoid emitting (RegDist, SWSB) annotations for long instructions on + * platforms where they are unordered as they may not be allowed. + */ + if (devinfo->has_64bit_float_via_math_pipe && has_long_src) + return TGL_PIPE_NONE; + } + + return I->op == JAY_OPCODE_SEND ? TGL_PIPE_NONE : + has_long_src ? TGL_PIPE_LONG : + has_int_src ? TGL_PIPE_INT : + TGL_PIPE_FLOAT; +} + +static void +depend_on_writer(struct swsb_state *state, struct gpr_range r, unsigned *dep) +{ + for (unsigned i = 0; i < r.width; ++i) { + uint32_t w = state->access[r.base + i][0]; + dep[writer_pipe(w)] = MAX2(dep[writer_pipe(w)], writer_ip(w)); + } +} + +#define jay_foreach_pipe(pipe) \ + for (unsigned pipe = 1; pipe < TGL_NUM_PIPES; ++pipe) + +static void +lower_regdist_local(jay_function *func, jay_block *block, u32_per_pipe *access) +{ + struct swsb_state state = { .access = access }; + jay_inst *last_sync = NULL; + bool need_deswizzle_wait = false; + + jay_foreach_inst_in_block_safe(block, I) { + enum tgl_pipe exec_pipe = inst_exec_pipe(func->shader->devinfo, I); + unsigned dep[TGL_NUM_PIPES] = { 0 }; + if (I->op == JAY_OPCODE_SYNC) { + last_sync = I; + continue; + } else if (I->op == JAY_OPCODE_DESWIZZLE_16) { + need_deswizzle_wait = true; + state.ip[TGL_PIPE_INT]++; + continue; + } + + /* Force a wait on the deswizzles at the start of the program. XXX: Is + * there a cleaner way to deal with this? + */ + if (need_deswizzle_wait) { + dep[TGL_PIPE_INT] = state.ip[TGL_PIPE_INT]; + need_deswizzle_wait = false; + } + + /* Write-after-{write, read} */ + jay_foreach_dst(I, def) { + struct gpr_range r = def_to_gpr(func, I, def); + depend_on_writer(&state, r, dep); + + for (unsigned i = 0; i < r.width; ++i) { + jay_foreach_pipe(p) { + dep[p] = MAX2(dep[p], state.access[r.base + i][p]); + } + } + } + + /* Read-after-write */ + jay_foreach_src(I, s) { + depend_on_writer(&state, def_to_gpr(func, I, I->src[s]), dep); + } + + unsigned nr_waits = 0; + unsigned last_pipe = TGL_PIPE_NONE; + + /* If dependency P implies dependency Q, drop dependency Q to avoid + * unnecessary annotations. + */ + jay_foreach_pipe(p) { + if (dep[p]) { + jay_foreach_pipe(q) { + if (dep[q] && state.finished_ip[p][q] >= dep[q]) { + dep[q] = 0; + } + } + } + } + + unsigned min_delta = 7; + jay_foreach_pipe(p) { + if (dep[p] && (exec_pipe == TGL_PIPE_NONE /* TODO: Sends */ || + dep[p] > state.finished_ip[exec_pipe][p])) { + unsigned delta = state.ip[p] - dep[p] + 1; + min_delta = MIN2(min_delta, delta); + state.finished_ip[exec_pipe][p] = dep[p]; + nr_waits++; + last_pipe = p; + } + } + + /* If we're SIMD split the same way as our dependency, we can relax the + * dependency to have each half wait in parallel. We could do even better + * with more tracking but this should be good enough for now. 
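+       *
+       * Illustrative example: a SIMD32 shader split into two SIMD16 halves
+       * (simd_split == 1) with macro length 1 relaxes a regdist of 1 to 2
+       * with replicate_dep set, so each half waits only on the matching half
+       * of its producer.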
+ */ + unsigned simd_split = jay_simd_split(func->shader, I); + unsigned shape = ((simd_split << 2) | jay_macro_length(I)) + 1; + bool same_shape = state.last_shape[last_pipe] == shape; + + if (simd_split && same_shape && nr_waits == 1 && min_delta == 1) { + min_delta += ((1 << simd_split) - 1) * jay_macro_length(I); + I->replicate_dep = true; + I->decrement_dep = last_pipe != exec_pipe; + } + + bool has_sbid = I->op == JAY_OPCODE_SEND && !jay_send_eot(I); + I->dep = (struct tgl_swsb) { + .sbid = has_sbid ? jay_send_sbid(I) : 0, + .mode = has_sbid ? TGL_SBID_SET : TGL_SBID_NULL, + .regdist = nr_waits ? min_delta : 0, + .pipe = nr_waits == 1 && (!has_sbid || + last_pipe == TGL_PIPE_FLOAT || + last_pipe == TGL_PIPE_INT) ? + last_pipe : + TGL_PIPE_ALL, + }; + + /* Fold the immediate preceding SYNC.nop into this instruction, allowing + * us to wait on both ALU and a SEND in the same annotation. + */ + if (last_sync && + jay_sync_op(last_sync) == TGL_SYNC_NOP && + I->dep.mode == TGL_SBID_NULL && + (I->dep.regdist == 0 || + inferred_sync_pipe(func->shader->devinfo, I) == I->dep.pipe)) { + + assert(last_sync->dep.regdist == 0); + assert(last_sync->dep.pipe == TGL_PIPE_NONE); + + I->dep.mode = last_sync->dep.mode; + I->dep.sbid = last_sync->dep.sbid; + + jay_remove_instruction(last_sync); + } + + if (exec_pipe != TGL_PIPE_NONE) { + /* Advance the IP by the number of physical instructions emitted */ + state.ip[exec_pipe] += + jay_macro_length(I) << jay_simd_split(func->shader, I); + + struct gpr_range r = def_to_gpr(func, I, I->dst); + uint32_t now = make_writer(exec_pipe, state.ip[exec_pipe]); + + for (unsigned i = 0; i < r.width; ++i) { + state.access[r.base + i][0] = now; + } + + jay_foreach_src(I, s) { + struct gpr_range r = def_to_gpr(func, I, I->src[s]); + for (unsigned i = 0; i < r.width; ++i) { + state.access[r.base + i][exec_pipe] = state.ip[exec_pipe]; + } + } + + state.last_shape[exec_pipe] = shape; + } + + last_sync = NULL; + } + + /* Sync on block boundaries. */ + jay_inst *first = jay_first_inst(block); + if (block != jay_first_block(func) && first && first->op != JAY_OPCODE_SEND) { + first->dep = tgl_swsb_regdist(1); + } +} + +/* + * Trivial scoreboard lowering pass for debugging use. Stalls after every + * instruction and assigns SBID zero to all messages. 
+ */ +static void +lower_trivial(jay_function *func) +{ + jay_foreach_inst_in_func_safe(func, block, I) { + if (I->op == JAY_OPCODE_SEND && !jay_send_eot(I)) { + I->dep = tgl_swsb_dst_dep(tgl_swsb_sbid(TGL_SBID_SET, 0), 1); + + jay_builder b = jay_init_builder(func, jay_after_inst(I)); + jay_SYNC(&b, TGL_SYNC_NOP)->dep = tgl_swsb_sbid(TGL_SBID_DST, 0); + } else { + I->dep = tgl_swsb_regdist(1); + } + } +} + +void +jay_lower_scoreboard(jay_shader *s) +{ + uint32_t nr_keys = s->num_regs[GPR] + s->num_regs[UGPR]; + assert(nr_keys <= MAX_KEYS && "SENDs use uninitialized stack allocation"); + u32_per_pipe *access = malloc(sizeof(*access) * nr_keys); + + jay_foreach_function(s, func) { + if (jay_debug & JAY_DBG_SYNC) { + lower_trivial(func); + } else { + jay_foreach_block(func, block) { + memset(access, 0, sizeof(*access) * nr_keys); + lower_send_local(func, block); + lower_regdist_local(func, block, access); + } + } + } + + free(access); +} diff --git a/src/intel/compiler/jay/jay_lower_spill.c b/src/intel/compiler/jay/jay_lower_spill.c new file mode 100644 index 00000000000..21fbac1777e --- /dev/null +++ b/src/intel/compiler/jay/jay_lower_spill.c @@ -0,0 +1,156 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "compiler/brw/brw_eu_defines.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* We reserve an address register for spilling by ABI */ +#define ADDRESS_REG jay_bare_reg(J_ADDRESS, 2) + +static void +insert_spill_fill(jay_builder *b, + jay_def mem, + jay_def gpr, + jay_def sp, + bool load, + unsigned *sp_delta_B, + unsigned umem_base) +{ + assert(jay_is_mem(mem) && !jay_is_mem(gpr)); + + bool uniform = mem.file == UMEM; + unsigned offs_B = mem.reg * 4; + unsigned mem_reg_B = + uniform ? (umem_base + offs_B) : (offs_B * b->shader->dispatch_width); + + /* The stack pointer needs to be offset to the desired offset */ + signed sp_adjust_B = mem_reg_B - (*sp_delta_B); + if (sp_adjust_B) { + jay_ADD(b, JAY_TYPE_U32, sp, sp, sp_adjust_B); + *sp_delta_B = mem_reg_B; + } + + const struct intel_device_info *devinfo = b->shader->devinfo; + unsigned cache = load ? LSC_CACHE(devinfo, LOAD, L1STATE_L3MOCS) : + LSC_CACHE(devinfo, STORE, L1STATE_L3MOCS); + uint32_t desc = lsc_msg_desc(devinfo, load ? LSC_OP_LOAD : LSC_OP_STORE, + LSC_ADDR_SURFTYPE_SS, LSC_ADDR_SIZE_A32, + LSC_DATA_SIZE_D32, 1, uniform, cache); + if (uniform) { + sp.num_values_m1 = 0; + } + + jay_def srcs[] = { sp, gpr }; + + jay_SEND(b, .sfid = BRW_SFID_UGM, .msg_desc = desc, .srcs = srcs, + .nr_srcs = load ? 1 : 2, .dst = load ? gpr : jay_null(), + .type = JAY_TYPE_U32, .uniform = uniform, .ex_desc = ADDRESS_REG); +} + +void +jay_lower_spill(jay_function *func) +{ + jay_builder b = jay_init_builder(func, jay_before_function(func)); + + /* We reserve the top UGPRs for spilling by ABI */ + unsigned ugpr_reservation = func->shader->num_regs[UGPR]; + assert(util_is_aligned(ugpr_reservation + 1, func->shader->dispatch_width)); + + jay_def surf = jay_bare_reg(UGPR, ugpr_reservation); + jay_def sp = jay_bare_reg(UGPR, ugpr_reservation + 1); + sp.num_values_m1 = func->shader->dispatch_width - 1; + + /* Calculate how much stack space we need */ + unsigned nr_mem = 0, nr_umem = 0; + jay_foreach_inst_in_func(func, block, I) { + if (I->op == JAY_OPCODE_MOV && jay_is_send_like(I)) { + jay_def mem = jay_is_mem(I->dst) ? I->dst : I->src[0]; + unsigned *nr = mem.file == UMEM ? 
&nr_umem : &nr_mem; + + *nr = MAX2(*nr, mem.reg + 1); + } + } + + assert((nr_umem > 0) || (nr_mem > 0)); + unsigned umem_base = (func->shader->dispatch_width * nr_mem * 4); + + /* We burn the address & stack pointer registers for all spills/fills in a + * shader. Preinitialize at the top using a scratch register. + * + * TODO: Need ABI for multi-function. + */ + assert(func->is_entrypoint); + jay_AND(&b, JAY_TYPE_U32, surf, jay_bare_reg(UGPR, 5), ~BITFIELD_MASK(10)); + jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, surf, 4); + + /* We use a 32-bit strided stack: SP = scratch + (lane ID * 4) */ + jay_def tmp2 = jay_bare_reg(GPR, func->shader->partition.base2); + jay_LANE_ID_8(&b, tmp2); + for (unsigned i = 8; i < b.shader->dispatch_width; i *= 2) { + jay_LANE_ID_EXPAND(&b, tmp2, tmp2, i); + } + + jay_SHL(&b, JAY_TYPE_U16, tmp2, tmp2, util_logbase2(4)); + jay_CVT(&b, JAY_TYPE_U32, sp, tmp2, JAY_TYPE_U16, JAY_ROUND, 0); + if (b.shader->scratch_size) { + jay_ADD(&b, JAY_TYPE_U32, sp, sp, b.shader->scratch_size); + } + + jay_foreach_block(func, block) { + /* We offset the stack pointer locally within a block to form offsets. By + * contract keep it in its canonical (unoffset) form at block boundaries. + */ + unsigned sp_delta_B = 0; + bool address_valid = true; + + jay_foreach_inst_in_block_safe(block, I) { + b.cursor = jay_before_inst(I); + + if (I->op == JAY_OPCODE_MOV && jay_is_send_like(I)) { + if (!address_valid) { + jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, surf, 4); + address_valid = true; + } + + if (jay_is_mem(I->dst)) { + insert_spill_fill(&b, I->dst, I->src[0], sp, false, &sp_delta_B, + umem_base); + func->shader->spills++; + } else { + insert_spill_fill(&b, I->src[0], I->dst, sp, true, &sp_delta_B, + umem_base); + func->shader->fills++; + } + + jay_remove_instruction(I); + } else if (I->op == JAY_OPCODE_SHUFFLE) { + /* Shuffles implicitly clobber the address register so we'll need to + * rematerialize the surface state (but be lazy). + */ + address_valid = false; + } + } + + /* Canonicalize our internal registers at block boundaries */ + if (jay_num_successors(block) > 0) { + if (!address_valid) { + jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, surf, 4); + } + + if (sp_delta_B > 0) { + jay_ADD(&b, JAY_TYPE_U32, sp, sp, -sp_delta_B); + } + } + } + + /* Note this is bogus with recursion, but recursion is not supported on any + * current graphics/compute API. + */ + func->shader->scratch_size += umem_base + (nr_umem * 4); +} diff --git a/src/intel/compiler/jay/jay_nir_algebraic.py b/src/intel/compiler/jay/jay_nir_algebraic.py new file mode 100644 index 00000000000..209f9585172 --- /dev/null +++ b/src/intel/compiler/jay/jay_nir_algebraic.py @@ -0,0 +1,95 @@ +# Copyright 2024 Intel Corporation +# SPDX-License-Identifier: MIT + +import argparse +import sys +from math import pi + +a = 'a' +b = 'b' +c = 'c' + +lower_fsign = [ + (('fsign', a), ('bcsel', ('!flt', 0, a), +1.0, + ('bcsel', ('!flt', a, 0), -1.0, 0.0))), + (('fceil', a), ('fneg', ('ffloor', ('fneg', a)))), + + # inot is free on and/or/xor sources but not dests. Apply De Morgan's. 
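+    # For example, the first rule below rewrites ~(~a & b) into (a | ~b), so
+    # the remaining inversion lands on a source where it folds into the op.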
+ (('inot', ('iand(is_used_once)', ('inot', a), b)), ('ior', a, ('inot', b))), + (('inot', ('ior(is_used_once)', ('inot', a), b)), ('iand', a, ('inot', b))), + (('inot', ('ixor(is_used_once)', ('inot', a), b)), ('ixor', a, b)), + (('inot', ('iand(is_used_once)', a, b)), ('ior', ('inot', a), ('inot', b))), + (('inot', ('ior(is_used_once)', a, b)), ('iand', ('inot', a), ('inot', b))), + (('inot', ('ixor(is_used_once)', a, b)), ('ixor', ('inot', a), b)), + + # Remove the zeroing. Down-conversion is free but extracts are not. + (('u2f32', ('extract_u8', a, 0)), ('u2f32', ('u2u8', a))), + (('u2f32', ('extract_u16', a, 0)), ('u2f32', ('u2u16', a))), + (('i2f32', ('extract_i8', a, 0)), ('i2f32', ('i2i8', a))), + (('i2f32', ('extract_i16', a, 0)), ('i2f32', ('i2i16', a))), + + (('pack_half_2x16_split', a, b), + ('pack_32_2x16_split', ('f2f16', a), ('f2f16', b))), + + # Allows us to use more modifiers + (('bcsel', a, ('iadd(is_used_once)', b, c), b), + ('iadd', ('bcsel', a, c, 0), b)), +] + + +lower_bool = [ + # Try to use conditional modifiers more + (('ieq', ('iand(is_used_once)', a, b), b), + ('ieq', ('iand', ('inot', a), b), 0)), + (('ine', ('iand(is_used_once)', a, b), b), + ('ine', ('iand', ('inot', a), b), 0)), +] + +for T, sizes, one in [('f', [16, 32], 1.0), + ('i', [8, 16, 32], 1), + ('b', [8, 16, 32], -1)]: + for sz in sizes: + if T in ['f', 'i']: + lower_bool.extend([ + ((f'{T}neg', (f'b2{T}{sz}', ('inot', 'a@1'))), + ('bcsel', a, 0, -one)), + ((f'{T}neg', (f'b2{T}{sz}', 'a@1')), ('bcsel', a, -one, 0)), + ]) + + lower_bool.extend([ + ((f'b2{T}{sz}', ('inot', 'a@1')), ('bcsel', a, 0, one)), + ((f'b2{T}{sz}', 'a@1'), ('bcsel', a, one, 0)), + ]) + +lower_bool.extend([ + ((f'b2i64', 'a@1'), ('pack_64_2x32_split', ('bcsel', a, 1, 0), 0)), +]) + +opt_sel_zero = [ + (('bcsel@32', a, 0, 1), ('iadd', ('bcsel', a, 0xffffffff, 0), 1)), + (('bcsel@32', a, 1, 0), ('ineg', ('bcsel', a, 0xffffffff, 0))), +] + + +def main() -> None: + parser = argparse.ArgumentParser() + parser.add_argument('-p', '--import-path', required=True) + parser.add_argument('output') + args = parser.parse_args() + + sys.path.insert(0, args.import_path) + import nir_algebraic # pylint: disable=import-error + + with open(args.output, 'w', encoding='utf-8') as f: + f.write('#include "jay_private.h"') + + f.write(nir_algebraic.AlgebraicPass( + "jay_nir_lower_fsign", lower_fsign).render()) + f.write(nir_algebraic.AlgebraicPass( + "jay_nir_lower_bool", lower_bool).render()) + f.write(nir_algebraic.AlgebraicPass( + "jay_nir_opt_sel_zero", opt_sel_zero).render()) + + +if __name__ == '__main__': + main() diff --git a/src/intel/compiler/jay/jay_opcodes.py b/src/intel/compiler/jay/jay_opcodes.py new file mode 100644 index 00000000000..928d1e90b04 --- /dev/null +++ b/src/intel/compiler/jay/jay_opcodes.py @@ -0,0 +1,233 @@ +# Copyright 2026 Intel Corporation +# SPDX-License-Identifier: MIT + +from typing import TYPE_CHECKING +from dataclasses import dataclass +import enum + +if TYPE_CHECKING: + from collections.abc import Mapping + + +@dataclass +class Opcode: + name: str + has_dest: bool + num_srcs: int + types: list[str] + negate: int + sat: bool + cmod: bool + side_effects: bool + _2src_commutative: bool + extra_struct: list[tuple[str, str]] + + +@enum.unique +class Props(enum.IntEnum): + NEGATE0 = 1 << 0 + NEGATE1 = 1 << 1 + NEGATE2 = 1 << 2 + NEGATE3 = 1 << 3 + SAT = 1 << 4 + CMOD = 1 << 5 + SIDE_EFFECTS = 1 << 6 + COMMUTATIVE = 1 << 7 + NO_DEST_ = 1 << 8 + NEGATE = NEGATE0 | NEGATE1 | NEGATE2 | NEGATE3 + NO_DEST = SIDE_EFFECTS | 
NO_DEST_ + + +_opcodes: dict[str, Opcode] = {} + + +def op(name: str, num_srcs: int, types: str | None = None, + props: int = 0, extra_struct: str | list[str] | None = None) -> None: + types_ = types.split(' ') if types else ['untyped'] + + # We can always negate the predicate. + negate_mask = (props & Props.NEGATE) | (1 << num_srcs) + + if extra_struct is not None: + extra_struct_ = [(' '.join(x.split(' ')[0:-1]), x.split(' ')[-1]) + for x in extra_struct] + else: + extra_struct_ = [] + + _opcodes[name] = Opcode(name, not bool(props & Props.NO_DEST_), + num_srcs, types_, negate_mask, + bool(props & Props.SAT), bool(props & Props.CMOD), + bool(props & Props.SIDE_EFFECTS), + bool(props & Props.COMMUTATIVE), + extra_struct_) + + +op('and', 2, 'u1 u16 u32', Props.NEGATE | Props.CMOD | Props.COMMUTATIVE) +op('or', 2, 'u1 u16 u32', Props.NEGATE | Props.CMOD | Props.COMMUTATIVE) +op('xor', 2, 'u1 u16 u32', Props.NEGATE | Props.CMOD | Props.COMMUTATIVE) + +op('add', 2, 'u32 s32 u64 s64 f32 f64 f16 bf16 u16 s16', + Props.SAT | Props.CMOD | Props.COMMUTATIVE | Props.NEGATE) +op('add3', 3, 'u32 s32 u64 s64 u16 s16', Props.SAT | + Props.CMOD | Props.COMMUTATIVE | Props.NEGATE) +op('asr', 2, 's32 s64 s16', Props.CMOD | Props.NEGATE0) +op('avg', 2, 's16 s32 u16 u32', Props.NEGATE | Props.CMOD) +op('bfe', 3, 'u32 s32', Props.NEGATE0) +op('bfi1', 2, 'u32') +op('bfi2', 3, 'u32') +op('bfn', 3, 'u32', Props.CMOD, ['uint8_t ctrl']) +op('bfrev', 1, 'u32', Props.NEGATE) +op('cbit', 1, 'u32', Props.NEGATE | Props.CMOD) +op('cmp', 2, 'u32', Props.NEGATE | Props.CMOD) + + +# With an 8/16-bit type, `index` specifies the element index of the source +# within the 32-bit word. For example, if src_type == U16 and index == 1, this +# converts the upper 16-bits of the input. +op('cvt', 1, 'u8 s8 u16 s16 u32 s32 u64 s64 f32 f64 f16 bf16', Props.NEGATE | Props.SAT, [ + 'enum jay_type src_type', + 'enum jay_rounding_mode rounding_mode', + 'uint8_t index', + 'uint8_t pad' +]) + +op('fbh', 1, 'u32 s32') +op('fbl', 1, 'u32') +op('lzd', 1, 'u32') +op('frc', 1, 'f32 f64', Props.NEGATE | Props.CMOD) +op('mad', 3, 'u32 s32 u16 s16 f32 f64 f16 bf16', + Props.NEGATE | Props.SAT | Props.CMOD | Props.COMMUTATIVE) +op('max', 2, 'u32 s32 u64 s64 u16 s16 f32 f64 f16 bf16', + Props.NEGATE | Props.SAT | Props.COMMUTATIVE) +op('min', 2, 'u32 s32 u64 s64 u16 s16 f32 f64 f16 bf16', + Props.NEGATE | Props.SAT | Props.COMMUTATIVE) +op('mov', 1, 'u1 u16 u32 u64', Props.NEGATE0 | Props.CMOD) +op('modifier', 1, 'f32 f64 f16 s16 s32 s64 u16 u32 u64 s8', + Props.NEGATE | Props.SAT | Props.CMOD) +op('mul', 2, 'u16 s16 f32 f64 f16 bf16', + Props.NEGATE | Props.SAT | Props.CMOD | Props.COMMUTATIVE) +op('mul_high', 2, 'u32 s32', Props.COMMUTATIVE) +op('mul_32x16', 2, 'u32 s32') +op('mul_32', 2, 'u32 s32', Props.COMMUTATIVE, ['bool high']) +op('sel', 3, 'u32 f32 u1 u16', Props.NEGATE) +op('csel', 3, 'u32 s32 f32', Props.NEGATE) +op('dp4a_uu', 3, 'u32', Props.SAT) +op('dp4a_ss', 3, 's32', Props.SAT) +op('dp4a_su', 3, 's32', Props.SAT) +op('rndd', 1, 'f16 f32 f64', Props.NEGATE | Props.SAT) +op('rndz', 1, 'f16 f32 f64', Props.NEGATE | Props.SAT) +op('rnde', 1, 'f16 f32 f64', Props.NEGATE | Props.SAT) +op('math', 1, 'f16 f32', Props.NEGATE | Props.SAT, ['enum jay_math op']) + +for n in ['rol', 'ror', 'shl', 'shr']: + op(n, 2, 'u32 u64 u16 s16 s32 s64', Props.CMOD | Props.NEGATE0) + +op('quad_swizzle', 1, 'u1 u32', 0, ['enum jay_quad_swizzle swizzle']) +op('sync', 0, None, Props.NO_DEST, ['enum tgl_sync_function op']) + +for n in ['brd', 'illegal', 'goto', 
'join', 'if', 'else', + 'endif', 'while', 'break', 'cont', 'call', 'calla', 'jmpi', 'ret', + 'loop_once']: + op(n, 0, None, Props.NO_DEST) + +op('send', 4, None, Props.SIDE_EFFECTS, [ + 'enum brw_sfid sfid', + 'uint8_t sbid', + 'bool eot', + 'bool check_tdr', + 'bool uniform', + 'bool bindless', + 'enum jay_type type_0', + 'enum jay_type type_1', + 'uint8_t ex_mlen', + 'uint32_t ex_desc_imm', +]) + +op('reloc', 0, 'u32 u64', 0, ['unsigned param', 'unsigned base']) +op('preload', 0, 'u32', 0, ['unsigned reg']) +op('deswizzle_16', 0, 'u32', Props.NO_DEST, ['unsigned dst', 'unsigned src']) + +# Calculating the lane ID requires multiple power-of-two steps each involving +# complex architectural features not modelled in the IR. +op('lane_id_8', 0, 'u16') +op('lane_id_expand', 1, 'u16', 0, ['unsigned width']) + +# Sample ID calculation +op('extract_byte_per_8lanes', 2, 'u32') +op('shr_odd_subspans_by_4', 1, 'u16') +op('and_u32_u16', 2, 'u32') + +# Pixel coord calculations. expand_quad replicates out the per-2x2 values from +# its source g0.[10...13] and - in the case of SIMD32 - g1.[10...13] into a +# per-lane value. Then offset_packed_pixel_coords adds the appropriate packed +# 2x16-bit offset within each quad, giving 2x16-bit per-lane coordinates. +op('expand_quad', 2, 'u32') +op('offset_packed_pixel_coords', 1, 'u32') +op('extract_layer', 2, 'u32') + +# Generated by RA and lowered after. Valid only for GPR/UGPR. +op('swap', 2, 'u32', Props.NO_DEST) + +# Phi function representations +# +# Unlike in NIR, we represent Phi functions as a pair of opcodes, purely +# for convenience since it makes many things easier to work with. +# +# Phis locially exist along control flow edges between blocks. PHI_DST +# lives where 𝜙 would traditionally be written, at the point where the new +# value is defined. A PHI_DST will have a corresponding PHI_SRC in each of +# its predecessor block, representing value coming in along that edge. This +# ensures that source modifiers, scalar to vector promotion, or other source +# evaluation happens in the predecessor block. +# +# The PHI_SRC refers to the SSA index of the PHI_DST. For example, 'if (..) r3 = +# r1 else r3 = r2 endif' might look +# +# (following block) | (then block) | (else block) +# START B3 +#include "util/macros.h" + +enum PACKED jay_opcode { +% for opcode in opcodes: + JAY_OPCODE_${opcode.upper()}, +% endfor + JAY_NUM_OPCODES +}; +static_assert(sizeof(enum jay_opcode) == 1); + +struct jay_opcode_info { + const char *name; + unsigned num_srcs; + + /** Bitfield of sources which support negation/abs */ + uint8_t src_mods; + + /** Which modifiers are broadly supported by the opcode. Note there may be + * further restrictions (e.g. based on types) not encoded here. + */ + bool sat; + bool cmod; + + /** Whether the operation has side effects not expressed in the SSA IR */ + bool side_effects; + + /** op(a, b, c, ...) = op(b, a, c, ...) 
*/ + bool _2src_commutative; +}; + +extern const struct jay_opcode_info jay_opcode_infos[JAY_NUM_OPCODES]; +""" + +CODE_TEMPLATE = """/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ +#include "jay_opcodes.h" + +const struct jay_opcode_info jay_opcode_infos[JAY_NUM_OPCODES] = { +% for opcode, op in opcodes.items(): + [JAY_OPCODE_${opcode.upper()}] = { + .name = "${opcode}", + .num_srcs = ${op.num_srcs}, + .src_mods = ${bin(op.negate)}, +% for mod in ["sat", "cmod", "side_effects", "_2src_commutative"]: +% if getattr(op, mod): + .${mod} = true, +% endif +% endfor + }, +% endfor +}; +""" + + +def main() -> int: + parser = argparse.ArgumentParser() + parser.add_argument('--code', action='store', default=None) + parser.add_argument('--header', action='store', default=None) + args = parser.parse_args() + + if not (args.header or args.code): + parser.error('At least one of --code or --header is required') + + try: + if args.code is not None: + with open(args.code, 'w', encoding='utf-8') as f: + f.write(Template(CODE_TEMPLATE).render(opcodes=OPCODES)) + if args.header is not None: + with open(args.header, 'w', encoding='utf-8') as f: + f.write(Template(HEADER_TEMPLATE).render(opcodes=OPCODES)) + except Exception: + print(exceptions.text_error_template().render()) + return 1 + + return 0 + + +if __name__ == '__main__': + sys.exit(main()) diff --git a/src/intel/compiler/jay/jay_opt_control_flow.c b/src/intel/compiler/jay/jay_opt_control_flow.c new file mode 100644 index 00000000000..1f337f37296 --- /dev/null +++ b/src/intel/compiler/jay/jay_opt_control_flow.c @@ -0,0 +1,137 @@ +/* + * Copyright 2026 Intel Corporation + * Copyright 2023 Valve Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/list.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* + * Detect the block "else; endif" and remove the no-op else, effectively + * removing empty else blocks. Logically, that causes critical edges, so this + * pass must run late (post-RA). + */ +static void +opt_empty_else(jay_block *blk) +{ + unsigned i = 0; + enum jay_opcode ops[] = { JAY_OPCODE_ELSE, JAY_OPCODE_ENDIF }; + + jay_foreach_inst_in_block(blk, I) { + if (i >= ARRAY_SIZE(ops) || ops[i++] != I->op) + return; + } + + if (i == ARRAY_SIZE(ops)) { + jay_remove_instruction(jay_first_inst(blk)); + } +} + +/* + * Replace short if-statements with predication. Assumes opt_empty_else already + * ran. TODO: Generalize. + */ +static void +opt_predicate(jay_function *f, jay_block *block) +{ + jay_inst *if_ = jay_last_inst(block); + if (!if_ || if_->op != JAY_OPCODE_IF) + return; + + /* If's fallthrough to the then */ + jay_block *then_block = jay_next_block(block); + assert(block->successors[0] == then_block && "successors for if"); + + /* We're searching for a single block then, so the next block is else */ + jay_block *else_block = jay_next_block(then_block); + if (block->successors[1] != else_block || + list_length(&then_block->instructions) > 3 || + !list_is_singular(&else_block->instructions)) + return; + + /* We can only access one flag per instruction, so do not predicate anything + * accessing flags. This also ensures the if-condition flag is kept live. + * + * MIN/MAX turn into SEL which cannot be predicated despite not using flags. + * + * Predicating NoMask instructions doesn't work if we are electing a nonzero + * lane but the NoMask forces lane 0. This should be optimized later. 
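+ *
+ * The intended rewrite, roughly (made-up registers and flag):
+ *
+ *    (f0) if
+ *       add r1, r2, r3      =>     (f0) add r1, r2, r3
+ *    endif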
+ */ + jay_foreach_inst_in_block(then_block, I) { + if (jay_uses_flag(I) || + I->op == JAY_OPCODE_MIN || + I->op == JAY_OPCODE_MAX || + I->op == JAY_OPCODE_CSEL || + jay_is_no_mask(I)) + return; + } + + jay_inst *endif = jay_last_inst(else_block); + if (endif->op != JAY_OPCODE_ENDIF) + return; + + /* Rewrite with predication */ + jay_builder b = jay_init_builder(f, jay_after_block(block)); + assert(if_->predication == JAY_PREDICATED && "if's are always predicated"); + + jay_foreach_inst_in_block_safe(then_block, I) { + jay_add_predicate(&b, I, *jay_inst_get_predicate(if_)); + } + + /* Remove the jumps */ + jay_remove_instruction(if_); + jay_remove_instruction(endif); +} + +/* + * Optimize "(f0) break; while" to "(!f0) while". As break/while appear in + * different blocks, we optimize the entire function at a time. + */ +static void +opt_predicate_while(jay_function *func) +{ + jay_inst *prev_break = NULL; + + jay_foreach_block(func, block) { + if (list_is_empty(&block->instructions)) { + /* Ignore empty blocks */ + } else if (jay_last_inst(block)->op == JAY_OPCODE_BREAK) { + prev_break = jay_last_inst(block); + } else if (jay_first_inst(block)->op == JAY_OPCODE_WHILE && + prev_break && + prev_break->predication) { + assert(!jay_first_inst(block)->predication); + jay_inst_get_predicate(prev_break)->negate ^= true; + + jay_remove_instruction(jay_first_inst(block)); + jay_remove_instruction(prev_break); + + jay_builder b = jay_init_builder(func, jay_before_block(block)); + jay_builder_insert(&b, prev_break); + + prev_break->op = JAY_OPCODE_WHILE; + prev_break = NULL; + } else { + prev_break = NULL; + } + } +} + +void +jay_opt_control_flow(jay_shader *s) +{ + jay_foreach_function(s, f) { + /* Iterating blocks in reverse lets both opts converge in 1 pass */ + jay_foreach_block_rev(f, block) { + opt_empty_else(block); + opt_predicate(f, block); + } + + /* Do last: opt_predicate_while depends on both previous optimizations */ + opt_predicate_while(f); + } +} diff --git a/src/intel/compiler/jay/jay_opt_dead_code.c b/src/intel/compiler/jay/jay_opt_dead_code.c new file mode 100644 index 00000000000..da9d7299d57 --- /dev/null +++ b/src/intel/compiler/jay/jay_opt_dead_code.c @@ -0,0 +1,58 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/bitset.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +static void +pass(jay_function *f) +{ + BITSET_WORD *live_set = BITSET_CALLOC(f->ssa_alloc); + + jay_foreach_inst_in_func_safe_rev(f, block, I) { + /* TODO: Allow for atomics? */ + if (!BITSET_TEST_COUNT(live_set, jay_base_index(I->dst), + jay_num_values(I->dst)) && + I->op != JAY_OPCODE_SEND) { + I->dst = jay_null(); + } + + if (!jay_is_null(I->cond_flag) && + !BITSET_TEST(live_set, jay_index(I->cond_flag)) && + (I->op != JAY_OPCODE_CMP || jay_is_null(I->dst))) { + + I->cond_flag = jay_null(); + I->conditional_mod = 0; + } + + bool no_dest = jay_is_null(I->dst) && jay_is_null(I->cond_flag); + bool side_effects = jay_opcode_infos[I->op].side_effects; + + if (no_dest && !side_effects) { + jay_remove_instruction(I); + } else { + jay_foreach_src_index(I, s, _, index) { + BITSET_SET(live_set, index); + } + } + } + + /* Eliminate phis. This step may leave dead code but it's good enough in + * practice since NIR already eliminated dead phis. 
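+ *
+ * Concretely, a PHI_SRC is removed here when the SSA index of the PHI_DST it
+ * feeds was never marked live above, i.e. nothing consumed the merged value.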
+ */ + jay_foreach_block(f, block) { + jay_foreach_phi_src_in_block(block, I) { + if (!BITSET_TEST(live_set, jay_phi_src_index(I))) { + jay_remove_instruction(I); + } + } + } + + free(live_set); +} + +JAY_DEFINE_FUNCTION_PASS(jay_opt_dead_code, pass) diff --git a/src/intel/compiler/jay/jay_opt_propagate.c b/src/intel/compiler/jay/jay_opt_propagate.c new file mode 100644 index 00000000000..25a58253d93 --- /dev/null +++ b/src/intel/compiler/jay/jay_opt_propagate.c @@ -0,0 +1,282 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/lut.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +static enum jay_type +canonicalize_for_bit_compare(enum jay_type type) +{ + enum jay_type base = jay_base_type(type); + return (base == JAY_TYPE_S) ? jay_type_rebase(type, JAY_TYPE_U) : type; +} + +static bool +propagate_cmod(jay_function *func, jay_inst *I, jay_inst **defs) +{ + enum jay_type cmp_type = I->type; + enum jay_conditional_mod cmod = I->conditional_mod; + jay_inst *def = NULL; + + /* TODO: Generalize cmod propagation */ + if (jay_type_size_bits(cmp_type) != 32) + return false; + + /* Pattern match `cmp ssa, 0` or `cmp 0, ssa`. */ + jay_foreach_ssa_src(I, s) { + if (jay_is_zero(I->src[1 - s])) { + def = defs[jay_base_index(I->src[s])]; + + /* Canonicalize the cmod to have the zero second */ + cmod = s == 1 ? jay_conditional_mod_swap_sources(cmod) : cmod; + break; + } + } + + /* Check if we can fold into the def */ + if (!def || !jay_is_null(def->cond_flag) || !jay_opcode_infos[def->op].cmod) + return false; + + /* "Neither Saturate nor conditional modifier allowed with DW integer + * multiply." + * + * Could be refined. + */ + if (def->op == JAY_OPCODE_MUL && !jay_type_is_any_float(def->type)) + return false; + + enum jay_type instr_type = def->type; + + if (cmod == JAY_CONDITIONAL_NE || cmod == JAY_CONDITIONAL_EQ) { + cmp_type = canonicalize_for_bit_compare(cmp_type); + instr_type = canonicalize_for_bit_compare(instr_type); + } + + if (instr_type != cmp_type) + return false; + + jay_builder b = jay_init_builder(func, jay_before_inst(I)); + jay_set_conditional_mod(&b, def, I->cond_flag, cmod); + return true; +} + +static jay_def +jay_compose_src(jay_def to, jay_def from) +{ + if (to.abs) { + from.negate = false; + from.abs = true; + } + + from.negate ^= to.negate; + return from; +} + +static bool +uses_modifiers(const jay_inst *I) +{ + jay_foreach_src(I, s) { + if (I->src[s].abs || I->src[s].negate) + return true; + } + + return I->saturate; +} + +static void +propagate_modifier(jay_inst *I, unsigned s, jay_inst *mod) +{ + /* Check if we can propagate abs/neg here in general */ + if (!jay_has_src_mods(I, s) || mod->saturate) + return; + + /* Try to make the types compatible. */ + if (jay_src_type(I, s) != mod->type) { + if (I->op == JAY_OPCODE_SEL && !uses_modifiers(I)) { + I->type = mod->type; + } else { + return; + } + } + + jay_replace_src(&I->src[s], mod->src[0]); + I->src[s] = jay_compose_src(I->src[s], mod->src[0]); +} + +static void +propagate_not(jay_inst *I, unsigned s, jay_inst *mod) +{ + /* Handle inot specially for predicates, and logic operations per bspec text: + * + * When used with logic instructions (and, not, or, xor), [the + * negate] field indicates whether the source bits are + * inverted... regardless of the source type. 
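+ *
+ * Illustrative example: given x = inot(a), "and f, x, b" can instead read a
+ * directly with its source-negate bit set, behaving as "and f, ~a, b".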
+ */ + if ((s == I->num_srcs - I->predication) || + I->op == JAY_OPCODE_AND || + I->op == JAY_OPCODE_OR || + I->op == JAY_OPCODE_XOR) { + jay_replace_src(&I->src[s], mod->src[0]); + I->src[s].negate ^= true; + } else if (I->op == JAY_OPCODE_BFN) { + jay_replace_src(&I->src[s], mod->src[0]); + jay_set_bfn_ctrl(I, util_lut3_invert_source(jay_bfn_ctrl(I), s)); + } +} + +static void +propagate_forwards(jay_function *f) +{ + jay_inst **defs = calloc(f->ssa_alloc, sizeof(defs[0])); + + jay_foreach_inst_in_func_safe(f, block, I) { + jay_builder b = jay_init_builder(f, jay_before_inst(I)); + + jay_foreach_dst_index(I, _, d) { + defs[d] = I; + } + + /* Copy propagate individual components into vectors */ + jay_foreach_src_index(I, s, c, idx) { + jay_inst *def = defs[idx]; + assert(def != NULL && "SSA"); + + if (def->op == JAY_OPCODE_MOV && + !def->predication && + jay_num_values(def->dst) == 1 && + jay_num_values(def->src[0]) == 1 && + I->src[s].file == def->src[0].file) { + + jay_insert_channel(&b, &I->src[s], c, def->src[0]); + } + } + + /* Don't propagate into phis yet - TODO: File awareness */ + if (I->op == JAY_OPCODE_PHI_SRC || I->op == JAY_OPCODE_SEND) + continue; + + jay_foreach_ssa_src(I, s) { + /* Copy propagate whole vectors */ + jay_def src = I->src[s]; + if (src.collect) + continue; + + jay_inst *def = defs[jay_base_index(src)]; + assert(def != NULL && "SSA"); + + if (!jay_defs_equivalent(def->dst, src) || def->predication) + continue; + + if (def->op == JAY_OPCODE_MOV) { + /* Default values must have the same file as their dest, do not + * propagate invalid there. Also don't propagate inverse-ballots. + * Also only source 0 can read ARF (i.e. ballotted flags). + */ + if ((I->src[s].file == def->src[0].file) || + ((!jay_inst_has_default(I) || + &I->src[s] != jay_inst_get_default(I)) && + !(I->src[s].file == UFLAG && !jay_is_imm(def->src[0])) && + !(I->src[s].file == FLAG) && + (s == 0 || !jay_is_flag(def->src[0])) && + !(jay_is_imm(def->src[0]) && I->src[s].negate))) { + + jay_replace_src(&I->src[s], def->src[0]); + } + } else if (def->op == JAY_OPCODE_MODIFIER && !jay_uses_flag(def)) { + propagate_modifier(I, s, def); + } else if (def->op == JAY_OPCODE_NOT && !jay_uses_flag(def)) { + propagate_not(I, s, def); + } + } + + if (I->op == JAY_OPCODE_CMP && propagate_cmod(f, I, defs)) { + /* Even if we propagate the predicate write, there might be uses of the + * register value (TODO: Maybe check for this and skip propagating in + * that case?). So we cannot remove the compare, just strip the cond + * flag. Furthermore the CMP we always clobber some predicate, so give + * it an immediately-dead one instead. 
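+ *
+ * Rough example: for x = and(a, b) followed by cmp.ne(x, 0), the .ne flag
+ * write is folded onto the AND, while the CMP stays behind (minus its flag)
+ * in case its register result still has uses.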
+ */ + I->cond_flag = jay_alloc_def(&b, I->cond_flag.file, 1); + continue; + } + } + + free(defs); +} + +static bool +propagate_fsat(jay_inst *I, jay_inst *fsat) +{ + if (fsat->op != JAY_OPCODE_MODIFIER || + fsat->predication || + fsat->src[0].negate || + fsat->src[0].abs || + (fsat->conditional_mod && !jay_opcode_infos[I->op].cmod) || + I->conditional_mod || + I->type != fsat->type || + !jay_type_is_any_float(fsat->type)) + return false; + + /* saturate(saturate(x)) = saturate(x) */ + I->saturate |= fsat->saturate; + I->dst = fsat->dst; + I->cond_flag = fsat->cond_flag; + I->conditional_mod = fsat->conditional_mod; + return true; +} + +static void +propagate_backwards(jay_function *f) +{ + jay_inst **uses = calloc(f->ssa_alloc, sizeof(uses[0])); + BITSET_WORD *multiple = BITSET_CALLOC(f->ssa_alloc); + + jay_foreach_inst_in_func_rev(f, block, I) { + /* Record uses */ + jay_foreach_src_index(I, s, c, ssa_index) { + if (uses[ssa_index]) + BITSET_SET(multiple, ssa_index); + else + uses[ssa_index] = I; + } + + /* TODO: f64 sat propagation */ + if (jay_num_values(I->dst) != 1) + continue; + + assert(jay_is_ssa(I->dst)); + + jay_inst *use = uses[jay_base_index(I->dst)]; + if (!use || BITSET_TEST(multiple, jay_base_index(I->dst))) + continue; + + if (jay_opcode_infos[I->op].sat && + jay_type_is_any_float(I->type) && + propagate_fsat(I, use)) { + + jay_remove_instruction(use); + continue; + } + + /* Fold UGPR->{GPR, FLAG} copies coming out of NIR */ + if (I->type == use->type && + I->op != JAY_OPCODE_PHI_DST && + use->op == JAY_OPCODE_MOV) { + + I->dst = use->dst; + jay_remove_instruction(use); + continue; + } + } + + free(multiple); + free(uses); +} + +JAY_DEFINE_FUNCTION_PASS(jay_opt_propagate_forwards, propagate_forwards) +JAY_DEFINE_FUNCTION_PASS(jay_opt_propagate_backwards, propagate_backwards) diff --git a/src/intel/compiler/jay/jay_print.c b/src/intel/compiler/jay/jay_print.c new file mode 100644 index 00000000000..3b8c3781d20 --- /dev/null +++ b/src/intel/compiler/jay/jay_print.c @@ -0,0 +1,309 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "compiler/brw/brw_eu_defines.h" +#include "util/lut.h" +#include "util/macros.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +#define ENUM_TO_STR(x, arr) \ + ({ \ + assert(x < ARRAY_SIZE(arr)); \ + arr[x]; \ + }) + +static const char *jay_conditional_mod_str[] = { + [JAY_CONDITIONAL_EQ] = ".eq", [JAY_CONDITIONAL_NE] = ".ne", + [JAY_CONDITIONAL_GT] = ".gt", [JAY_CONDITIONAL_LT] = ".lt", + [JAY_CONDITIONAL_GE] = ".ge", [JAY_CONDITIONAL_LE] = ".le", + [JAY_CONDITIONAL_OV] = ".ov", [JAY_CONDITIONAL_NAN] = ".nan", +}; + +static const char *jay_arf_str[] = { + [JAY_ARF_NULL] = "_", + [JAY_ARF_MASK] = "mask", + [JAY_ARF_CONTROL] = "ctrl", + [JAY_ARF_TIMESTAMP] = "timestamp", +}; + +static const char *jay_file_str[JAY_FILE_LAST + 1] = { + [GPR] = "r", [UGPR] = "u", [FLAG] = "f", [UFLAG] = "uf", + [J_ADDRESS] = "a", [ACCUM] = "acc", [UACCUM] = "uacc", [J_ARF] = "arf", + [MEM] = "m", [UMEM] = "um", [TEST_FILE] = "t", +}; + +static const char *jay_base_types[] = { + [JAY_TYPE_U] = "u", [JAY_TYPE_S] = "s", [JAY_TYPE_F] = "f", [JAY_TYPE_BF] = "bf" +}; + +void +jay_print_type(FILE *fp, enum jay_type t) +{ + fprintf(fp, ".%s%u", ENUM_TO_STR(jay_base_type(t), jay_base_types), + jay_type_size_bits(t)); +} + +static void +jay_print_def(FILE *fp, const jay_inst *I, int src) +{ + jay_def def = src == -2 ? I->cond_flag : src == -1 ? 
I->dst : I->src[src]; + unsigned len = jay_num_values(def); + const char *file = ENUM_TO_STR(def.file, jay_file_str); + bool has_lu = jay_is_ssa(def) && !jay_is_null(def) && src >= 0; + unsigned lu_bit = has_lu ? jay_source_last_use_bit(I->src, src) : 0; + + bool has_index = jay_channel(def, 0) != JAY_SENTINEL; + bool has_reg = !def.collect && def.reg && def.file != J_ARF; + + if (jay_is_null(def)) { + has_reg = false; + fprintf(fp, "_"); + } else if (def.file == J_ARF) { + fputs(ENUM_TO_STR(jay_base_index(def), jay_arf_str), fp); + } else if (def.collect) { + assert(has_index && "else would be contiguous"); + fprintf(fp, "("); + for (unsigned i = 0; i < len; ++i) { + if (i) + fprintf(fp, ", "); + + if (jay_channel(def, i)) { + if (has_lu && BITSET_TEST(I->last_use, lu_bit)) + fprintf(fp, "*"); + + fprintf(fp, "%s%u", file, jay_channel(def, i)); + ++lu_bit; + } else { + fprintf(fp, "_"); + } + } + fprintf(fp, ")"); + } else if (has_index) { + fprintf(fp, "%s%s%u", + has_lu && BITSET_TEST(I->last_use, lu_bit) ? "*" : "", file, + jay_channel(def, 0)); + if (len > 1) { + fprintf(fp, ":%s%u", file, jay_channel(def, len - 1)); + } + } + + if (has_reg) { + if (has_index) + fprintf(fp, "("); + + fprintf(fp, "%s%u%s", file, def.reg, def.hi ? "h" : ""); + if (len > 1) { + fprintf(fp, ":%s%u", file, def.reg + len - 1); + } + + if (has_index) + fprintf(fp, ")"); + } +} + +static void +jay_print_src(FILE *fp, jay_inst *I, unsigned s) +{ + jay_def src = I->src[s]; + fprintf(fp, "%s%s", src.negate ? "-" : "", src.abs ? "(abs)" : ""); + + if (jay_is_imm(src)) { + fprintf(fp, "0x%X", jay_as_uint(src)); + if (util_is_probably_float(jay_as_uint(src))) { + float f = uif(jay_as_uint(src)); + fprintf(fp, fabs(f) >= 1000000.0 ? " (%e)" : " (%f)", f); + } + } else { + jay_print_def(fp, I, s); + } +} + +/* XXX: copypaste of brw_print_swsb */ +static void +jay_print_swsb(FILE *f, const struct tgl_swsb swsb) +{ + if (swsb.regdist) { + fprintf(f, "%s@%d", + (swsb.pipe == TGL_PIPE_FLOAT ? "F" : + swsb.pipe == TGL_PIPE_INT ? "I" : + swsb.pipe == TGL_PIPE_LONG ? "L" : + swsb.pipe == TGL_PIPE_ALL ? "A" : + swsb.pipe == TGL_PIPE_MATH ? "M" : + swsb.pipe == TGL_PIPE_SCALAR ? "S" : + ""), + swsb.regdist); + } + + if (swsb.mode) { + if (swsb.regdist) + fprintf(f, " "); + + fprintf(f, "$%d%s", swsb.sbid, + (swsb.mode & TGL_SBID_SET ? "" : + swsb.mode & TGL_SBID_DST ? ".dst" : + ".src")); + } +} + +void +jay_print_inst(FILE *fp, jay_inst *I) +{ + const char *sep = ""; + + if (!jay_is_null(I->dst)) { + jay_print_def(fp, I, -1); + sep = ", "; + } + + if (!jay_is_null(I->cond_flag)) { + fprintf(fp, "%s", sep); + jay_print_def(fp, I, -2); + } + + if (!jay_is_null(I->dst) || !jay_is_null(I->cond_flag)) { + fprintf(fp, " = "); + } + + if (I->predication) { + fprintf(fp, "("); + jay_print_src(fp, I, jay_inst_get_predicate(I) - I->src); + + if (jay_inst_has_default(I)) { + fprintf(fp, "/"); + jay_print_src(fp, I, jay_inst_get_default(I) - I->src); + } + + fprintf(fp, ")"); + } + + if (I->op == JAY_OPCODE_MATH) { + jay_print_inst_info(fp, I, ""); + } else { + fprintf(fp, "%s", jay_opcode_infos[I->op].name); + } + + if (I->type != JAY_TYPE_UNTYPED) { + jay_print_type(fp, I->type); + } + + if (I->op == JAY_OPCODE_BFN) { + fprintf(fp, ".(%s)", util_lut3_to_str[jay_bfn_ctrl(I)]); + } + + const char *cmod = ENUM_TO_STR(I->conditional_mod, jay_conditional_mod_str); + fprintf(fp, "%s%s ", I->saturate ? ".sat" : "", cmod ? 
cmod : ""); + sep = ""; + + for (unsigned i = 0; i < I->num_srcs - I->predication; i++) { + fprintf(fp, "%s", sep); + jay_print_src(fp, I, i); + + enum jay_type T = jay_src_type(I, i); + if (T != I->type && !(T == JAY_TYPE_U1 && jay_is_flag(I->src[i]))) { + jay_print_type(fp, T); + } + + sep = ", "; + } + + if (I->op != JAY_OPCODE_MATH) { + sep = jay_print_inst_info(fp, I, sep); + } + + /* Software scoreboard dependency info */ + if (I->dep.regdist || I->dep.mode) { + fprintf(fp, "%s%s%s", strlen(sep) ? " {" : "{", + I->replicate_dep ? "*" : "", I->decrement_dep ? "+" : ""); + jay_print_swsb(fp, I->dep); + fprintf(fp, "}"); + } + + fprintf(fp, "\n"); +} + +static inline void +indent(FILE *fp, jay_block *block, bool interior) +{ + for (unsigned i = 0; i < block->indent + interior; i++) + fprintf(fp, " "); +} + +static void +comma_separate(FILE *fp, jay_block *block, bool *first) +{ + if (*first) { + indent(fp, block, true); + *first = false; + } else { + fprintf(fp, ", "); + } +} + +void +jay_print_block(FILE *fp, jay_block *block) +{ + indent(fp, block, false); + fprintf(fp, "B%d%s%s", block->index, block->uniform ? " [uniform]" : "", + block->loop_header ? " [loop header]" : ""); + bool first = true; + jay_foreach_predecessor(block, p) { + fprintf(fp, "%s B%d", first ? " <-" : "", (*p)->index); + first = false; + } + fprintf(fp, " {\n"); + + /* We group phi destinations/sources for legibility */ + first = true; + jay_foreach_phi_dst_in_block(block, phi) { + comma_separate(fp, block, &first); + jay_print_def(fp, phi, -1); + } + fprintf(fp, "%s", first ? "" : " = 𝜙\n"); + + jay_foreach_inst_in_block(block, inst) { + if (inst->op != JAY_OPCODE_PHI_DST && inst->op != JAY_OPCODE_PHI_SRC) { + indent(fp, block, true); + jay_print_inst(fp, inst); + } + } + + first = true; + jay_foreach_phi_src_in_block(block, phi) { + comma_separate(fp, block, &first); + fprintf(fp, "𝜙%u = ", jay_phi_src_index(phi)); + jay_print_def(fp, phi, 0); + } + fprintf(fp, "%s", first ? "" : "\n"); + + indent(fp, block, false); + fprintf(fp, "}"); + first = true; + jay_foreach_successor(block, succ) { + if (succ) { + fprintf(fp, "%s B%d", first ? 
" ->" : "", succ->index); + first = false; + } + } + fprintf(fp, "\n\n"); +} + +void +jay_print_func(FILE *fp, jay_function *f) +{ + fprintf(fp, "Jay function: \n\n"); + jay_foreach_block(f, block) { + jay_print_block(fp, block); + } +} + +void +jay_print(FILE *fp, jay_shader *s) +{ + jay_foreach_function(s, f) { + jay_print_func(fp, f); + } +} diff --git a/src/intel/compiler/jay/jay_private.h b/src/intel/compiler/jay/jay_private.h new file mode 100644 index 00000000000..2799eaa7b7b --- /dev/null +++ b/src/intel/compiler/jay/jay_private.h @@ -0,0 +1,72 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "jay_ir.h" +#include "nir.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define JAY_DBG_NOOPT BITFIELD_BIT(0) +#define JAY_DBG_PRINTDEMAND BITFIELD_BIT(1) +#define JAY_DBG_SPILL BITFIELD_BIT(2) +#define JAY_DBG_SYNC BITFIELD_BIT(3) +extern int jay_debug; + +bool jay_nir_lower_bool(nir_shader *nir); +bool jay_nir_opt_sel_zero(nir_shader *nir); +bool jay_nir_lower_fsign(nir_shader *nir); + +void jay_compute_liveness(jay_function *f); +void jay_calculate_register_demands(jay_function *f); + +void jay_spill(jay_function *func, enum jay_file file, unsigned limit); +void jay_partition_grf(jay_shader *shader); +void jay_register_allocate(jay_shader *s); +void jay_assign_flags(jay_shader *s); +void jay_repair_ssa(jay_function *func); + +const char *jay_file_to_string(enum jay_file file); +void jay_print_type(FILE *f, enum jay_type t); +void jay_print_inst(FILE *f, jay_inst *I); +void jay_print_block(FILE *f, jay_block *block); +void jay_print_func(FILE *fp, jay_function *func); +void jay_print(FILE *f, jay_shader *s); + +#ifndef NDEBUG +void jay_validate(jay_shader *s, const char *when); +void jay_validate_ra(jay_function *func); +#else +static inline void +jay_validate(jay_shader *s, const char *when) +{ +} + +static inline void +jay_validate_ra(jay_function *func) +{ +} +#endif + +void jay_opt_propagate_forwards(jay_shader *s); +void jay_opt_propagate_backwards(jay_shader *s); +void jay_opt_dead_code(jay_shader *s); +void jay_opt_control_flow(jay_shader *s); + +void jay_lower_pre_ra(jay_shader *s); +void jay_lower_post_ra(jay_shader *s); +void jay_lower_spill(jay_function *func); +void jay_lower_simd_width(jay_shader *s); +void jay_lower_scoreboard(jay_shader *s); + +struct jay_shader_bin * +jay_to_binary(jay_shader *s, void *const_data, size_t const_data_size); + +#ifdef __cplusplus +} /* extern C */ +#endif diff --git a/src/intel/compiler/jay/jay_register_allocate.c b/src/intel/compiler/jay/jay_register_allocate.c new file mode 100644 index 00000000000..65cbf05c080 --- /dev/null +++ b/src/intel/compiler/jay/jay_register_allocate.c @@ -0,0 +1,1659 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include +#include "util/bitscan.h" +#include "util/bitset.h" +#include "util/macros.h" +#include "util/ralloc.h" +#include "util/sparse_bitset.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" +#include "shader_enums.h" + +/** + * Register allocation for Jay shaders. + * + * We use a decoupled register allocation approach. First, we spill values + * until the register demand fits within the size of each register file. + * + * Secondly, we assign registers using a tree-scan algorithm similar to the + * one described in Colombet et al 2011: + * + * Q. Colombet, B. Boissinot, P. Brisk, S. Hack and F. 
Rastello, + * "Graph-coloring and treescan register allocation using repairing," + * 2011 Proceedings of the 14th International Conference on Compilers, + * Architectures and Synthesis for Embedded Systems (CASES), Taipei, + * Taiwan, 2011, pp. 45-54, doi: 10.1145/2038698.2038708. + * + * We also use a union-find set to construct equivalence classes for phi webs, + * and attempt to use the same regs for registers in that class, similar to + * the "Aggressive Pre-Coalescing" step described in that paper. + * + * Finally, we deconstruct SSA. + */ + +static inline bool +is_ra_src(jay_def d) +{ + return d.file < JAY_NUM_RA_FILES; +} + +#define jay_foreach_ra_file(file) \ + for (enum jay_file file = 0; file < JAY_NUM_RA_FILES; ++file) + +#define jay_foreach_ra_src(I, s) \ + jay_foreach_src(I, s) \ + if (is_ra_src(I->src[s]) && !jay_is_null(I->src[s])) + +static enum jay_stride +jay_min_stride_for_type(enum jay_type T) +{ + unsigned bits = jay_type_size_bits(T); + + /* We need at least enough contiguous bits per-lane to store a scalar */ + if (bits == 64) + return JAY_STRIDE_8; + else if (bits == 32) + return JAY_STRIDE_4; + else + return JAY_STRIDE_2; +} + +static enum jay_stride +jay_max_stride_for_type(enum jay_type T) +{ + /* Horizontal stride can be at most 4 */ + return (jay_type_size_bits(T) >= 16) ? JAY_STRIDE_8 : JAY_STRIDE_4; +} + +static bool +jay_restrict_mixed_strides(jay_inst *I, unsigned s) +{ + /* From the hardware spec section "Register Region Restrictions": + * + * "In case of all floating point data types used in destination:" and + * + * "In case where source or destination datatype is 64b or operation is + * integer DWord multiply:" and + * + * "Src2 Restrictions" + * + * Register Regioning patterns where register data bit location + * of the LSB of the channels are changed between source and + * destination are not supported on Src0 and Src1 except for + * broadcast of a scalar. + * + * Therefore, ban mixed-strides in these cases. + * + * Similarly, SENDs cannot do any regioning so restrict that too. + */ + return jay_type_is_any_float(I->type) || + jay_type_size_bits(I->type) == 64 || + jay_is_send_like(I) || + I->op == JAY_OPCODE_MUL_32X16 || + I->op == JAY_OPCODE_MUL_32 || + s == 2; +} + +static enum jay_stride +jay_dst_stride_minmax(jay_inst *I, bool do_max) +{ + enum jay_stride min = jay_min_stride_for_type(I->type); + enum jay_stride max = jay_max_stride_for_type(I->type); + + /* Destination stride must be equal to the ratio of the sizes of the + * execution data type to the destination type + */ + if (I->op == JAY_OPCODE_CVT) { + min = MAX2(min, jay_min_stride_for_type(jay_src_type(I, 0))); + } + + /* V/UV types are restricted */ + if (I->op == JAY_OPCODE_SHR_ODD_SUBSPANS_BY_4) { + return JAY_STRIDE_2; + } + + /* The src2 restriction quoted above effectively implies we should not stride + * destinations of 3-source instructions either. + */ + if (jay_num_isa_srcs(I) >= 3) { + return min; + } + + return (do_max && !jay_restrict_mixed_strides(I, 0)) ? max : min; +} + +static enum jay_stride +jay_src_stride_minmax(jay_inst *I, unsigned s, bool do_max) +{ + enum jay_stride min = jay_min_stride_for_type(jay_src_type(I, s)); + enum jay_stride max = jay_max_stride_for_type(jay_src_type(I, s)); + + /* SENDs cannot do any regioning so force exactly the types of the sources + * regardless of the type of the destination. + * + * Shuffles could theoretically support regioning but it would be nontrivial + * and probably pointless most of the time. 
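+ *
+ * (Roughly: a send payload is consumed as raw contiguous GRFs with no
+ * regioning applied, so its sources must already sit at the packed minimum
+ * stride.)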
+ */ + if (jay_is_send_like(I) || jay_is_shuffle_like(I)) { + return min; + } + + /* While "add.u16 r0<2>, r1<4>" is legal, "add.u16 r0, r1<4>" is not. + * Conservatively assume the destination is packed and restrict the source + * stride accordingly. This satisfies the special restrictions. + */ + if (jay_type_size_bits(I->type) <= 16) { + max = JAY_STRIDE_4; + } + + /* "add.u16 r0.8, g1<2>" is not legal. We don't generate this normally yet + * (preferring to burn the upper bits) but it is used internally. + */ + if (I->op == JAY_OPCODE_LANE_ID_EXPAND) { + max = JAY_STRIDE_2; + } + + if (jay_restrict_mixed_strides(I, s) && + jay_type_size_bits(jay_src_type(I, s)) < jay_type_size_bits(I->type)) { + + return jay_dst_stride_minmax(I, do_max); + } + + return (do_max && !jay_restrict_mixed_strides(I, s)) ? max : min; +} + +struct affinity { + /** + * If there is a vector affinity defined for this SSA def, it is relative to + * some representative SSA index. Else 0 if there is no affinity. + */ + uint32_t repr; + + /** If the representative: offset in registers from the base. + * + * If not the representative: offset in registers from the representative. */ + signed offset:4; + + /** + * If true, this value is used in an end-of-thread SEND and requires high + * registers. + */ + bool eot:1; + + /** If true, this UGPR needs full GRF alignment */ + bool grf_align :1; + unsigned align_offs:4; + unsigned padding :22; +}; +static_assert(sizeof(struct affinity) == 8, "packed"); + +struct phi_web_node { + /* Parent index, or circular for root */ + uint32_t parent; + + /* If root, assigned register, or ~0 if no register assigned. */ + uint16_t reg; + + /* Rank, at most log2(n) so need ~5-bits */ + uint16_t rank; + + /* If root, affinity for the whole web */ + struct affinity affinity; +}; +static_assert(sizeof(struct phi_web_node) == 16, "packed"); + +static unsigned +phi_web_find(struct phi_web_node *web, unsigned x) +{ + if (web[x].parent == x) { + /* Root */ + return x; + } else { + /* Search up the tree */ + unsigned root = x; + while (web[root].parent != root) + root = web[root].parent; + + /* Compress path. Second pass ensures O(1) memory usage. */ + while (web[x].parent != x) { + unsigned temp = web[x].parent; + web[x].parent = root; + x = temp; + } + + return root; + } +} + +static void +phi_web_union(struct phi_web_node *web, unsigned x, unsigned y) +{ + x = phi_web_find(web, x); + y = phi_web_find(web, y); + + if (x == y) + return; + + /* Union-by-rank: ensure x.rank >= y.rank */ + if (web[x].rank < web[y].rank) { + SWAP(x, y); + } + + web[y].parent = x; + + /* Increment rank if necessary */ + if (web[x].rank == web[y].rank) { + web[x].rank++; + } +} + +#define NO_REG 0xFFFF + +static inline jay_reg +make_reg(enum jay_file file, uint16_t reg) +{ + return (((uint16_t) file) << 13) | reg; +} + +static inline unsigned +r_reg(jay_reg r) +{ + assert(r != NO_REG); + return r & BITFIELD_MASK(13); +} + +static inline enum jay_file +r_file(jay_reg r) +{ + assert(r != NO_REG); + assert((r >> 13) < JAY_NUM_RA_FILES); + return r >> 13; +} + +static jay_def +def_from_reg(jay_reg r) +{ + return jay_bare_reg(r_file(r), r_reg(r)); +} + +typedef struct jay_ra_state { + /** Size of each register file */ + unsigned num_regs[JAY_NUM_RA_FILES]; + + /** First GPR that may be used for EOT sends */ + unsigned eot_offs; + + /** Phi coalescing data structure */ + struct phi_web_node *phi_web; + + /** + * Global SSA index -> jay_reg map. Unlike reg_for_index, once a register + * is picked it will not be shuffled. 
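+ *
+ * local_ra() uses this map to seed each block's local reg_for_index for the
+ * block's live-in values, keeping cross-block assignments stable.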
+ */ + jay_reg *global_reg_for_index; + + /** + * Block currently being processed. ra_state is allocated once per + * function but the following fields are updated as we go through the + * program. This keeps RA linearish time. + */ + jay_block *block; + + /** Builder for inserting shuffle code */ + jay_builder bld; + + /** Local SSA index -> jay_reg map. Only defined for live indices. */ + jay_reg *reg_for_index; + + /** + * Value occupying a register (register -> uint32_t reverse maps) for + * registers that are not available. Undefined for available registers. + */ + uint32_t *index_for_reg[JAY_NUM_RA_FILES]; + + /** Set of registers that are available */ + BITSET_WORD *available_regs[JAY_NUM_RA_FILES]; + + /** + * Within assign_regs_for_inst, the set of registers that have been + * assigned and are therefore pinned. + * + * Invariant: zeroed on entry to assign_regs_for_inst. + */ + BITSET_WORD *pinned[JAY_NUM_RA_FILES]; + + /** Vector affinities for each def. */ + struct affinity *affinities; +} jay_ra_state; + +static inline jay_reg +current_reg(const jay_ra_state *ra, uint32_t index) +{ + assert(index > 0 && index < ra->bld.func->ssa_alloc); + jay_reg reg = ra->reg_for_index[index]; + + assert(reg != NO_REG); + assert(ra->index_for_reg[r_file(reg)][r_reg(reg)] == index); + return reg; +} + +/** (dst, src) pairs for use in parallel copies */ +struct jay_parallel_copy { + jay_reg dst, src; +}; + +static void +add_copy(struct util_dynarray *copies, jay_reg dst, jay_reg src) +{ + if (dst != src) { + assert(r_file(dst) == r_file(src)); + util_dynarray_append(copies, ((struct jay_parallel_copy) { dst, src })); + } +} + +static jay_def +push_temp(jay_builder *b, jay_reg reg, bool stride4) +{ + jay_def tmp = def_from_reg(reg); + + if (stride4 && jay_def_stride(b->shader, tmp) != JAY_STRIDE_4) { + jay_def new = def_from_reg(0); + jay_MOV(b, tmp, new); + tmp = new; + } + + return tmp; +} + +static void +pop_temp(jay_builder *b, struct jay_temp_regs t, jay_def temp) +{ + if (temp.file == GPR && temp.reg != t.gpr) { + jay_MOV(b, temp, def_from_reg(t.gpr)); + } +} + +/* + * Insert a single logical copy. Like jay_MOV but expands to multiple moves + * involving a temporary register in some cases. + */ +static void +mov(jay_builder *b, jay_def dst, jay_def src, struct jay_temp_regs temps) +{ + if (dst.file == MEM && src.file == MEM) { + assert(temps.gpr != NO_REG && "ensured by the spill limit"); + jay_def temp = push_temp(b, temps.gpr, true /* stride4 */); + jay_MOV(b, temp, src); + jay_MOV(b, dst, temp); + pop_temp(b, temps, temp); + } else if (dst.file == UMEM && src.file == UMEM) { + assert(temps.ugpr != NO_REG && "ensured by the spill limit"); + jay_MOV(b, def_from_reg(temps.ugpr), src); + jay_MOV(b, dst, def_from_reg(temps.ugpr)); + } else { + jay_MOV(b, dst, src); + } +} + +/* + * Sequentialize a parallel copy. temps are registers free *before* the + * parallel copy. A temporary might be the destination of a copy, but it + * cannot be the source of any copy (since copying a free register is + * undefined). Therefore it cannot be a part of a cycle, so it is free for use + * (only) when handling cycles, which must happen before sequential copies. + */ +static void +jay_emit_parallel_copies(jay_builder *b, + struct jay_parallel_copy *pcopies, + unsigned num_copies, + struct jay_temp_regs temps) +{ + /* Compact away trivial copies upfront to reduce runtime. 
*/ + unsigned new_num_copies = 0; + for (unsigned i = 0; i < num_copies; ++i) { + assert(r_file(pcopies[i].dst) == r_file(pcopies[i].src)); + + if (pcopies[i].dst != pcopies[i].src) { + pcopies[new_num_copies++] = pcopies[i]; + } + } + + num_copies = new_num_copies; + if (num_copies == 0) + return; + + assert(num_copies < UINT16_MAX); + BITSET_WORD *done = BITSET_CALLOC(num_copies); + uint16_t *reg_use_count[JAY_NUM_RA_FILES]; + jay_foreach_ra_file(f) { + reg_use_count[f] = calloc(b->shader->num_regs[f], sizeof(uint16_t)); + } + + struct jay_parallel_copy *simple = malloc(num_copies * sizeof(*simple)); + unsigned num_simple = 0; + +#ifndef NDEBUG + BITSET_WORD *packed = BITSET_CALLOC(UINT16_MAX); + + if (0) { + const char *files = "ruMm"; + printf("[[\n"); + + for (unsigned i = 0; i < num_copies; i++) { + printf(" %c%u = %c%u\n", files[r_file(pcopies[i].dst)], + r_reg(pcopies[i].dst), files[r_file(pcopies[i].src)], + r_reg(pcopies[i].src)); + } + + printf("]]\n"); + } + + /** + * Assert that each parallel copy destination is unique: no reg can appear + * as the destination of two parallel copies. + */ + for (unsigned i = 0; i < num_copies; i++) { + assert(!BITSET_TEST(packed, pcopies[i].dst)); + BITSET_SET(packed, pcopies[i].dst); + } + + free(packed); +#endif + + for (unsigned i = 0; i < num_copies; i++) { + ++reg_use_count[r_file(pcopies[i].src)][r_reg(pcopies[i].src)]; + } + + bool progress; + do { + progress = false; + + /* Step 1: resolve paths in the transfer graph. This means finding + * copies whose destination aren't blocked by something else and then + * emitting them, continuing this process until every copy is blocked + * and there are only cycles left. + * + * TODO: We should note that src is also available in dest to unblock + * cycles that src is involved in. + */ + for (unsigned i = 0; i < num_copies; i++) { + struct jay_parallel_copy *copy = &pcopies[i]; + + if (!BITSET_TEST(done, i) && + reg_use_count[r_file(copy->dst)][r_reg(copy->dst)] == 0) { + + simple[num_simple++] = *copy; + BITSET_SET(done, i); + --reg_use_count[r_file(copy->src)][r_reg(copy->src)]; + progress = true; + } + } + } while (progress); + + /* Step 2: resolve cycles through swapping. + * + * At this point, the transfer graph should consist of only cycles. + * The reason is that, given any reg n_1 that's the source of a + * remaining entry, it has a destination n_2, which (because every + * copy is blocked) is the source of some other copy whose destination + * is n_3, and so we can follow the chain until we get a cycle. If we + * reached some other node than n_1: + * + * n_1 -> n_2 -> ... -> n_i + * ^ | + * |-------------| + * + * then n_2 would be the destination of 2 copies, which is illegal + * (checked above in an assert). So n_1 must be part of a cycle: + * + * n_1 -> n_2 -> ... -> n_i + * ^ | + * |---------------------| + * + * and this must be only cycle n_1 is involved in, because any other + * path starting from n_1 would also have to end in n_1, resulting in + * a node somewhere along the way being the destination of 2 copies + * when the 2 paths merge. + * + * The way we resolve the cycle is through picking a copy (n_1, n_2) + * and swapping n_1 and n_2. This moves n_1 to n_2, so n_2 is taken + * out of the cycle: + * + * n_1 -> ... -> n_i + * ^ | + * |--------------| + * + * and we can keep repeating this until the cycle is empty. After each + * swap, we update sources of blocking copies. At that point, every + * blocking copy's source should be contained within our destination. 
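+ *
+ * Worked example (illustrative): the cycle r1 -> r2 -> r3 -> r1 takes two
+ * swaps. swap(r1, r2) satisfies r2 and leaves r2's old value in r1; after
+ * the source rewrite, swap(r1, r3) satisfies r3 and leaves r3's old value
+ * in r1, which is exactly what r1 wanted.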
+ */ + for (unsigned i = 0; i < num_copies; i++) { + struct jay_parallel_copy *copy = &pcopies[i]; + + if (!BITSET_TEST(done, i) && copy->dst != copy->src) { + jay_def dst = def_from_reg(copy->dst), src = def_from_reg(copy->src); + assert(dst.file == src.file); + enum jay_file file = dst.file; + jay_reg tmp = (file == GPR || file == MEM) ? temps.gpr : temps.ugpr; + + if (tmp != NO_REG) { + struct jay_temp_regs t = { .gpr = temps.gpr2, .ugpr = temps.ugpr2 }; + jay_def temp = push_temp(b, tmp, file == MEM /* stride4 */); + { + mov(b, temp, dst, t); + mov(b, dst, src, t); + mov(b, src, temp, t); + } + pop_temp(b, temps, temp); + } else { + jay_SWAP(b, dst, src); + } + + for (unsigned j = 0; j < num_copies; j++) { + if (pcopies[j].src == copy->dst) + pcopies[j].src = copy->src; + } + + /* Simple copies are deferred. Their destinations do not conflict with + * our swaps, but we need to swap their sources to sink. + */ + for (unsigned j = 0; j < num_simple; j++) { + assert(simple[j].dst != copy->src && simple[j].dst != copy->dst); + + if (simple[j].src == copy->src) + simple[j].src = copy->dst; + else if (simple[j].src == copy->dst) + simple[j].src = copy->src; + } + } + + BITSET_SET(done, i); + } + + /* Emit moves after swaps because they fan out and thus increase demand. + * This gives us more freedom around temporaries. The rewrite of simple + * copies above ensures correctness. + * + * Simiarly, we first emit memory-memory copies since those require + * temporaries but only register copies can clobber the temporaries. + */ + for (unsigned i = 0; i < num_simple; i++) { + jay_def dst = def_from_reg(simple[i].dst); + jay_def src = def_from_reg(simple[i].src); + + if (jay_is_mem(dst) && jay_is_mem(src)) { + mov(b, dst, src, temps); + } + } + + for (unsigned i = 0; i < num_simple; i++) { + jay_def dst = def_from_reg(simple[i].dst); + jay_def src = def_from_reg(simple[i].src); + + if (!(jay_is_mem(dst) && jay_is_mem(src))) { + mov(b, dst, src, temps); + + if (temps.gpr == simple[i].dst || temps.gpr == simple[i].src) { + temps.gpr = NO_REG; + } + + if (temps.ugpr == simple[i].dst || temps.ugpr == simple[i].src) { + temps.ugpr = NO_REG; + } + } + } + + jay_foreach_ra_file(f) { + free(reg_use_count[f]); + } + + free(simple); + free(done); +} + +static bool +reg_is_available(jay_ra_state *ra, jay_reg reg) +{ + assert(reg != NO_REG); + return BITSET_TEST(ra->available_regs[r_file(reg)], r_reg(reg)); +} + +static void +assign_reg_for_index(jay_ra_state *ra, uint32_t index, jay_reg reg) +{ + /* Update our data structures */ + ra->reg_for_index[index] = reg; + ra->index_for_reg[r_file(reg)][r_reg(reg)] = index; + BITSET_CLEAR(ra->available_regs[r_file(reg)], r_reg(reg)); + + /* Update the web to the most recent register. Heuristic from Colombet. */ + ra->phi_web[phi_web_find(ra->phi_web, index)].reg = reg; + + /* Post-conditions */ + assert(!reg_is_available(ra, reg)); + assert(current_reg(ra, index) == reg); +} + +static void +release_reg(jay_ra_state *ra, jay_reg reg) +{ + /* Update available_regs only - the reg<-->index maps are invalidated. 
*/ + BITSET_SET(ra->available_regs[r_file(reg)], r_reg(reg)); +} + +static unsigned +register_demand(jay_ra_state *ra, enum jay_file f) +{ + unsigned n = ra->num_regs[f]; + return n - __bitset_prefix_sum(ra->available_regs[f], n, BITSET_WORDS(n)); +} + +static jay_reg +try_find_free_reg(jay_ra_state *ra, enum jay_file file, unsigned except) +{ + unsigned i; + + /* Prefer stride 4 temporaries, since they are more compatible and thus + * should reduce swapping on average. + */ + if (file == GPR) { + BITSET_FOREACH_SET(i, ra->available_regs[file], ra->num_regs[file]) { + if (i != except && + jay_gpr_to_stride(&ra->bld.shader->partition, i) == JAY_STRIDE_4) { + return make_reg(file, i); + } + } + } + + BITSET_FOREACH_SET(i, ra->available_regs[file], ra->num_regs[file]) { + if (i != except) { + return make_reg(file, i); + } + } + + return NO_REG; +} + +static jay_reg +find_free_reg(jay_ra_state *ra, enum jay_file file, unsigned except) +{ + jay_reg reg = try_find_free_reg(ra, file, except); + + if (reg == NO_REG) { + fprintf(stderr, "file %u, current demand %u, target %u\n", file, + register_demand(ra, file), ra->num_regs[file]); + UNREACHABLE("there should have been a free register"); + } + + return reg; +} + +static inline struct jay_temp_regs +find_temp_regs(jay_ra_state *ra) +{ + jay_reg gpr = try_find_free_reg(ra, GPR, ~0); + jay_reg ugpr = try_find_free_reg(ra, UGPR, ~0); + + return (struct jay_temp_regs) { + .gpr = gpr, + .ugpr = ugpr, + .gpr2 = try_find_free_reg(ra, GPR, gpr), + .ugpr2 = try_find_free_reg(ra, UGPR, ugpr), + }; +} + +static unsigned +pick_regs(jay_ra_state *ra, + enum jay_file file, + unsigned size, + unsigned alignment, + enum jay_stride min_stride, + enum jay_stride max_stride, + jay_inst *I, + jay_def var, + jay_def *last_killed, + bool is_src) +{ + struct jay_partition *partition = &ra->bld.shader->partition; + unsigned first = 0, end = ra->num_regs[file]; + unsigned ugpr_per_grf = jay_ugpr_per_grf(ra->bld.shader); + bool must_tie = I->op == JAY_OPCODE_LANE_ID_EXPAND; + must_tie &= !is_src; + + /* Cross-lane access cannot be SIMD split if the source/destination registers + * overlap, but as long as we don't tie those destinations, we're ok. + */ + bool may_tie = !jay_is_shuffle_like(I); + + /* Ensure we do not cross partitions */ + if (file == UGPR && size > 16) { + first = partition->large_ugpr_block.start; + end = partition->large_ugpr_block.start + partition->large_ugpr_block.len; + } + + /* Sources used by end-of-thread sends must be at the end of the file */ + if (I->op == JAY_OPCODE_SEND && jay_send_eot(I)) { + first = ra->eot_offs; + } + + /* If possible, keep sources in place to avoid shuffles. */ + if (is_src && jay_channel(var, 0) != 0) { + unsigned cur = r_reg(ra->reg_for_index[jay_channel(var, 0)]); + enum jay_stride stride = jay_gpr_to_stride(partition, cur); + + if (!BITSET_TEST_COUNT(ra->pinned[file], cur, size) && + util_is_aligned(cur, alignment) && + cur >= first && + cur + size <= end && + (file != GPR || (min_stride <= stride && stride <= max_stride))) { + return cur; + } + } + + unsigned best_cost = UINT32_MAX; + unsigned best_reg = 0; + struct affinity affinity = + ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, 0))].affinity; + + assert(alignment >= size && "alignment must be a multiple of size"); + + for (unsigned r = first; r + size <= end; r += alignment) { + unsigned cost = 0; + bool tied = last_killed && last_killed->reg == r; + enum jay_stride stride = + file == GPR ? jay_gpr_to_stride(partition, r) : min_stride; + + if ((tied ? 
!may_tie : + (must_tie || BITSET_TEST_COUNT(ra->pinned[file], r, size))) || + !(min_stride <= stride && stride <= max_stride)) + continue; + + /* Assigning a stride that is too big may result in SIMDness splitting. + * Model that cost so we prefer packed registers. + */ + cost += stride - min_stride; + + /* If we are used for end-of-thread and it is not in the appropriate + * register, we will need to insert 1 copy per channel at the end. + */ + if (affinity.eot && r < ra->eot_offs) + cost += size; + + /* If there are stricter alignment requirements later, model the cost of + * inserting copies for that. + */ + if (affinity.grf_align && + !util_is_aligned(r - affinity.align_offs, ugpr_per_grf)) + cost += size; + + if (affinity.repr == jay_channel(var, 0)) { + /* If we are the collect representative but the final collect won't + * actually be usable, the whole vector will need to be copied. + */ + if (!util_is_aligned(r - affinity.offset, 8) || + (affinity.eot && r - affinity.offset < ra->eot_offs)) { + cost += 8; + } + } else if (affinity.repr) { + /* If we are used for a collect but not in the right place, we will + * similarly insert copies. + */ + if (ra->reg_for_index[affinity.repr] != NO_REG && + r_reg(ra->reg_for_index[affinity.repr]) != r - affinity.offset) { + + cost += size; + } + } + + for (unsigned c = 0; c < size; ++c) { + unsigned i = r + c; + + /* If the register is unavailable, account for the cost of shuffling */ + if (!BITSET_TEST(ra->available_regs[file], i) && !tied) { + cost++; + + /* ..plus the cost of shuffling back. */ + if (u_sparse_bitset_test(&ra->block->live_out, + ra->index_for_reg[file][i])) + cost++; + } + + /* Model the cost of shuffling for phis */ + if (c < jay_num_values(var)) { + struct phi_web_node *phi_web = + &ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, c))]; + if (phi_web->reg != NO_REG && r_reg(phi_web->reg) != i) { + cost += 2; + } + } + + /* Choosing this register will pin it, leaving it unavailable to later + * smaller sources which will need to be shuffled. Account for those + * moves. + * + * TODO: Faster algorithm. + */ + jay_foreach_src_index(I, s, c, index) { + if (jay_num_values(I->src[s]) < size && + ra->reg_for_index[index] == make_reg(file, i)) { + cost++; + } + } + } + + if (cost < best_cost) { + best_cost = cost; + best_reg = r; + + /* If we find something with 0 cost, we are guaranteed to pick this + * register, so terminate early. This speeds up the search. + */ + if (cost == 0) { + break; + } + } + } + + assert(best_cost != UINT32_MAX && "we always find something"); + assert(best_reg + size <= ra->num_regs[file]); + return best_reg; +} + +struct window { + jay_reg base; + uint16_t length; +}; +static_assert(sizeof(struct window) == 4, "packed"); + +static void +assign_regs_for_inst(jay_ra_state *ra, jay_inst *I) +{ + jay_shader *shader = ra->bld.shader; + jay_def *vars[JAY_MAX_OPERANDS]; + jay_def *last_killed[JAY_NUM_RA_FILES] = { 0 }; + jay_def saved_srcs[JAY_MAX_SRCS]; + struct jay_parallel_copy copies[JAY_MAX_DEF_LENGTH * JAY_MAX_OPERANDS]; + uint32_t eviction_indices[JAY_MAX_DEF_LENGTH * JAY_MAX_OPERANDS]; + unsigned nr_vars = 0, nr_copies = 0; + + /* Gather temporary registers that are free /before/ any shuffling */ + struct jay_temp_regs temp_regs = find_temp_regs(ra); + + /* Save sources so we can get at last-use info even after munging */ + typed_memcpy(saved_srcs, I->src, I->num_srcs); + + /* Gather sources (in order) then destinations. 
This order (with a stable + * sort) ensures we see killed sources before same-size destinations, + * naturally tying the last source to the destination. Predicated default + * values rely on this invariant for correctness. + */ + jay_foreach_ra_src(I, s) { + /* Filter out duplicate scalar sources - they should only be assigned + * once. Duplicated vector sources are lowered away as a precondition. + */ + bool duplicate = false; + if (jay_num_values(I->src[s]) == 1) { + uint32_t index = jay_index(I->src[s]); + + for (unsigned i = 0; i < nr_vars; ++i) { + jay_def var = *(vars[i]); + duplicate |= (jay_num_values(var) == 1 && jay_index(var) == index); + } + } + + if (!duplicate) { + vars[nr_vars++] = &I->src[s]; + + /* Record the old registers as parallel copies to be filled in later. + * Then release the old registers to be reassigned. + */ + jay_foreach_index(I->src[s], _, index) { + jay_reg reg = current_reg(ra, index); + assert(reg != NO_REG); + + eviction_indices[nr_copies] = index; + copies[nr_copies++] = (struct jay_parallel_copy) { .src = reg }; + release_reg(ra, reg); + } + } + } + + if (!jay_is_null(I->dst) && I->dst.file < JAY_NUM_RA_FILES) { + vars[nr_vars++] = &I->dst; + } + + /* Sort variables by size in descending order. We use insertion sort + * because it is stable, adaptive, and faster than mergesort for small n. + * + * Algorithm from CLRS. + */ + for (unsigned i = 1; i < nr_vars; ++i) { + jay_def *pivot = vars[i]; + unsigned j, key = pivot->num_values_m1; + + for (j = i; j > 0 && key > vars[j - 1]->num_values_m1; --j) { + vars[j] = vars[j - 1]; + } + + vars[j] = pivot; + } + + /* Partition `copies` into "source shuffles" and "livethrough shuffles" */ + uint32_t first_eviction_copy = nr_copies; + + /* Choose registers for sources/destinations in order */ + for (unsigned i = 0; i < nr_vars; ++i) { + bool is_src = vars[i] >= I->src; + bool killed = false; + jay_def var = *(vars[i]); + unsigned size = jay_num_values(var); + if (is_src) { + assert(util_is_power_of_two_nonzero(size) && "NPOT sources lowered"); + } else { + size = util_next_power_of_two(size); + } + + unsigned alignment = I->op == JAY_OPCODE_EXPAND_QUAD ? 1 : size; + enum jay_file file = var.file; + enum jay_stride min_stride = JAY_STRIDE_2, max_stride = JAY_STRIDE_8; + + assert(size > 0 && file < JAY_NUM_RA_FILES && "filtered above"); + + if (is_src) { + /* If a source is duplicated, we need to take the most constrained + * version. This matters for 3-src restrictions. + */ + jay_foreach_src(I, s) { + if (jay_defs_equivalent(var, I->src[s])) { + alignment = MAX2(alignment, jay_src_alignment(shader, I, s)); + min_stride = + MAX2(jay_src_stride_minmax(I, s, false), min_stride); + max_stride = MIN2(jay_src_stride_minmax(I, s, true), max_stride); + } + } + + unsigned s = vars[i] - I->src; + + /* Sources are considered killed only if completely killed */ + unsigned lu = jay_source_last_use_bit(saved_srcs, s); + + killed = true; + for (unsigned i = 0; i < size; ++i) { + if (jay_channel(I->src[s], i) == 0 || + !BITSET_TEST(I->last_use, lu + i)) { + killed = false; + break; + } + } + } else { + alignment = MAX2(alignment, jay_dst_alignment(shader, I)); + min_stride = jay_dst_stride_minmax(I, false); + max_stride = jay_dst_stride_minmax(I, true); + } + + /* Choose registers satisfying the constraints and minimizing shuffles */ + unsigned base = + pick_regs(ra, file, size, alignment, min_stride, max_stride, I, var, + is_src ? 
NULL : last_killed[file], is_src); + jay_reg reg = make_reg(file, base); + + /* If we decided to tie, process that */ + if (!is_src && last_killed[file] && last_killed[file]->reg == base) { + /* Fully killed source so we can zero a contiguous range. Note we need + * to use the unpadded size to avoid leaking a register for vec3 + * destinations tied to vec4 sources. + */ + unsigned offs = + jay_source_last_use_bit(saved_srcs, last_killed[file] - I->src); + BITSET_CLEAR_COUNT(I->last_use, offs, jay_num_values(var)); + last_killed[file] = NULL; + } else { + /* Otherwise pin our choice */ + BITSET_SET_COUNT(ra->pinned[file], base, size); + + for (unsigned c = 0; c < size; ++c) { + /* Evict any livethrough value interfering with our choice */ + if (!(is_src && jay_channel(var, c) == 0) && + !reg_is_available(ra, reg + c)) { + uint32_t index = ra->index_for_reg[file][base + c]; + struct jay_parallel_copy copy = { .src = reg + c }; + eviction_indices[nr_copies] = index; + copies[nr_copies++] = copy; + release_reg(ra, reg + c); + } + } + } + + jay_set_reg(vars[i], base); + + jay_foreach_index(var, c, index) { + assign_reg_for_index(ra, index, reg + c); + } + + if (killed) { + last_killed[file] = vars[i]; + } + } + + /* Set .reg late so duplicated scalar sources are handled properly */ + jay_foreach_ra_src(I, s) { + if (I->src[s]._payload != JAY_SENTINEL) { + jay_set_reg(&I->src[s], + r_reg(ra->reg_for_index[jay_channel(I->src[s], 0)])); + } + } + + /* Look up where shuffled sources ended up */ + for (unsigned i = 0; i < first_eviction_copy; ++i) { + copies[i].dst = ra->reg_for_index[eviction_indices[i]]; + } + + /* Assign new registers for evicted values */ + for (unsigned i = first_eviction_copy; i < nr_copies; ++i) { + copies[i].dst = find_free_reg(ra, r_file(copies[i].src), ~0); + assign_reg_for_index(ra, eviction_indices[i], copies[i].dst); + } + + /* Shuffle everything */ + ra->bld.cursor = jay_before_inst(I); + jay_emit_parallel_copies(&ra->bld, copies, nr_copies, temp_regs); + + /* Reset data structures */ + for (unsigned i = 0; i < nr_vars; ++i) { + jay_def var = *(vars[i]); + BITSET_CLEAR_COUNT(ra->pinned[var.file], var.reg, + util_next_power_of_two(jay_num_values(var))); + } + + /* Sources selected for early-kill have had their last_use fields cleared. + * Anything else is late-killed. Release those registers. 
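+ *
+ * (saved_srcs preserves the pre-assignment view of the sources, so the
+ * kill_idx walk below still lines up with the last_use bits even though
+ * I->src has been rewritten with physical registers above.)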
+ */ + unsigned kill_idx = 0; + jay_foreach_ssa_src(I, s) { + jay_foreach_index(saved_srcs[s], c, idx) { + if (is_ra_src(I->src[s]) && BITSET_TEST(I->last_use, kill_idx)) { + release_reg(ra, make_reg(I->src[s].file, I->src[s].reg + c)); + } + + kill_idx++; + } + } +} + +static void +local_ra(jay_ra_state *ra, jay_block *block) +{ + ra->block = block; + + /* Initialize local data structures based on global state */ + jay_foreach_ra_file(file) { + BITSET_SET_COUNT(ra->available_regs[file], 0, ra->num_regs[file]); + } + + U_SPARSE_BITSET_FOREACH_SET(&block->live_in, i) { + if (ra->global_reg_for_index[i] != NO_REG) { + assign_reg_for_index(ra, i, ra->global_reg_for_index[i]); + } + } + + /* Assign registers locally */ + jay_foreach_inst_in_block(block, I) { + if (I->op == JAY_OPCODE_PHI_SRC) { + break; + } else if (I->op == JAY_OPCODE_PHI_DST) { + /* Phis are special as we never shuffle them */ + unsigned index = jay_index(I->dst); + jay_reg reg = ra->phi_web[phi_web_find(ra->phi_web, index)].reg; + + if (reg == NO_REG || !reg_is_available(ra, reg)) { + reg = find_free_reg(ra, I->dst.file, ~0); + } + + assign_reg_for_index(ra, jay_index(I->dst), reg); + I->dst.reg = r_reg(reg); + } else if (I->op == JAY_OPCODE_PRELOAD) { + /* Preloads always get what they want */ + I->dst.reg = jay_preload_reg(I); + jay_reg base = make_reg(I->dst.file, I->dst.reg); + + jay_foreach_comp(I->dst, c) { + assert(reg_is_available(ra, base + c) && "preloads always work"); + assign_reg_for_index(ra, jay_channel(I->dst, c), base + c); + } + } else { + /* For normal instructions, assign registers. */ + assign_regs_for_inst(ra, I); + } + + /* Release registers for destinations that are immediately killed */ + jay_foreach_index(I->dst, _, index) { + if (BITSET_TEST(ra->bld.func->dead_defs, index)) { + release_reg(ra, current_reg(ra, index)); + } + } + + if (jay_debug & JAY_DBG_PRINTDEMAND) { + printf("(RA) [G:%u\tU:%u] ", register_demand(ra, GPR), + register_demand(ra, UGPR)); + jay_print_inst(stdout, I); + } + } + + /* Gather temporary registers that are free /before/ any shuffling */ + struct jay_temp_regs temp_regs = find_temp_regs(ra); + + /* Reconcile local state with the global structures */ + jay_foreach_ra_file(file) { + BITSET_SET_COUNT(ra->available_regs[file], 0, ra->num_regs[file]); + } + + /* Extend live ranges for correctness. Might be a better solution though. 
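+ *
+ * (The idea: phi sources and the control-flow tail of the block are
+ * rewritten below to read global registers, after the reconciliation
+ * copies. Forcing their operands into live_out ensures they get a global
+ * register assigned and survive the shuffle.)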
*/ + jay_foreach_inst_in_block_rev(block, I) { + if (I->op != JAY_OPCODE_PHI_SRC && !jay_op_is_control_flow(I->op)) { + break; + } + + jay_foreach_ra_src(I, s) { + u_sparse_bitset_set(&block->live_out, jay_index(I->src[s])); + } + } + + /* Already assigned global registers need to be shuffled back */ + struct util_dynarray copies = UTIL_DYNARRAY_INIT; + + U_SPARSE_BITSET_FOREACH_SET(&block->live_out, i) { + jay_reg lreg = ra->reg_for_index[i], greg = ra->global_reg_for_index[i]; + + if (lreg != NO_REG && greg != NO_REG) { + add_copy(&copies, greg, lreg); + assign_reg_for_index(ra, i, greg); + } + } + + /* Live-out variables defined in this block need global registers assigned */ + U_SPARSE_BITSET_FOREACH_SET(&block->live_out, i) { + jay_reg reg = ra->reg_for_index[i]; + + if (ra->global_reg_for_index[i] == NO_REG && reg != NO_REG) { + if (!reg_is_available(ra, reg)) { + jay_reg old = reg; + reg = find_free_reg(ra, r_file(reg), ~0); + add_copy(&copies, reg, old); + } + + assign_reg_for_index(ra, i, reg); + ra->global_reg_for_index[i] = reg; + } + } + + /* Gather temporary registers free after shuffling (before phis) */ + block->temps_out = find_temp_regs(ra); + + /* Handle the end of the block */ + ra->bld.cursor = jay_before_block(block); + + jay_foreach_inst_in_block_rev(block, I) { + if (I->op != JAY_OPCODE_PHI_SRC && !jay_op_is_control_flow(I->op)) { + ra->bld.cursor = jay_after_inst(I); + break; + } + + jay_foreach_ra_src(I, s) { + jay_set_reg(&I->src[s], + r_reg(ra->global_reg_for_index[jay_index(I->src[s])])); + } + } + + const unsigned num_pcopies = + util_dynarray_num_elements(&copies, struct jay_parallel_copy); + + jay_emit_parallel_copies(&ra->bld, copies.data, num_pcopies, temp_regs); + util_dynarray_fini(&copies); +} + +/* + * Record all phi webs. First initialize the union-find data structure + * with all SSA defs in their own singletons, then union together anything + * related by a phi. The resulting union-find structure will be the webs. + */ +static void +construct_phi_webs(struct phi_web_node *web, jay_function *f) +{ + for (unsigned i = 0; i < f->ssa_alloc; ++i) { + web[i] = (struct phi_web_node) { .parent = i, .reg = NO_REG }; + } + + jay_foreach_block(f, block) { + jay_foreach_phi_src_in_block(block, phi) { + phi_web_union(web, jay_index(phi->src[0]), jay_phi_src_index(phi)); + } + } +} + +static void +insert_parallel_copies_for_phis(jay_function *f) +{ + jay_reg *phi_dsts = calloc(f->ssa_alloc, sizeof(jay_reg)); + struct util_dynarray copies = UTIL_DYNARRAY_INIT; + memset(phi_dsts, 0xFF, sizeof(jay_reg) * f->ssa_alloc); + + jay_foreach_block(f, block) { + jay_foreach_phi_dst_in_block(block, I) { + phi_dsts[jay_index(I->dst)] = make_reg(I->dst.file, I->dst.reg); + } + } + + jay_foreach_block(f, block) { + jay_builder b = jay_init_builder(f, jay_before_jump(block)); + + /* Copy phi source to phi destination along the edge. 
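+ *
+ * All copies for a given edge are emitted as one parallel-copy bundle, so
+ * overlapping phi sources/destinations (including swaps) are resolved by
+ * jay_emit_parallel_copies using the temporaries saved in block->temps_out.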
*/ + jay_foreach_phi_src_in_block(block, phi) { + jay_reg src = make_reg(phi->src[0].file, phi->src[0].reg); + add_copy(&copies, phi_dsts[jay_phi_src_index(phi)], src); + jay_remove_instruction(phi); + } + + const unsigned nr = + util_dynarray_num_elements(&copies, struct jay_parallel_copy); + + jay_emit_parallel_copies(&b, copies.data, nr, block->temps_out); + util_dynarray_clear(&copies); + } + + util_dynarray_fini(&copies); + free(phi_dsts); +} + +static struct jay_register_block +block_gpr_to_grf(struct jay_partition *p, enum jay_file file, unsigned block) +{ + assert(file == GPR || file == UGPR); + assert(((p->blocks[file][block].start * 16) % p->units_x16[file]) == 0); + assert(((p->blocks[file][block].len * 16) % p->units_x16[file]) == 0); + + return (struct jay_register_block) { + .start = (p->blocks[file][block].start * 16) / p->units_x16[file], + .len = (p->blocks[file][block].len * 16) / p->units_x16[file], + }; +} + +static void +print_partition(struct jay_partition *p) +{ + for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) { + for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) { + struct jay_register_block B = block_gpr_to_grf(p, f, b); + const char *file = f ? "UGPR" : "GPR"; + + if (B.len > 1) { + fprintf(stderr, "%s: %u-%u\n", file, B.start, B.start + B.len - 1); + } else if (B.len == 1) { + fprintf(stderr, "%s: %u\n", file, B.start); + } + } + } + + fprintf(stderr, "\n"); +} + +/* + * Verify that a register partition is a bijective mapping of the GRF file. + */ +static void +validate_partition(struct jay_partition *p, + unsigned stride4_header_size, + unsigned nonuniform_gprs) +{ + BITSET_DECLARE(regs, JAY_NUM_PHYS_GRF) = { 0 }; + + for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) { + for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) { + struct jay_register_block B = block_gpr_to_grf(p, f, b); + if (B.len) { + assert(B.start + B.len <= JAY_NUM_PHYS_GRF && "GRF file size"); + assert(!BITSET_TEST_COUNT(regs, B.start, B.len) && "uniqueness"); + + BITSET_SET_COUNT(regs, B.start, B.len); + } + } + } + + for (unsigned i = 0; i < JAY_NUM_PHYS_GRF; ++i) { + assert(BITSET_TEST(regs, i) && "all GRFs mapped"); + } + + assert(p->large_ugpr_block.len && "partition must have a large UGPR block"); + assert(p->base2 >= p->base8 && p->base_eot >= p->base2 && "monotonic"); + assert(p->base8 >= stride4_header_size && "header is big enough"); + assert(p->base_eot + p->units_x16[GPR] <= nonuniform_gprs && "EOT fits"); + assert(util_is_aligned(p->base8, 8) && "so vectors don't cross"); + assert(util_is_aligned(p->base2, 8) && "so vectors don't cross"); + assert(util_is_aligned(p->base_eot, 8) && "so vectors don't cross"); +} + +static void +build_partition(jay_shader *shader, unsigned *blocks, unsigned n) +{ + unsigned base = 0; + unsigned ugpr_base = 0; + struct jay_partition *p = &shader->partition; + + *p = (struct jay_partition) { + .units_x16[UGPR] = jay_ugpr_per_grf(shader) * 16, + .units_x16[GPR] = 16 / jay_grf_per_gpr(shader), + }; + + for (unsigned i = 0; i < n; ++i) { + enum jay_file file = (i & 1) ? 
GPR : UGPR; + unsigned file_i = i >> 1; + + p->blocks[file][file_i].start = (base * p->units_x16[file]) / 16; + p->blocks[file][file_i].len = (blocks[i] * p->units_x16[file]) / 16; + + if (file == UGPR && blocks[i] >= 8) { + p->large_ugpr_block = (struct jay_register_block) { + .start = (ugpr_base * p->units_x16[file]) / 16, + .len = p->blocks[file][file_i].len, + }; + } + + base += blocks[i]; + if (file == UGPR) { + ugpr_base += blocks[i]; + } + + /* GPR partition blocks must be vector size aligned to avoid crossing */ + if (file == GPR && i != (n - 1)) { + unsigned max_vec = 8; + assert(util_is_aligned(blocks[i], max_vec * jay_grf_per_gpr(shader))); + } + } +} + +/* + * Partition the register file for the entire shader. All functions must + * share the same partition for correctness with non-uniform function calls. + * For unlinked library functions, we must use the ABI partition (TODO). + */ +void +jay_partition_grf(jay_shader *shader) +{ + /* Calculate the maximum register demand across all functions in the shader. + * We will use this to choose a good partition. + */ + struct jay_partition *p = &shader->partition; + unsigned demand[JAY_NUM_GRF_FILES] = { 0 }; + + jay_foreach_function(shader, f) { + jay_compute_liveness(f); + jay_calculate_register_demands(f); + + demand[GPR] = MAX2(demand[GPR], f->demand[GPR]); + demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]); + } + + /* We must have enough register file space for the register payload, plus the + * reserved UGPRs in the case we spill. That UGPR interferes with everything + * we preload so it needs to be reserved specially here for the worst case. + */ + jay_foreach_preload(jay_shader_get_entrypoint(shader), I) { + unsigned end = jay_preload_reg(I) + jay_num_values(I->dst); + unsigned extra = I->dst.file == UGPR ? shader->dispatch_width + 1 : 0; + assert(I->dst.file < JAY_NUM_GRF_FILES); + demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra); + } + + /* Determine a good GPR/UGPR split informed by the demand calculation */ + unsigned ugpr_per_grf = jay_ugpr_per_grf(shader); + unsigned uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf); + + /* We must have enough for SIMD1 images (TODO: Check if this actually + * applies. Or if we could eliminate this with smarter partitioning even.) 
+ */ + unsigned min_ugprs = 16; + min_ugprs = MAX2(min_ugprs, 256); + + unsigned grf_block_alignment = 8 * jay_grf_per_gpr(shader); /* max_vec */ + + /* TODO: We could partition more cleverly */ + uniform_grfs = CLAMP(align(uniform_grfs, grf_block_alignment), + DIV_ROUND_UP(min_ugprs, ugpr_per_grf), + 128 - (32 * jay_grf_per_gpr(shader))); + unsigned nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs; + + /* Check the split */ + assert((uniform_grfs * ugpr_per_grf) >= min_ugprs); + assert(nonuniform_grfs >= 32 * jay_grf_per_gpr(shader)); + assert((uniform_grfs + nonuniform_grfs) == JAY_NUM_PHYS_GRF); + + /* Partition GRFs between GPR & UGPR */ + unsigned dispatch_grf = 0; + unsigned stride4_header_size = 0; + + if (shader->stage == MESA_SHADER_VERTEX) { + unsigned attrib_grfs = shader->prog_data->vue.urb_read_length * 8; + unsigned blocks[] = { + 1, /* UGPR: g0 */ + 8, /* GPR: URB output handle */ + shader->push_grfs, /* UGPR: Push constants */ + attrib_grfs, /* GPR: Vertex inputs */ + uniform_grfs - (blocks[0] + blocks[2]), /* UGPR: * */ + nonuniform_grfs - (blocks[1] + blocks[3]), /* GPR: * and EOT */ + }; + + build_partition(shader, blocks, ARRAY_SIZE(blocks)); + dispatch_grf = blocks[0] + blocks[1]; + stride4_header_size = blocks[1] + blocks[3]; + } else if (shader->stage == MESA_SHADER_FRAGMENT) { + unsigned len0 = jay_grf_per_gpr(shader); + unsigned blocks[] = { + len0, /* UGPR: g0 (and maybe g1) */ + len0 * 8, /* GPR: Barycentrics */ + uniform_grfs - len0, /* UGPR: Dispatch (eg push constants) & general */ + nonuniform_grfs - (len0 * 8), /* GPR: General & end-of-thread */ + }; + build_partition(shader, blocks, ARRAY_SIZE(blocks)); + dispatch_grf = blocks[0] + blocks[1]; + stride4_header_size = blocks[1]; + } else { + unsigned blocks[] = { uniform_grfs - 4, nonuniform_grfs, 4 }; + build_partition(shader, blocks, ARRAY_SIZE(blocks)); + } + + /* TODO: Make the stride partition smarter */ + unsigned nonuniform_gprs = nonuniform_grfs / jay_grf_per_gpr(shader); + unsigned eot_gprs = 16 / jay_grf_per_gpr(shader); + p->base8 = ROUND_DOWN_TO(nonuniform_gprs - (16 + eot_gprs), 8) + 0; + p->base2 = 8 + p->base8; + p->base_eot = 8 + p->base2; + + // print_partition(p); + validate_partition(p, stride4_header_size, nonuniform_gprs); + + if (shader->stage == MESA_SHADER_FRAGMENT && shader->dispatch_width == 32) { + shader->prog_data->fs.dispatch_grf_start_reg_32 = dispatch_grf; + } else if (shader->stage == MESA_SHADER_FRAGMENT && + shader->dispatch_width == 16) { + shader->prog_data->fs.dispatch_grf_start_reg_16 = dispatch_grf; + } else { + shader->prog_data->base.dispatch_grf_start_reg = dispatch_grf; + } + + /* By construction of our partition, the entire GRF is used. */ + shader->prog_data->base.grf_used = JAY_NUM_PHYS_GRF; + + /* Set the targets for the virtual register file accordingly */ + for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) { + for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) { + shader->num_regs[f] += p->blocks[f][b].len; + } + } + + /* TODO: These are arbitrary. Need to rework somehow, we have options. */ + shader->num_regs[MEM] = 512; + shader->num_regs[UMEM] = 2048; +} + +static void +spill_file(jay_function *f, enum jay_file file, bool *spilled) +{ + unsigned limit = f->shader->num_regs[file]; + + /* If testing spilling, set limit tightly. 
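+ *
+ * (JAY_DBG_SPILL shrinks the GPR budget to an artificially small value so
+ * the spilling paths are exercised even by shaders that would otherwise
+ * fit comfortably.)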
*/ + if ((jay_debug & JAY_DBG_SPILL) && + file == GPR && + f->shader->stage != MESA_SHADER_VERTEX) { + limit = 13; + } + + /* Ensures we don't XOR swap, XXX: TODO: FIXME */ + limit--; + + if (f->demand[file] > limit) { + /* In the worst case, we + * require 2 temporary registers to lower a memory-memory swap produced by + * parallel copy lowering, so adjust the limit to be num_regs - 2. + */ + limit--; + + /* If we spill, we need to reserve UGPRs for spilling */ + if (!(*spilled)) { + unsigned reservation = f->shader->dispatch_width + 1; + f->shader->num_regs[UGPR] -= reservation; + f->shader->partition.large_ugpr_block.len -= reservation; + } + + jay_spill(f, file, limit); + jay_validate(f->shader, "spilling"); + jay_compute_liveness(f); + jay_calculate_register_demands(f); + + if (f->demand[file] > limit) { + fprintf(stderr, "limit %u but demand %u\n", limit, f->demand[file]); + UNREACHABLE("spiller bug"); + } + + *spilled = true; + } +} + +static void +jay_register_allocate_function(jay_function *f) +{ + jay_shader *shader = f->shader; + jay_ra_state ra = { .bld.shader = shader, .bld.func = f }; + + /* Spill as needed to fit within the limits. We spill GPR before UGPR since + * spilling GPRs requires reserving a UGPR. + */ + bool spilled = false; + spill_file(f, GPR, &spilled); + spill_file(f, UGPR, &spilled); + + typed_memcpy(ra.num_regs, shader->num_regs, JAY_NUM_RA_FILES); + + /* The end of the register file is allowed for end-of-thread messages. + * Calculate the offset in GPRs. Compute shaders have this as UGPRs while + * fragment shaders have this as GPRs. + */ + if (mesa_shader_stage_is_compute(shader->stage)) { + ra.eot_offs = ROUND_DOWN_TO(ra.num_regs[UGPR], jay_ugpr_per_grf(shader)) - + jay_ugpr_per_grf(shader); + } else { + ra.eot_offs = ra.num_regs[GPR] - (16 / jay_grf_per_gpr(shader)); + } + + linear_ctx *lin_ctx = linear_context(shader); + + ra.reg_for_index = linear_alloc_array(lin_ctx, jay_reg, f->ssa_alloc); + ra.global_reg_for_index = linear_alloc_array(lin_ctx, jay_reg, f->ssa_alloc); + ra.affinities = linear_zalloc_array(lin_ctx, struct affinity, f->ssa_alloc); + + memset(ra.reg_for_index, 0xFF, sizeof(jay_reg) * f->ssa_alloc); + memset(ra.global_reg_for_index, 0xFF, sizeof(jay_reg) * f->ssa_alloc); + + jay_foreach_ra_file(file) { + const unsigned num_regs = ra.num_regs[file]; + ra.index_for_reg[file] = linear_zalloc_array(lin_ctx, uint32_t, num_regs); + ra.available_regs[file] = BITSET_LINEAR_ZALLOC(lin_ctx, num_regs); + ra.pinned[file] = BITSET_LINEAR_ZALLOC(lin_ctx, num_regs); + } + + ra.phi_web = linear_zalloc_array(lin_ctx, struct phi_web_node, f->ssa_alloc); + + /* Construct the phi equivalence classes using the union-find data + * structure. This associates all SSA values related to the same phi, + * and selects one of them as a canonical/representative value. + */ + construct_phi_webs(ra.phi_web, f); + + jay_foreach_inst_in_func(f, block, I) { + jay_foreach_src_index(I, s, c, index) { + if (jay_num_values(I->src[s]) > 1) { + uint32_t repr = UINT_MAX, repr_c = 0; + + /* Pick the representative with the smallest index, as it most + * likely dominates the other components. + */ + jay_foreach_comp(I->src[s], j) { + if (jay_channel(I->src[s], j) < repr) { + repr = jay_channel(I->src[s], j); + repr_c = j; + } + } + + ra.affinities[index].repr = repr; + ra.affinities[index].offset = repr == index ? 
c : c - repr_c; + } + + if (I->op == JAY_OPCODE_SEND && jay_send_eot(I)) { + ra.affinities[index].eot = true; + } + + if (jay_src_alignment(shader, I, s) >= jay_ugpr_per_grf(shader)) { + ra.affinities[index].grf_align = true; + ra.affinities[index].align_offs = c; + } + + ra.phi_web[phi_web_find(ra.phi_web, index)].affinity = + ra.affinities[index]; + } + } + + jay_foreach_block(f, block) { + local_ra(&ra, block); + } + + linear_free_context(lin_ctx); + + /* Validate the registers we picked before going out of SSA */ + jay_validate_ra(f); + + insert_parallel_copies_for_phis(f); + + /* Lower spills using the UGPRs we stole above. We need to update num_regs + * for correct scoreboarding calculations. + */ + if (spilled) { + jay_lower_spill(f); + f->shader->num_regs[UGPR] += f->shader->dispatch_width + 1; + } +} + +void +jay_register_allocate(jay_shader *s) +{ + jay_foreach_function(s, f) { + jay_register_allocate_function(f); + } + + s->post_ra = true; +} diff --git a/src/intel/compiler/jay/jay_repair_ssa.c b/src/intel/compiler/jay/jay_repair_ssa.c new file mode 100644 index 00000000000..794f3977cdf --- /dev/null +++ b/src/intel/compiler/jay/jay_repair_ssa.c @@ -0,0 +1,247 @@ +/* + * Copyright 2026 Intel Corporation + * Copyright 2023 Alyssa Rosenzweig + * Copyright 2023 Valve Corporation + * Copyright 2022 Collabora Ltd. + * SPDX-License-Identifier: MIT + */ + +/* + * Implementation of "Simple and Efficient + * Construction of Static Single Assignment Form", also by Braun et al. + * https://link.springer.com/content/pdf/10.1007/978-3-642-37051-9_6.pdf + */ + +#include "util/bitset.h" +#include "util/hash_table.h" +#include "util/ralloc.h" +#include "util/u_dynarray.h" +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +struct incomplete_phi { + jay_def old; + unsigned new; +}; + +struct phi { + jay_block *block; + unsigned *src; + jay_def old; + unsigned dst; +}; + +struct ctx { + /* Array of index->index maps with the remapped definition at block end */ + struct hash_table_u64 **defs; + struct hash_table_u64 *remap; + struct util_dynarray phis, indices, *incomplete_phis; + BITSET_WORD *sealed; + void *linctx; + unsigned alloc, idx_i; +}; + +#define jay_repair_foreach_phi(ctx, phi) \ + util_dynarray_foreach(&(ctx)->phis, struct phi, phi) \ + if (phi->block != NULL) + +static unsigned lookup(struct ctx *ctx, jay_block *block, jay_def def); + +static unsigned +remap_idx(struct ctx *ctx, unsigned idx) +{ + /* TODO: Switch to union-find */ + void *remapped; + while ((remapped = _mesa_hash_table_u64_search(ctx->remap, idx))) { + idx = (uintptr_t) remapped; + } + + return idx; +} + +static bool +try_remove_trivial_phi(struct ctx *ctx, struct phi *phi) +{ + unsigned same = 0; + for (unsigned i = 0; i < jay_num_predecessors(phi->block); ++i) { + unsigned src = remap_idx(ctx, phi->src[i]); + if (same && src != same && src != phi->dst) { + /* Nontrivial */ + return false; + } + + if (src != phi->dst) { + same = src; + } + } + + _mesa_hash_table_u64_insert(ctx->remap, phi->dst, (void *) (uintptr_t) same); + phi->block = NULL; + return true; +} + +static void +add_phi(struct ctx *ctx, jay_block *block, jay_def src, unsigned dst) +{ + unsigned i = 0, n = jay_num_predecessors(block); + unsigned *srcs = linear_alloc_array(ctx->linctx, unsigned, n); + jay_foreach_predecessor(block, pred) { + assert(i < n); + srcs[i++] = lookup(ctx, *pred, src); + } + + struct phi tmpl = { .block = block, .old = src, .dst = dst, .src = srcs 
}; + if (!try_remove_trivial_phi(ctx, &tmpl)) { + util_dynarray_append(&ctx->phis, tmpl); + } +} + +static unsigned +lookup(struct ctx *ctx, jay_block *block, jay_def def) +{ + /* Lookup within a block */ + struct hash_table_u64 *ht = ctx->defs[block->index]; + void *local = _mesa_hash_table_u64_search(ht, jay_index(def)); + if (local) { + return (uintptr_t) local; + } + + /* For a single predecessor, we can recurse without adding a phi. */ + bool insert_phi = jay_num_predecessors(block) > 1; + unsigned val = insert_phi ? ctx->alloc++ : + lookup(ctx, jay_first_predecessor(block), def); + + _mesa_hash_table_u64_insert(ctx->defs[block->index], jay_index(def), + (void *) (uintptr_t) val); + + if (block->loop_header && !BITSET_TEST(ctx->sealed, block->index)) { + struct incomplete_phi tmpl = { .old = def, .new = val }; + util_dynarray_append(&ctx->incomplete_phis[block->index], tmpl); + } else if (insert_phi) { + add_phi(ctx, block, def, val); + } + + return val; +} + +static void +remap(struct ctx *ctx, jay_builder *b, jay_def *inout) +{ + jay_def def = *inout; + unsigned reg = def.reg; + jay_foreach_index(def, c, index) { + unsigned el = ctx->idx_i++; + assert(el < util_dynarray_num_elements(&ctx->indices, unsigned)); + unsigned idx = *util_dynarray_element(&ctx->indices, unsigned, el); + idx = remap_idx(ctx, idx); + jay_insert_channel(b, inout, c, jay_scalar(def.file, idx)); + } + + /* We run after flag RA, so preserve flag registers */ + if (jay_is_flag(def)) { + inout->reg = reg; + } +} + +void +jay_repair_ssa(jay_function *func) +{ + jay_builder b = jay_init_builder(func, jay_before_function(func)); + void *memctx = ralloc_context(NULL); + void *linctx = linear_context(memctx); + BITSET_WORD *sealed = BITSET_LINEAR_ZALLOC(linctx, func->num_blocks); + struct ctx ctx = { .sealed = sealed, .alloc = 1, .linctx = linctx }; + unsigned *phi_remap = linear_zalloc_array(linctx, unsigned, func->ssa_alloc); + + ctx.remap = _mesa_hash_table_u64_create(memctx); + ctx.defs = + linear_alloc_array(linctx, struct hash_table_u64 *, func->num_blocks); + ctx.incomplete_phis = + linear_alloc_array(linctx, struct util_dynarray, func->num_blocks); + + jay_foreach_block(func, block) { + ctx.defs[block->index] = _mesa_hash_table_u64_create(memctx); + util_dynarray_init(&ctx.incomplete_phis[block->index], memctx); + } + + util_dynarray_init(&ctx.phis, memctx); + util_dynarray_init(&ctx.indices, memctx); + + jay_foreach_block(func, block) { + jay_foreach_inst_in_block(block, I) { + jay_foreach_src_index(I, s, c, index) { + unsigned val = lookup(&ctx, block, jay_extract(I->src[s], c)); + util_dynarray_append(&ctx.indices, val); + } + + jay_foreach_dst_index(I, d, index) { + unsigned val = ctx.alloc++; + util_dynarray_append(&ctx.indices, val); + if (I->op == JAY_OPCODE_PHI_DST) { + phi_remap[index] = val; + } + + _mesa_hash_table_u64_insert(ctx.defs[block->index], index, + (void *) (uintptr_t) val); + } + } + + /* Seal loop headers after processing the back edge */ + jay_foreach_successor(block, succ) { + if (succ->loop_header && succ->index <= block->index) { + util_dynarray_foreach(&ctx.incomplete_phis[succ->index], + struct incomplete_phi, el) { + add_phi(&ctx, succ, el->old, el->new); + } + + assert(!BITSET_TEST(sealed, succ->index) && "unique backedge"); + BITSET_SET(sealed, succ->index); + } + } + } + + /* Optimize trivial phis resulting from backedges. Use-lists would avoid the + * fixed point algorithm but this should be good enough for now. 
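+ *
+ * A phi is trivial when every source is either the phi itself or one
+ * common value (see try_remove_trivial_phi). Illustrative example:
+ *
+ *    x1 = phi(x0, x1)   -->   x1 is remapped to x0
+ *
+ * Removing one trivial phi can expose another, hence the fixed point loop.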
+ */ + bool progress; + do { + progress = false; + jay_repair_foreach_phi(&ctx, phi) { + progress |= try_remove_trivial_phi(&ctx, phi); + } + } while (progress); + + /* Now apply everything */ + jay_foreach_block(func, block) { + jay_foreach_phi_src_in_block(block, I) { + jay_set_phi_src_index(I, phi_remap[jay_phi_src_index(I)]); + } + + jay_foreach_inst_in_block(block, I) { + jay_foreach_ssa_src(I, s) { + remap(&ctx, &b, &I->src[s]); + } + + remap(&ctx, &b, &I->dst); + remap(&ctx, &b, &I->cond_flag); + } + } + + jay_repair_foreach_phi(&ctx, phi) { + b.cursor = jay_before_block(phi->block); + jay_PHI_DST(&b, jay_scalar(phi->old.file, phi->dst)); + + unsigned i = 0; + jay_foreach_predecessor(phi->block, pred) { + b.cursor = jay_before_jump(*pred); + unsigned idx = remap_idx(&ctx, phi->src[i++]); + jay_PHI_SRC_u32(&b, jay_scalar(phi->old.file, idx), phi->dst); + } + } + + func->ssa_alloc = ctx.alloc; + ralloc_free(memctx); +} diff --git a/src/intel/compiler/jay/jay_simd_width.c b/src/intel/compiler/jay/jay_simd_width.c new file mode 100644 index 00000000000..86a48ba320d --- /dev/null +++ b/src/intel/compiler/jay/jay_simd_width.c @@ -0,0 +1,63 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "jay_ir.h" +#include "jay_opcodes.h" + +static unsigned +max_simd_width(jay_shader *shader, const jay_inst *I) +{ + /* Only certain "complex" quad swizzles require splitting down to SIMD4 */ + if (I->op == JAY_OPCODE_QUAD_SWIZZLE && + (jay_quad_swizzle_swizzle(I) == JAY_QUAD_SWIZZLE_XYXY || + jay_quad_swizzle_swizzle(I) == JAY_QUAD_SWIZZLE_ZWZW)) { + return 4; + } + + /* These special instructions need to be split for various reasons. */ + if (I->op == JAY_OPCODE_EXPAND_QUAD || + I->op == JAY_OPCODE_EXTRACT_LAYER || + I->op == JAY_OPCODE_EXTRACT_BYTE_PER_8LANES || + I->op == JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS || + I->op == JAY_OPCODE_MUL_32 || + I->op == JAY_OPCODE_SHUFFLE) { + return 16; + } + + if (I->op != JAY_OPCODE_SEND) { + /* If any source/destination is 64-bit strided, we must split to avoid + * crossing more than 2 GRFs. Note that SENDs don't have this restriction, + * we don't have to split A64 load/store. + */ + if (I->dst.file == GPR && + jay_def_stride(shader, I->dst) == JAY_STRIDE_8) { + return 16; + } + + jay_foreach_src(I, s) { + if (I->src[s].file == GPR && + jay_def_stride(shader, I->src[s]) == JAY_STRIDE_8) { + return 16; + } + } + } else { + /* TODO: Do we ever split SENDs? ..Can we even split SENDs given we don't + * have stride control? How is this supposed to work? + * + * XXX + */ + } + + return 32; +} + +unsigned +jay_simd_split(jay_shader *s, const jay_inst *I) +{ + unsigned actual = jay_simd_width_logical(s, I); + unsigned max = max_simd_width(s, I); + + return (actual > max) ? (util_logbase2(actual) - util_logbase2(max)) : 0; +} diff --git a/src/intel/compiler/jay/jay_spill.c b/src/intel/compiler/jay/jay_spill.c new file mode 100644 index 00000000000..f4c3b85789c --- /dev/null +++ b/src/intel/compiler/jay/jay_spill.c @@ -0,0 +1,849 @@ +/* + * Copyright 2026 Intel Corporation + * Copyright 2023-2024 Alyssa Rosenzweig + * Copyright 2023-2024 Valve Corporation + * Copyright 2022 Collabora Ltd. 
+ * SPDX-License-Identifier: MIT + */ + +#include "util/bitset.h" +#include "util/ralloc.h" +#include "util/sparse_bitset.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "util/u_qsort.h" +#include "util/u_worklist.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* + * An implementation of "Register Spilling and Live-Range Splitting for SSA-Form + * Programs" by Braun and Hack. + * + * Next-use distances are logically in ℤ ∪ {∞}, modelled as saturating uint32 + * and referred to as dist_t. Within a block, next-use data is dense. At block + * boundaries, next-use maps are stored as key-value pairs, where only variables + * with later uses (finite distance) are stored. That sparse representation + * ensures linear-time even for shaders with many blocks. + */ +#define DIST_INFINITY (UINT32_MAX) +typedef uint32_t dist_t; + +struct next_use { + uint32_t index; + dist_t dist; +}; + +static void +add_next_use(struct util_dynarray *nu, unsigned node, dist_t dist) +{ + struct next_use use = { .index = node, .dist = dist }; + util_dynarray_append(nu, use); +} + +#define foreach_next_use(nu, it) util_dynarray_foreach(nu, struct next_use, it) + +static dist_t +add_dist(dist_t A, dist_t B) +{ + return (A + B < A) ? DIST_INFINITY : (A + B); +} + +/* + * Calculate the minimum of two next-use sets. Values absent from one of the + * underlying sets are infinity so do not contribute to the minimum, instead + * acting like a set union. + */ +static bool +minimum_next_uses(struct util_dynarray *nu, + const struct util_dynarray *from, + dist_t *tmp_dist, + struct u_sparse_bitset *tmp_set) +{ + /* Convert "from" to be dense */ + u_sparse_bitset_clear_all(tmp_set); + + foreach_next_use(from, it) { + u_sparse_bitset_set(tmp_set, it->index); + tmp_dist[it->index] = it->dist; + } + + bool progress = false; + + /* Take the minimum of common elements */ + foreach_next_use(nu, it) { + if (u_sparse_bitset_test(tmp_set, it->index)) { + if (tmp_dist[it->index] < it->dist) { + it->dist = tmp_dist[it->index]; + progress = true; + } + + u_sparse_bitset_clear(tmp_set, it->index); + } + } + + /* Add elements that are only in "from" */ + U_SPARSE_BITSET_FOREACH_SET(tmp_set, index) { + add_next_use(nu, index, tmp_dist[index]); + progress = true; + } + + return progress; +} + +static uint32_t +inst_cycles(const jay_inst *I) +{ + return 1; +} + +struct spill_block { + /* W/S sets at the start/end of the block, see spill_ctx::{W,S} */ + struct u_sparse_bitset W_in, W_out, S_in, S_out; + + /* Next-use maps at the start/end of the block */ + struct util_dynarray next_use_in, next_use_out; + + /* Estimated cycle count of the block */ + uint32_t cycles; +}; + +struct spill_ctx { + jay_function *func; + + /* Register file being spilled */ + enum jay_file file; + + /* Set of values whose file equals `file` */ + BITSET_WORD *in_file; + + /* Set of values currently available in the register file */ + struct u_sparse_bitset W; + + /* For W-entry calculation, phis with a spilled source. For + * coupling calculation, phis defined along the given edge. + */ + struct u_sparse_bitset phi_set; + + /* |W| = Current register pressure */ + unsigned nW; + + /* For each variable in N, local IPs of next-use. Else, infinite. */ + struct u_sparse_bitset N; + dist_t *next_uses; + + /* Current local IP relative to the start of the block */ + uint32_t ip; + + /* Set of live values that have been spilled. 
Contrary to the paper, this + * is not a subset of W: the definition in the paper is bogus. + */ + struct u_sparse_bitset S; + + /* If a value is rematerializable or a phi, its definition. Else, NULL */ + jay_inst **defs; + + /* Maximum register pressure allowed */ + unsigned k; + + /* Number of variables */ + unsigned n; + + /* Information on blocks indexed in source order */ + struct spill_block *blocks; + + /* Preallocated array of candidates for calculating W entry */ + struct next_use *candidates; + struct util_dynarray next_ip; +}; + +static inline jay_def +jay_def_as_mem(struct spill_ctx *ctx, jay_def idx) +{ + assert(idx.file == GPR || idx.file == UGPR); + idx.file = idx.file == UGPR ? UMEM : MEM; + idx._payload = jay_base_index(idx) + ctx->n; + return idx; +} + +static bool +can_remat(jay_inst *I) +{ + /* TODO */ + return false; +} + +static bool +can_remat_node(struct spill_ctx *ctx, unsigned node) +{ + return ctx->defs[node] && ctx->defs[node]->op != JAY_OPCODE_PHI_DST; +} + +static jay_inst * +remat_to(jay_builder *b, jay_def dst, struct spill_ctx *ctx, unsigned node) +{ + jay_inst *I = ctx->defs[node]; + assert(can_remat(I)); + + UNREACHABLE("invalid remat"); +} + +static void +insert_spill(jay_builder *b, struct spill_ctx *ctx, unsigned node) +{ + if (!can_remat_node(ctx, node)) { + jay_def idx = jay_scalar(ctx->file, node); + jay_MOV(b, jay_def_as_mem(ctx, idx), idx); + } +} + +static void +insert_reload(struct spill_ctx *ctx, + jay_block *block, + jay_cursor cursor, + unsigned node) +{ + jay_builder b = jay_init_builder(ctx->func, cursor); + jay_def idx = jay_scalar(ctx->file, node); + + /* Reloading breaks SSA, but jay_repair_ssa will repair */ + if (can_remat_node(ctx, node)) { + remat_to(&b, idx, ctx, node); + } else { + jay_MOV(&b, idx, jay_def_as_mem(ctx, idx)); + } +} + +/* Insert into the register file */ +static void +insert_W(struct spill_ctx *ctx, unsigned v) +{ + assert(!u_sparse_bitset_test(&ctx->W, v)); + assert(BITSET_TEST(ctx->in_file, v)); + + u_sparse_bitset_set(&ctx->W, v); + ctx->nW++; +} + +/* Remove from the register file */ +static void +remove_W(struct spill_ctx *ctx, unsigned v) +{ + assert(u_sparse_bitset_test(&ctx->W, v)); + assert(BITSET_TEST(ctx->in_file, v)); + + u_sparse_bitset_clear(&ctx->W, v); + ctx->nW--; +} + +static int +nu_score(struct spill_ctx *ctx, struct next_use nu) +{ + /* We assume that rematerializing - even before every instuction - is + * cheaper than spilling. As long as one of the nodes is rematerializable + * (with distance > 0), we choose it over spilling. Within a class of nodes + * (rematerializable or not), compare by next-use-distance. + */ + bool remat = can_remat_node(ctx, nu.index) && nu.dist > 0; + return (remat ? 0 : 100000) + nu.dist; +} + +static int +cmp_dist(const void *left_, const void *right_, void *ctx) +{ + const struct next_use *left = left_; + const struct next_use *right = right_; + int l = nu_score(ctx, *left), r = nu_score(ctx, *right); + + return (l > r) - (l < r); +} + +/* + * Limit the register file W to maximum size m by evicting registers. + */ +static ATTRIBUTE_NOINLINE void +limit(struct spill_ctx *ctx, jay_inst *I, unsigned m) +{ + /* Nothing to do if we're already below the limit */ + if (ctx->nW <= m) { + return; + } + + /* Gather candidates for eviction. Note that next_uses gives IPs whereas + * cmp_dist expects relative distances. This requires us to subtract ctx->ip + * to ensure that cmp_dist works properly. 
Even though logically it shouldn't + * affect the sorted order, practically this matters for correctness with + * rematerialization. See the dist=0 test in cmp_dist. + */ + struct next_use vars[JAY_NUM_UGPR]; + unsigned j = 0; + + U_SPARSE_BITSET_FOREACH_SET(&ctx->W, i) { + assert(ctx->next_uses[i] != DIST_INFINITY && "live in W"); + dist_t dist = ctx->next_uses[i] - ctx->ip; + + assert(j < ARRAY_SIZE(vars)); + vars[j++] = (struct next_use) { .index = i, .dist = dist }; + } + + /* Sort by next-use distance */ + util_qsort_r(vars, j, sizeof(struct next_use), cmp_dist, ctx); + + /* Evict what doesn't fit, inserting a spill for evicted values that we + * haven't spilled before with a future use. + */ + for (unsigned i = m; i < j; ++i) { + if (!u_sparse_bitset_test(&ctx->S, vars[i].index)) { + jay_builder b = jay_init_builder(ctx->func, jay_before_inst(I)); + insert_spill(&b, ctx, vars[i].index); + u_sparse_bitset_set(&ctx->S, vars[i].index); + } + + remove_W(ctx, vars[i].index); + } +} + +/* + * Insert coupling code on block boundaries. This must ensure: + * + * - anything live-in we expect to have spilled is spilled + * - anything live-in we expect to have filled is filled + * - phi sources are spilled if the destination is spilled + * - phi sources are filled if the destination is not spilled + * + * The latter two requirements ensure correct pressure calculations for phis. + */ +static ATTRIBUTE_NOINLINE void +insert_coupling_code(struct spill_ctx *ctx, jay_block *pred, jay_block *succ) +{ + jay_builder b = jay_init_builder(ctx->func, jay_before_function(ctx->func)); + struct spill_block *sp = &ctx->blocks[pred->index]; + struct spill_block *ss = &ctx->blocks[succ->index]; + + /* Insert spill/fill at phi sources to match their destination */ + jay_foreach_phi_src_in_block(pred, phi_src) { + jay_inst *phi_dst = ctx->defs[jay_phi_src_index(phi_src)]; + unsigned src = jay_index(phi_src->src[0]); + + if (phi_src->src[0].file == ctx->file) { + if (jay_is_mem(phi_dst->dst)) { + if (!u_sparse_bitset_test(&sp->S_out, src)) { + /* Spill the phi source. TODO: avoid redundant spills here */ + b.cursor = jay_after_block_logical(pred); + insert_spill(&b, ctx, src); + } + + if (can_remat_node(ctx, jay_index(phi_src->src[0]))) { + jay_def idx = jay_scalar(ctx->file, src); + jay_def tmp = jay_alloc_def(&b, ctx->file, 1); + + b.cursor = jay_before_function(ctx->func); + remat_to(&b, tmp, ctx, src); + jay_MOV(&b, jay_def_as_mem(ctx, idx), tmp); + } + + /* Use the spilled version */ + phi_src->src[0] = jay_def_as_mem(ctx, phi_src->src[0]); + jay_set_phi_src_index(phi_src, jay_index(phi_dst->dst)); + } else if (!u_sparse_bitset_test(&sp->W_out, src)) { + /* Fill the phi source in the predecessor */ + jay_block *reload_block = jay_edge_to_block(pred, succ); + insert_reload(ctx, reload_block, jay_along_edge(pred, succ), src); + } + } + } + + /* Anything assumed to be spilled in succ must be spilled along all edges. */ + U_SPARSE_BITSET_FOREACH_SET(&ss->S_in, v) { + if (!u_sparse_bitset_test(&sp->S_out, v)) { + b.cursor = jay_along_edge(pred, succ); + insert_spill(&b, ctx, v); + } + } + + jay_foreach_phi_dst_in_block(succ, phi) { + u_sparse_bitset_set(&ctx->phi_set, jay_index(phi->dst)); + } + + /* Variables in W at the start of succ must be defined along the edge. + * If not live at the end of the predecessor (and it's not a phi defined in + * the successor), insert a reload. 
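+ *
+ * (Concretely: v is expected in registers at the top of succ but was not
+ * in the predecessor's W_out, i.e. the predecessor left it spilled, so we
+ * reload it along the edge. Phis defined in succ are skipped because the
+ * phi copy provides their value instead.)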
+ */ + U_SPARSE_BITSET_FOREACH_SET(&ss->W_in, v) { + if (!u_sparse_bitset_test(&sp->W_out, v) && + !u_sparse_bitset_test(&ctx->phi_set, v)) { + + jay_block *reload_block = jay_edge_to_block(pred, succ); + insert_reload(ctx, reload_block, jay_along_edge(pred, succ), v); + } + } +} + +static dist_t +lookup_next_use(struct spill_ctx *ctx, unsigned v) +{ + return u_sparse_bitset_test(&ctx->N, v) ? ctx->next_uses[v] : DIST_INFINITY; +} + +/* + * Produce an array of next-use IPs relative to the start of the block. This is + * an array of dist_t scalars, representing the next-use IP of each SSA dest + * (right-to-left) and SSA source (left-to-right) of each instuction in the + * block (bottom-to-top). Its size equals the # of SSA sources in the block. + */ +static ATTRIBUTE_NOINLINE void +populate_local_next_use(struct spill_ctx *ctx, jay_block *block) +{ + struct spill_block *sb = &ctx->blocks[block->index]; + unsigned ip = sb->cycles; + + foreach_next_use(&sb->next_use_out, it) { + dist_t d = add_dist(it->dist, ip); + + if (d != DIST_INFINITY) { + u_sparse_bitset_set(&ctx->N, it->index); + ctx->next_uses[it->index] = d; + } + } + + jay_foreach_inst_in_block_rev(block, I) { + ip -= inst_cycles(I); + + jay_foreach_src_index(I, s, c, v) { + if (I->src[s].file == ctx->file) { + if (I->op != JAY_OPCODE_PHI_SRC) { + util_dynarray_append(&ctx->next_ip, lookup_next_use(ctx, v)); + } + + ctx->next_uses[v] = ip; + u_sparse_bitset_set(&ctx->N, v); + } + } + + if (I->dst.file == ctx->file) { + jay_foreach_index_rev(I->dst, _, v) { + util_dynarray_append(&ctx->next_ip, lookup_next_use(ctx, v)); + } + } + } + + assert(ip == 0 && "cycle counting is consistent"); +} + +/* + * Insert spills/fills for a single basic block, following Belady's algorithm. + * Corresponds to minAlgorithm from the paper. + */ +static ATTRIBUTE_NOINLINE void +min_algorithm(struct spill_ctx *ctx, + jay_block *block, + struct spill_block *sb, + dist_t *next_ips, + unsigned next_use_cursor) +{ + jay_foreach_inst_in_block(block, I) { + assert(ctx->nW <= ctx->k && "invariant"); + + /* Phis are special since they happen along the edge. When we initialized + * W and S, we implicitly chose which phis are spilled. So, here we just + * need to rewrite the phis to write into memory. + * + * Phi sources are handled later. + */ + if (I->op == JAY_OPCODE_PHI_DST) { + if (I->dst.file == ctx->file) { + if (!u_sparse_bitset_test(&ctx->W, jay_index(I->dst))) { + u_sparse_bitset_set(&ctx->S, jay_index(I->dst)); + I->dst = jay_def_as_mem(ctx, I->dst); + } + } + + ctx->ip += inst_cycles(I); + continue; + } else if (I->op == JAY_OPCODE_PHI_SRC) { + break; + } + + /* Any source that is not in W needs to be reloaded. Gather the set R of + * such values, and add them to the register file. + */ + unsigned R[JAY_MAX_SRCS], nR = 0; + + jay_foreach_src_index(I, s, c, v) { + if (I->src[s].file == ctx->file && !u_sparse_bitset_test(&ctx->W, v)) { + R[nR++] = v; + insert_W(ctx, v); + + assert(u_sparse_bitset_test(&ctx->S, v) && "must have spilled"); + assert(nR <= ARRAY_SIZE(R) && "maximum source count"); + } + } + + /* Limit W to make space for the operands. + * + * We need to round up to power-of-two destination sizes to match the + * rounding in demand calculation. + */ + bool has_dst = I->dst.file == ctx->file; + unsigned dst_size = util_next_power_of_two(jay_num_values(I->dst)); + limit(ctx, I, ctx->k - (has_dst ? 
dst_size : 0)); + + /* Add destinations to the register file */ + if (I->dst.file == ctx->file) { + jay_foreach_index(I->dst, _, index) { + assert(next_use_cursor >= 1); + ctx->next_uses[index] = next_ips[--next_use_cursor]; + + if (ctx->next_uses[index] != DIST_INFINITY) { + insert_W(ctx, index); + } + } + } + + /* Update next-use distances for this instuction. Unlike the paper, we + * require W contain only live values (with finite next-use distance). + * + * This happens after the above limit() calls to model sources as + * late-kill. This is conservative and could be improved, but it matches + * how we currently estimate register demand. + */ + jay_foreach_src_index_rev(I, s, c, node) { + if (I->src[s].file == ctx->file) { + assert(next_use_cursor >= 1); + ctx->next_uses[node] = next_ips[--next_use_cursor]; + + if (ctx->next_uses[node] == DIST_INFINITY) { + remove_W(ctx, node); + } + } + } + + /* Add reloads for the sources in front of the instuction. */ + for (unsigned i = 0; i < nR; ++i) { + insert_reload(ctx, block, jay_before_inst(I), R[i]); + } + + ctx->ip += inst_cycles(I); + + if (jay_debug & JAY_DBG_PRINTDEMAND) { + printf("(SP) %u: ", ctx->nW); + jay_print_inst(stdout, I); + } + } + + assert(next_use_cursor == 0 && "exactly sized"); + + u_sparse_bitset_dup(&sb->W_out, &ctx->W); + u_sparse_bitset_dup(&sb->S_out, &ctx->S); +} + +/* + * TODO: Implement section 4.2 of the paper. + * + * For now, we implement the simpler heuristic in Hack's thesis: sort + * the live-in set (+ destinations of phis) by next-use distance. + */ +static ATTRIBUTE_NOINLINE void +compute_w_entry_loop_header(struct spill_ctx *ctx, jay_block *block) +{ + unsigned j = 0; + /* TODO: Account for phis too! */ + foreach_next_use(&ctx->blocks[block->index].next_use_in, it) { + assert(j < ctx->n); + ctx->candidates[j++] = *it; + } + + /* Take the best candidates sorted by next-use distance */ + unsigned n = MIN2(j, ctx->k - ctx->nW); + if (n < j) { + util_qsort_r(ctx->candidates, j, sizeof(struct next_use), cmp_dist, ctx); + } + + for (unsigned i = 0; i < n; ++i) { + insert_W(ctx, ctx->candidates[i].index); + } +} + +/* + * Compute W_entry for a block. Section 4.2 in the paper. + */ +static ATTRIBUTE_NOINLINE void +compute_w_entry(struct spill_ctx *ctx, jay_block *block) +{ + unsigned j = 0; + + /* Variables that are in all predecessors are assumed in W_entry. Phis and + * variables in some predecessors are scored by next-use. + */ + U_SPARSE_BITSET_FOREACH_SET(&ctx->N, i) { + bool all = true, any = false; + + jay_foreach_predecessor(block, P) { + bool in = u_sparse_bitset_test(&ctx->blocks[(*P)->index].W_out, i); + all &= in; + any |= in; + } + + if (all) { + insert_W(ctx, i); + } else if (any) { + ctx->candidates[j++] = + (struct next_use) { .index = i, .dist = ctx->next_uses[i] }; + } + } + + jay_foreach_predecessor(block, pred) { + jay_foreach_phi_src_in_block(*pred, I) { + if (!u_sparse_bitset_test(&ctx->blocks[(*pred)->index].W_out, + jay_index(I->src[0]))) { + + u_sparse_bitset_set(&ctx->phi_set, jay_phi_src_index(I)); + } + } + } + + /* Heuristic: if any phi source is spilled, spill the phi. While suboptimal, + * this reduces pointless spills/fills with massive phi webs. 
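+ *
+ * (phi_set was populated above with phis that have at least one source
+ * missing from its predecessor's W_out; those phis are not offered as
+ * W_entry candidates here, so they stay spilled.)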
+ */ + jay_foreach_phi_dst_in_block(block, I) { + if (!u_sparse_bitset_test(&ctx->phi_set, jay_index(I->dst))) { + ctx->candidates[j++] = (struct next_use) { + .index = jay_index(I->dst), + .dist = ctx->next_uses[jay_index(I->dst)], + }; + } + } + + /* Take the best candidates sorted by next-use distance */ + unsigned n = MIN2(j, ctx->k - ctx->nW); + if (n < j) { + util_qsort_r(ctx->candidates, j, sizeof(struct next_use), cmp_dist, ctx); + } + + for (unsigned i = 0; i < n; ++i) { + insert_W(ctx, ctx->candidates[i].index); + } +} + +/* + * We initialize S with the union of S at the exit of (forward edge) + * predecessors and the complement of W, intersected with the live-in set. The + * former propagates S forward. The latter ensures we spill along the edge when + * a live value is not selected for the entry W. + */ +static ATTRIBUTE_NOINLINE void +compute_s_entry(struct spill_ctx *ctx, jay_block *block) +{ + jay_foreach_predecessor(block, pred) { + U_SPARSE_BITSET_FOREACH_SET(&ctx->blocks[(*pred)->index].S_out, v) { + if (u_sparse_bitset_test(&block->live_in, v)) { + u_sparse_bitset_set(&ctx->S, v); + } + } + } + + U_SPARSE_BITSET_FOREACH_SET(&block->live_in, v) { + if (BITSET_TEST(ctx->in_file, v) && !u_sparse_bitset_test(&ctx->W, v)) { + u_sparse_bitset_set(&ctx->S, v); + } + } + + u_sparse_bitset_dup(&ctx->blocks[block->index].S_in, &ctx->S); +} + +static ATTRIBUTE_NOINLINE void +global_next_use_distances(struct spill_ctx *ctx, void *memctx) +{ + u_worklist worklist; + u_worklist_init(&worklist, ctx->func->num_blocks, NULL); + + jay_foreach_block(ctx->func, block) { + struct spill_block *sb = &ctx->blocks[block->index]; + + util_dynarray_init(&sb->next_use_in, memctx); + util_dynarray_init(&sb->next_use_out, memctx); + + jay_foreach_inst_in_block(block, I) { + sb->cycles += inst_cycles(I); + } + + jay_worklist_push_head(&worklist, block); + } + + /* Iterate the work list in reverse order since liveness is backwards */ + while (!u_worklist_is_empty(&worklist)) { + jay_block *block = jay_worklist_pop_head(&worklist); + struct spill_block *sb = &ctx->blocks[block->index]; + + /* Clear locally accessed set (W) */ + u_sparse_bitset_clear_all(&ctx->W); + util_dynarray_clear(&sb->next_use_in); + + uint32_t cycle = 0; + + /* Calculate dists */ + jay_foreach_inst_in_block(block, I) { + /* Record first use before def */ + jay_foreach_src_index(I, s, c, index) { + if (I->src[s].file == ctx->file && + !u_sparse_bitset_test(&ctx->W, index)) { + + add_next_use(&sb->next_use_in, index, cycle); + u_sparse_bitset_set(&ctx->W, index); + } + } + + /* Record defs */ + jay_foreach_index(I->dst, _, index) { + u_sparse_bitset_set(&ctx->W, index); + } + + cycle += inst_cycles(I); + } + + /* Apply transfer function to get our entry state. */ + foreach_next_use(&sb->next_use_out, it) { + if (!u_sparse_bitset_test(&ctx->W, it->index)) { + add_next_use(&sb->next_use_in, it->index, + add_dist(it->dist, sb->cycles)); + } + } + + /* Propagate successor live-in to pred live-out, joining with min */ + jay_foreach_predecessor(block, pred) { + if (minimum_next_uses(&ctx->blocks[(*pred)->index].next_use_out, + &sb->next_use_in, ctx->next_uses, + &ctx->phi_set)) { + jay_worklist_push_tail(&worklist, *pred); + } + } + } + + u_worklist_fini(&worklist); + +#ifndef NDEBUG + /* In debug builds, validate the following invariant: + * + * Next-use distance is finite iff live and in file. 
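+ *
+ * Both directions are checked: every recorded next-use entry must be live
+ * and in the spilled file, and every live in-file variable must have a
+ * next-use entry.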
+ */ + jay_foreach_block(ctx->func, blk) { + struct spill_block *sb = &ctx->blocks[blk->index]; + + for (unsigned i = 0; i < 2; i++) { + struct util_dynarray *nu = i ? &sb->next_use_out : &sb->next_use_in; + struct u_sparse_bitset *live = i ? &blk->live_out : &blk->live_in; + + u_sparse_bitset_clear_all(&ctx->W); + + foreach_next_use(nu, it) { + assert(u_sparse_bitset_test(live, it->index) && + BITSET_TEST(ctx->in_file, it->index)); + + u_sparse_bitset_set(&ctx->W, it->index); + } + + U_SPARSE_BITSET_FOREACH_SET(live, i) { + if (BITSET_TEST(ctx->in_file, i)) { + assert(u_sparse_bitset_test(&ctx->W, i)); + } + } + } + } +#endif +} + +void +jay_spill(jay_function *func, enum jay_file file, unsigned k) +{ + void *memctx = ralloc_context(NULL); + void *linctx = linear_context(memctx); + struct spill_ctx ctx = { .func = func, .file = file, .k = k }; + + ctx.n = func->ssa_alloc; + ctx.in_file = BITSET_LINEAR_ZALLOC(linctx, ctx.n); + ctx.defs = linear_zalloc_array(linctx, jay_inst *, ctx.n); + ctx.next_uses = linear_alloc_array(linctx, dist_t, ctx.n); + ctx.candidates = linear_alloc_array(linctx, struct next_use, ctx.n); + ctx.blocks = + linear_zalloc_array(linctx, struct spill_block, func->num_blocks); + + jay_foreach_inst_in_func(func, block, I) { + if (can_remat(I) || I->op == JAY_OPCODE_PHI_DST) { + ctx.defs[jay_index(I->dst)] = I; + } + + if (I->dst.file == file) { + BITSET_SET_COUNT(ctx.in_file, jay_base_index(I->dst), + jay_num_values(I->dst)); + } + } + + u_sparse_bitset_init(&ctx.W, ctx.n, memctx); + u_sparse_bitset_init(&ctx.S, ctx.n, memctx); + u_sparse_bitset_init(&ctx.N, ctx.n, memctx); + u_sparse_bitset_init(&ctx.phi_set, ctx.n, memctx); + util_dynarray_init(&ctx.next_ip, memctx); + + global_next_use_distances(&ctx, memctx); + + /* Reserve a memory variable for every regular variable */ + func->ssa_alloc *= 2; + + jay_foreach_block(func, block) { + ctx.nW = 0; + ctx.ip = 0; + + u_sparse_bitset_clear_all(&ctx.W); + u_sparse_bitset_clear_all(&ctx.S); + u_sparse_bitset_clear_all(&ctx.N); + util_dynarray_clear(&ctx.next_ip); + + populate_local_next_use(&ctx, block); + + struct spill_block *sb = &ctx.blocks[block->index]; + dist_t *next_ips = util_dynarray_element(&ctx.next_ip, dist_t, 0); + unsigned nu_cursor = util_dynarray_num_elements(&ctx.next_ip, dist_t); + + /* Populate next-use with phi destinations, which are not in the + * next_use_in set but are accounted for when computing W_entry. 
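+ *
+ * (populate_local_next_use walks the block bottom-up, so the phi
+ * destinations at the top of the block were appended to next_ip last;
+ * popping with --nu_cursor consumes them first here, and min_algorithm
+ * then consumes the remaining entries in matching order.)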
+ */ + jay_foreach_phi_dst_in_block(block, I) { + if (I->dst.file == file) { + assert(nu_cursor >= 1); + ctx.next_uses[jay_index(I->dst)] = next_ips[--nu_cursor]; + u_sparse_bitset_set(&ctx.N, jay_index(I->dst)); + } + } + + if (block->loop_header) { + compute_w_entry_loop_header(&ctx, block); + } else if (jay_num_predecessors(block) /* skip start blocks */) { + compute_w_entry(&ctx, block); + } + + assert(ctx.nW <= ctx.k && "invariant"); + u_sparse_bitset_dup(&sb->W_in, &ctx.W); + + compute_s_entry(&ctx, block); + min_algorithm(&ctx, block, sb, next_ips, nu_cursor); + } + + /* Now that all blocks are processed separately, stitch it together */ + jay_foreach_block(func, block) { + jay_foreach_predecessor(block, pred) { + u_sparse_bitset_clear_all(&ctx.phi_set); + insert_coupling_code(&ctx, *pred, block); + } + } + + ralloc_free(memctx); + + /* Spilling breaks SSA, so we need to repair before validating */ + jay_repair_ssa(func); + jay_validate(func->shader, "Spilling"); + + /* Remat can introduce dead code */ + jay_opt_dead_code(func->shader); +} diff --git a/src/intel/compiler/jay/jay_to_binary.c b/src/intel/compiler/jay/jay_to_binary.c new file mode 100644 index 00000000000..935ae4d2727 --- /dev/null +++ b/src/intel/compiler/jay/jay_to_binary.c @@ -0,0 +1,576 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include +#include "compiler/brw/brw_disasm_info.h" +#include "compiler/brw/brw_eu.h" +#include "compiler/brw/brw_eu_defines.h" +#include "compiler/brw/brw_eu_inst.h" +#include "compiler/brw/brw_reg.h" +#include "compiler/brw/brw_reg_type.h" +#include "dev/intel_debug.h" +#include "util/macros.h" +#include "util/u_dynarray.h" +#include "util/u_math.h" +#include "jay.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +static inline enum brw_reg_type +to_brw_reg_type(enum jay_type type) +{ + /* clang-format off */ + switch (type) { + case JAY_TYPE_UNTYPED: + case JAY_TYPE_U8: return BRW_TYPE_UB; + case JAY_TYPE_U16: return BRW_TYPE_UW; + case JAY_TYPE_U32: return BRW_TYPE_UD; + case JAY_TYPE_U64: return BRW_TYPE_UQ; + case JAY_TYPE_S8: return BRW_TYPE_B; + case JAY_TYPE_S16: return BRW_TYPE_W; + case JAY_TYPE_S32: return BRW_TYPE_D; + case JAY_TYPE_S64: return BRW_TYPE_Q; + case JAY_TYPE_F16: return BRW_TYPE_HF; + case JAY_TYPE_F32: return BRW_TYPE_F; + case JAY_TYPE_F64: return BRW_TYPE_DF; + case JAY_TYPE_BF16: return BRW_TYPE_BF; + default: UNREACHABLE("invalid type"); + } + /* clang-format on */ +} + +static inline unsigned +to_def_grf_16(struct jay_partition *p, jay_def d) +{ + unsigned count = jay_num_values(d); + if (count == 0 || !(d.file == GPR || d.file == UGPR)) { + return d.reg; + } + + unsigned base = 0; + for (unsigned i = 0; i < JAY_PARTITION_BLOCKS; ++i) { + unsigned offset = d.reg - base; + + if (offset < p->blocks[d.file][i].len) { + assert(offset + count <= p->blocks[d.file][i].len && + "vectors must not cross partition boundaries"); + + return (p->blocks[d.file][i].start + offset) * 2 + d.hi; + } + + base += p->blocks[d.file][i].len; + } + + UNREACHABLE("virtual register must be in a block"); +} + +static inline brw_reg +to_brw_reg(jay_function *f, + const jay_inst *I, + signed idx, + unsigned simd_offs, + bool force_hi) +{ + bool is_dest = idx < 0; + enum jay_type type = is_dest ? I->type : jay_src_type(I, idx); + jay_def d = is_dest ? 
I->dst : I->src[idx]; + d.hi |= force_hi; + + struct brw_reg R; + unsigned reg = to_def_grf_16(&f->shader->partition, d), offset_B = 0; + + if (jay_is_imm(d)) { + /* Immediates have size restrictions but can zero extend */ + if (jay_type_size_bits(type) == 64) { + type = jay_type_resize(type, 32); + } else if (I->op == JAY_OPCODE_BFN) { + assert(jay_as_uint(d) < UINT16_MAX); + type = JAY_TYPE_U16; + } + + R = brw_imm_ud(jay_as_uint(d)); + } else if (jay_is_null(d)) { + R = brw_null_reg(); + } else if (d.file == UGPR) { + unsigned grf = (reg >> 1) / 8; + offset_B = ((reg >> 1) % 8) * 4; + + if (d.file == UGPR) { + R = brw_ud1_grf(grf, 0); + } else { + R = brw_ud1_reg(ARF, BRW_ARF_ACCUMULATOR + (grf * 2), 0); + } + + /* Handle 3-src restrictions and vectorized uniform code. */ + if (is_dest || jay_num_values(d) >= 8) { + R = vec8(R); + } + + /* Some operations have special restrictions on the destination stride, + * but if we write a single UGPR the stride is ignored.. Specify + * whatever stride is needed to satisfy the rules. + */ + if (is_dest) { + /* BSpec 56640 "Special Restrictions" says: + * + * "Conversion between HF and Integer must be DWord-aligned + * and strided by a DWord on the destination." + */ + enum jay_type src0_type = jay_src_type(I, 0); + if ((I->type == JAY_TYPE_F16 && !jay_type_is_any_float(src0_type)) || + (src0_type == JAY_TYPE_F16 && !jay_type_is_any_float(I->type))) { + assert(jay_num_values(d) == 1 && "must not vectorize HF<->Int"); + R = stride(R, 8, 2, 4); + } + + /* Packed floats have restrictions on mixed sizes. Use <2>. */ + if (jay_type_size_bits(I->type) == 16 && + jay_type_size_bits(jay_src_type(I, 0)) != 16) { + assert(jay_num_values(d) == 1 && "must not vectorize mixed float"); + R = stride(R, 4, 2, 2); + } + } + } else if (d.file == GPR) { + enum jay_stride def_stride = jay_def_stride(f->shader, d); + uint32_t type_bits = jay_type_size_bits(type); + unsigned stride_bits = jay_stride_to_bits(def_stride); + unsigned simd_width = jay_simd_width_physical(f->shader, I); + + unsigned grf; + if (def_stride == JAY_STRIDE_2) { + /* Bit 0 selects between lo/hi halves of the GPR */ + grf = (reg / 2) * jay_grf_per_gpr(f->shader); + offset_B = (reg & 1) * 2 * f->shader->dispatch_width; + } else { + /* Low bits are an offset in 2-byte words into the GRF */ + unsigned mask = BITFIELD_MASK(stride_bits / 32); + grf = ((reg & ~mask) / 2) * jay_grf_per_gpr(f->shader); + offset_B = (reg & mask) * 2; + } + + R = byte_offset(xe2_vec8_grf(grf, 0), + simd_offs * simd_width * stride_bits / 8); + + if (stride_bits == (type_bits * 4)) { + R = stride(R, 8, 2, 4); + } else if (stride_bits == (type_bits * 2)) { + R = stride(R, 4, 2, 2); + } else { + assert(stride_bits == type_bits); + } + + /* Broadcast is equivalent to <8, 8, 1> for SIMD1 instructions. Use that + * instead due to regioning restrictions. + */ + if (simd_width == 1) { + R = vec1(R); + } + } else if (jay_is_flag(d)) { + /* Explicit flags act like UGPRs. As sources they broadcast to all lanes, + * so we may ignore the SIMD offset. As destinations, they are written by + * SIMD1 instructions and are never SIMD split. 
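+ *
+ * (Hence the assertion below: a nonzero simd_offs is only legal when the
+ * flag is a source, i.e. idx >= 0.)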
+ */ + assert(simd_offs == 0 || idx >= 0); + unsigned offs_B = d.reg * (f->shader->dispatch_width / 8); + R = brw_flag_subreg(offs_B / 2); + } else if (d.file == J_ADDRESS) { + R = brw_address_reg(d.reg); + } else if (d.file == J_ARF) { + R = brw_ud1_reg(ARF, jay_base_index(d), 0); + } else { + UNREACHABLE("unexpected file"); + } + + R.negate = d.negate; + R.abs = d.abs; + return byte_offset(retype(R, to_brw_reg_type(type)), offset_B); +} + +#define SRC(i) to_brw_reg(f, I, i, simd_offs, false) + +#define OP0(hw) \ + case JAY_OPCODE_##hw: \ + brw_##hw(p); \ + break; + +#define OP1(jay, hw) \ + case JAY_OPCODE_##jay: \ + brw_alu1(p, BRW_OPCODE_##hw, dst, SRC(0)); \ + break; + +#define OP2(jay, hw) \ + case JAY_OPCODE_##jay: \ + brw_alu2(p, BRW_OPCODE_##hw, dst, SRC(0), SRC(1)); \ + break; + +#define OP3(jay, hw) \ + case JAY_OPCODE_##jay: \ + brw_alu3(p, BRW_OPCODE_##hw, dst, SRC(0), SRC(1), SRC(2)); \ + break; + +#define OP3_SWAP(jay, hw) \ + case JAY_OPCODE_##jay: \ + brw_alu3(p, BRW_OPCODE_##hw, dst, SRC(2), SRC(1), SRC(0)); \ + break; + +static struct brw_reg +quad_swizzle(struct brw_reg r, const jay_inst *I) +{ + /* clang-format off */ + switch (jay_quad_swizzle_swizzle(I)) { + case JAY_QUAD_SWIZZLE_XXXX: return suboffset(stride(r, 4, 4, 0), 0); + case JAY_QUAD_SWIZZLE_YYYY: return suboffset(stride(r, 4, 4, 0), 1); + case JAY_QUAD_SWIZZLE_ZZZZ: return suboffset(stride(r, 4, 4, 0), 2); + case JAY_QUAD_SWIZZLE_WWWW: return suboffset(stride(r, 4, 4, 0), 3); + case JAY_QUAD_SWIZZLE_XXZZ: return suboffset(stride(r, 2, 2, 0), 0); + case JAY_QUAD_SWIZZLE_YYWW: return suboffset(stride(r, 2, 2, 0), 1); + case JAY_QUAD_SWIZZLE_XYXY: return suboffset(stride(r, 0, 2, 1), 0); + case JAY_QUAD_SWIZZLE_ZWZW: return suboffset(stride(r, 0, 2, 1), 2); + } + /* clang-format on */ + + UNREACHABLE("invalid quad swizzle"); +} + +/* Runs once per SIMD-split, so must not modify the instruction! */ +static void +emit(struct brw_codegen *p, + jay_function *f, + const jay_inst *I, + unsigned simd_offs) +{ + ASSERTED unsigned nr_ins_before = p->nr_insn; + unsigned exec_size = jay_simd_width_physical(f->shader, I); + // jay_print_inst(stdout, (jay_inst *) I); + + /* Fix up SWSB dependencies for SIMD split instructions. The latter + * instructions do not need to redundantly wait on an SBID but might + * replicate their regdists. + */ + struct tgl_swsb dep = + simd_offs && !I->replicate_dep ? tgl_swsb_null() : I->dep; + dep.mode = simd_offs ? TGL_SBID_NULL : dep.mode; + + if (I->decrement_dep) { + unsigned delta = simd_offs * jay_macro_length(I); + assert(dep.regdist > delta); + dep.regdist -= delta; + } + + brw_set_default_exec_size(p, util_logbase2(exec_size)); + brw_set_default_mask_control(p, jay_is_no_mask(I)); + brw_set_default_swsb(p, dep); + brw_set_default_saturate(p, I->saturate); + + /* Quad swizzle can get split down to SIMD4 even on Xe2 where we don't have + * NibCtrl. Fortunately, it's NoMask so it doesn't matter. + */ + if (I->op != JAY_OPCODE_QUAD_SWIZZLE) { + brw_set_default_group(p, simd_offs * exec_size); + } + + /* Grab the hardware predicate, corresponding either to a logical predicate + * or SEL's selector. + */ + const jay_def *pred = I->predication ? jay_inst_get_predicate((void *) I) : + I->op == JAY_OPCODE_SEL ? &I->src[2] : + NULL; + + brw_set_default_predicate_control(p, pred ? 
BRW_PREDICATE_NORMAL : + BRW_PREDICATE_NONE); + brw_set_default_predicate_inverse(p, pred && pred->negate); + + /* Jay/brw enums line up by construction */ + enum brw_conditional_mod cmod = + (enum brw_conditional_mod) I->conditional_mod; + + if (!jay_is_null(I->cond_flag)) { + assert(!(pred && pred->reg != I->cond_flag.reg) && "must be tied"); + pred = &I->cond_flag; + } + + if (pred) { + unsigned reg = pred->reg * jay_phys_flag_per_virt(f->shader); + brw_set_default_flag_reg(p, reg / 2, reg % 2); + } + + if (I->op == JAY_OPCODE_MIN) { + cmod = BRW_CONDITIONAL_L; + } else if (I->op == JAY_OPCODE_MAX) { + cmod = BRW_CONDITIONAL_GE; + } + + struct brw_reg dst = to_brw_reg(f, I, -1, simd_offs, false); + + switch (I->op) { + OP0(ELSE) + OP0(ENDIF) + OP0(WHILE) + OP0(BREAK) + OP1(MOV, MOV) + OP1(MODIFIER, MOV) + OP1(RNDD, RNDD) + OP1(RNDZ, RNDZ) + OP1(RNDE, RNDE) + OP1(FRC, FRC) + OP1(BFREV, BFREV) + OP1(CBIT, CBIT) + OP1(NOT, NOT) + OP1(FBL, FBL) + OP1(FBH, FBH) + OP1(LZD, LZD) + OP2(ROL, ROL) + OP2(AVG, AVG) + OP2(ADD, ADD) + OP2(MUL, MUL) + OP2(SEL, SEL) + OP2(MIN, SEL) + OP2(MAX, SEL) + OP2(MUL_32X16, MUL) + OP2(AND, AND) + OP2(AND_U32_U16, AND) + OP2(OR, OR) + OP2(XOR, XOR) + OP2(ASR, ASR) + OP2(SHR, SHR) + OP2(SHL, SHL) + OP2(BFI1, BFI1) + OP3(BFI2, BFI2) + OP3(ADD3, ADD3) + OP3(CSEL, CSEL) + OP3(DP4A_UU, DP4A) + OP3(DP4A_SS, DP4A) + OP3(DP4A_SU, DP4A) + OP3_SWAP(MAD, MAD) + OP3_SWAP(BFE, BFE) + + case JAY_OPCODE_LOOP_ONCE: + /* TODO: Is there a better way to do this? */ + brw_BREAK(p); + brw_WHILE(p); + break; + + case JAY_OPCODE_IF: + brw_IF(p, util_logbase2(exec_size)); + break; + + case JAY_OPCODE_MATH: + gfx6_math(p, dst, jay_math_op(I), SRC(0), + retype(brw_null_reg(), to_brw_reg_type(I->type))); + break; + + case JAY_OPCODE_BFN: + brw_BFN(p, dst, SRC(0), SRC(1), SRC(2), brw_imm_ud(jay_bfn_ctrl(I))); + break; + + case JAY_OPCODE_DESWIZZLE_16: + brw_set_default_exec_size(p, BRW_EXECUTE_16); + brw_MOV(p, retype(xe2_vec8_grf(jay_deswizzle_16_dst(I), 0), BRW_TYPE_UD), + retype(xe2_vec8_grf(jay_deswizzle_16_src(I), 0), BRW_TYPE_UD)); + break; + + case JAY_OPCODE_CVT: { + unsigned index = jay_cvt_index(I); + bool force_hi = false; + + /* We will apply a suboffset for the specific subword being converted. In + * the case where we have a subword (16-bit) stride, accesses to the upper + * half will be instead to a discontiguous GRF so we have to fix up. This + * affects u8->u32 conversions. 
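+ * For instance, byte 3 of a u8 source laid out with a 2-byte stride lives in + * the hi half of the register pair at byte 1 (3 % 2), not at byte 3 of the + * lo half, so we flip force_hi and rebase the index.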
+ */ + if (I->src[0].file == GPR) { + unsigned type_size_B = jay_type_size_bits(jay_cvt_src_type(I)) / 8; + unsigned index_B = index * type_size_B; + unsigned stride_B = + jay_stride_to_bits(jay_def_stride(f->shader, I->src[0])) / 8; + + if (index_B >= stride_B) { + assert(stride_B == 2 && index_B <= 4 && !I->src[0].hi); + force_hi = true; + index = (index_B % stride_B) / type_size_B; + } + } + + brw_MOV(p, dst, + suboffset(to_brw_reg(f, I, 0, simd_offs, force_hi), index)); + break; + } + + case JAY_OPCODE_SYNC: + brw_SYNC(p, jay_sync_op(I)); + break; + + case JAY_OPCODE_CMP: + brw_CMP(p, dst, I->conditional_mod, SRC(0), SRC(1)); + break; + + case JAY_OPCODE_MOV_IMM64: + brw_MOV(p, dst, brw_imm_u64(jay_mov_imm64_imm(I))); + break; + + case JAY_OPCODE_RELOC: + brw_MOV_reloc_imm(p, dst, BRW_TYPE_UD, jay_reloc_param(I), + jay_reloc_base(I)); + break; + + case JAY_OPCODE_QUAD_SWIZZLE: + brw_MOV(p, dst, quad_swizzle(SRC(0), I)); + break; + + case JAY_OPCODE_BROADCAST_IMM: + brw_MOV(p, dst, get_element(SRC(0), jay_broadcast_imm_lane(I))); + break; + + case JAY_OPCODE_SEND: + brw_SEND(p, jay_send_sfid(I), dst, SRC(2), SRC(3), SRC(0), SRC(1), + jay_send_ex_desc_imm(I), jay_send_ex_mlen(I), + jay_send_bindless(I), jay_send_eot(I), false /* gather */); + if (jay_send_check_tdr(I)) { + brw_eu_inst_set_opcode(p->isa, brw_eu_last_inst(p), BRW_OPCODE_SENDC); + } + break; + + /* Gfx20+ has separate Render Target Array indices for each pair of subspans + * in order to support multiple polygons, so we need to use a <1;8,0> region + * in order to select the word for each channel. + */ + case JAY_OPCODE_EXTRACT_LAYER: + brw_AND(p, dst, stride(retype(SRC(simd_offs), BRW_TYPE_UW), 1, 8, 0), + brw_imm_uw(0x7ff)); + break; + + case JAY_OPCODE_EXPAND_QUAD: + brw_MOV(p, dst, stride(SRC(simd_offs), 1, 4, 0)); + break; + + case JAY_OPCODE_OFFSET_PACKED_PIXEL_COORDS: + brw_set_default_exec_size(p, BRW_EXECUTE_32); + brw_set_default_group(p, 0); + brw_ADD(p, retype(dst, BRW_TYPE_UW), retype(SRC(0), BRW_TYPE_UW), + brw_imm_uv(0x11100100)); + break; + + case JAY_OPCODE_LANE_ID_8: + brw_set_default_exec_size(p, BRW_EXECUTE_8); + brw_MOV(p, dst, brw_imm_uv(0x76543210)); + break; + + case JAY_OPCODE_LANE_ID_EXPAND: + brw_set_default_exec_size(p, util_logbase2(jay_lane_id_expand_width(I))); + brw_ADD(p, suboffset(dst, jay_lane_id_expand_width(I)), SRC(0), + brw_imm_uw(jay_lane_id_expand_width(I))); + break; + + case JAY_OPCODE_EXTRACT_BYTE_PER_8LANES: + brw_MOV(p, dst, stride(retype(SRC(simd_offs), BRW_TYPE_UB), 1, 8, 0)); + break; + + case JAY_OPCODE_SHR_ODD_SUBSPANS_BY_4: + brw_SHR(p, dst, SRC(0), brw_imm_uv(0x44440000)); + break; + + case JAY_OPCODE_MUL_32: { + brw_MUL(p, retype(brw_acc_reg(1), to_brw_reg_type(I->type)), SRC(0), + subscript(SRC(1), BRW_TYPE_UW, 0)); + + brw_set_default_swsb(p, tgl_swsb_null()); + brw_alu2(p, jay_mul_32_high(I) ? 
BRW_OPCODE_MACH : BRW_OPCODE_MACL, dst, + SRC(0), SRC(1)); + break; + } + + case JAY_OPCODE_SHUFFLE: { + struct brw_reg a0 = brw_address_reg(0); + unsigned grf_16 = to_def_grf_16(&f->shader->partition, I->src[0]); + unsigned offset_B = grf_16 * 2 * f->shader->dispatch_width; + + brw_ADD(p, a0, subscript(SRC(1), BRW_TYPE_UW, 0), brw_imm_uw(offset_B)); + brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), BRW_TYPE_UD)); + break; + } + + default: + jay_print_inst(stderr, (jay_inst *) I); + UNREACHABLE("Unhandled opcode"); + } + + if (cmod != BRW_CONDITIONAL_NONE) { + brw_eu_inst_set_cond_modifier(p->devinfo, brw_eu_last_inst(p), cmod); + } + + assert(p->nr_insn == (nr_ins_before + jay_macro_length(I)) && + "Jay instructions must map 1:n to GEN instructions"); +} + +struct jay_shader_bin * +jay_to_binary(jay_shader *s, void *const_data, size_t const_data_size) +{ + struct jay_shader_bin *bin = rzalloc(s, struct jay_shader_bin); + + struct util_dynarray prog; + util_dynarray_init(&prog, bin); + + struct brw_isa_info isa; + struct brw_codegen p; + + brw_init_isa_info(&isa, s->devinfo); + brw_init_codegen(&isa, &p, bin); + int start_offset = p.next_insn_offset; + + /* TODO: Multifunction properly */ + jay_foreach_function(s, f) { + jay_foreach_block(f, block) { + if (block->loop_header) { + brw_DO(&p, 0); + } + + jay_foreach_inst_in_block(block, I) { + for (unsigned i = 0; i < (1 << jay_simd_split(s, I)); ++i) { + emit(&p, f, I, i); + } + } + } + } + + int final_halt_offset = -1 /* TODO */; + brw_set_uip_jip(&p, start_offset, final_halt_offset); + + struct disasm_info *disasm = disasm_initialize(p.isa, NULL); + + disasm_new_inst_group(disasm, 0); + disasm_new_inst_group(disasm, p.next_insn_offset); + + UNUSED bool valid = true; +#ifndef NDEBUG + valid = + brw_validate_instructions(p.isa, p.store, 0, p.next_insn_offset, disasm); +#endif + + brw_compact_instructions(&p, start_offset, disasm); + + if (INTEL_DEBUG(intel_debug_flag_for_shader_stage(s->stage)) || !valid) { + dump_assembly(p.store, 0, p.next_insn_offset, disasm, NULL, stdout); + } + + if (!valid) { + UNREACHABLE("invalid assembly"); + } + + struct brw_stage_prog_data *prog_data = &s->prog_data->base; + + assert(prog_data->const_data_size == 0); + if (const_data_size > 0) { + prog_data->const_data_size = const_data_size; + prog_data->const_data_offset = + brw_append_data(&p, const_data, const_data_size, 32); + } + + bin->kernel = brw_get_program(&p, &bin->size); + s->prog_data->base.relocs = + brw_get_shader_relocs(&p, &s->prog_data->base.num_relocs); + + return bin; +} diff --git a/src/intel/compiler/jay/jay_validate.c b/src/intel/compiler/jay/jay_validate.c new file mode 100644 index 00000000000..7a3a6953fb7 --- /dev/null +++ b/src/intel/compiler/jay/jay_validate.c @@ -0,0 +1,328 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +#ifndef NDEBUG + +enum validate_block_state { + STATE_PHI_DST, + STATE_NORMAL, + STATE_LATE, +}; + +struct validate_state { + bool failed; + bool post_ra; + const char *when; + jay_inst *I; + jay_block *block; + jay_function *func; + BITSET_WORD *defs; + enum jay_file *files; + enum validate_block_state block_state; +}; + +static enum validate_block_state +block_state_for_inst(jay_inst *I) +{ + if (I->op == JAY_OPCODE_PHI_DST || I->op == JAY_OPCODE_PRELOAD) { + return STATE_PHI_DST; + } else if (I->op == JAY_OPCODE_PHI_SRC || + (jay_op_is_control_flow(I->op) && I->op != JAY_OPCODE_ELSE)) { + return 
STATE_LATE; + } else { + return STATE_NORMAL; + } +} + +static void +chirp(struct validate_state *validate, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + + if (!validate->failed) { + fprintf(stderr, "jay shader validation failed (after %s):\n", + validate->when); + validate->failed = true; + } + if (validate->I) { + fprintf(stderr, + " invalid instruction in block %d: ", validate->block->index); + jay_print_inst(stderr, validate->I); + } + fprintf(stderr, " "); + vfprintf(stderr, fmt, args); + fprintf(stderr, "\n\n"); + + va_end(args); +} + +#define CHECK(cond) \ + if (!(cond)) { \ + chirp(validate, "assertion failed at %s:%u\n %s", __FILE__, __LINE__, \ + #cond); \ + } + +static void +validate_flagness(struct validate_state *validate, + jay_def def, + enum jay_type type, + const char *name) +{ + CHECK(type != JAY_TYPE_U1 || jay_is_flag(def) || jay_is_null(def)); +} + +static unsigned +get_src_words(struct validate_state *validate, jay_inst *I, unsigned s) +{ + if (I->op == JAY_OPCODE_EXPAND_QUAD) { + return 4; + } + + bool vectorized = I->dst.file == UGPR && + jay_num_values(I->dst) > jay_type_vector_length(I->type) && + I->op != JAY_OPCODE_SEND && + jay_num_values(I->src[s]) > 1; + + unsigned elsize = jay_type_vector_length(jay_src_type(I, s)); + unsigned words = elsize * (vectorized ? jay_num_values(I->dst) : 1); + + if (vectorized && I->src[s].file == GPR) { + CHECK(words == validate->func->shader->dispatch_width); + return 1; + } else { + return words; + } +} + +/* + * Validate the fundamental invariants of static single assignment form. + */ +static void +validate_ssa(struct validate_state *validate, jay_inst *I) +{ + jay_foreach_src_index(I, src_index, _, ssa_index) { + CHECK(BITSET_TEST(validate->defs, ssa_index) && "defs dominate uses"); + CHECK(validate->files[ssa_index] == I->src[src_index].file && + "consistent files"); + } + + jay_foreach_dst_index(I, d, ssa_index) { + CHECK(!BITSET_TEST(validate->defs, ssa_index) && "single definition"); + BITSET_SET(validate->defs, ssa_index); + validate->files[ssa_index] = d.file; + } +} + +/* + * Validate the invariants of jay_def. + */ +static void +validate_def(struct validate_state *validate, jay_def def, const char *kind) +{ + CHECK(!jay_is_null(def) || !def.reg); + + if (def.collect) { + CHECK(jay_num_values(def) >= 2); + CHECK(def.file == GPR || def.file == UGPR); + + bool contiguous = true; + jay_foreach_comp(def, c) { + uint32_t index = jay_channel(def, c); + contiguous &= index == (jay_channel(def, 0) + c); + CHECK(index != JAY_SENTINEL); + } + + CHECK(!contiguous); + } else if (def.file == J_IMM) { + CHECK(!def.reg); + CHECK(!def.num_values_m1); + CHECK(!def.negate); + CHECK(!def.abs); + } else if (def.file == ACCUM || def.file == UACCUM || def.hi) { + CHECK(validate->post_ra); + } else { + CHECK(jay_base_index(def) != JAY_SENTINEL || validate->post_ra); + } + + if (jay_is_ssa(def) && jay_channel(def, 0) != JAY_SENTINEL) { + jay_foreach_comp(def, c) { + CHECK(jay_channel(def, c) < validate->func->ssa_alloc); + } + } + + CHECK(jay_num_values(def) == 1 || !jay_is_flag(def)); +} + +/** + * Validate an instruction. + */ +static void +validate_inst(struct validate_state *validate, jay_inst *I) +{ + validate->I = I; + + /* Block states are monotonic. 
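+ * Phi destinations and preloads must come first, ordinary instructions + * (including ELSE) in the middle, and phi sources and other control flow + * must close out the block.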
*/ + enum validate_block_state state = block_state_for_inst(I); + CHECK(state >= validate->block_state); + validate->block_state = state; + + const struct jay_opcode_info *opinfo = &jay_opcode_infos[I->op]; + + validate_def(validate, I->dst, "dst"); + validate_def(validate, I->cond_flag, "cond_flag"); + + jay_foreach_src(I, s) { + validate_def(validate, I->src[s], "source"); + } + + if (!validate->post_ra) { + validate_ssa(validate, I); + } + + CHECK(I->num_srcs <= JAY_MAX_SRCS); + + validate_flagness(validate, I->dst, I->type, "destination"); + validate_flagness(validate, I->cond_flag, JAY_TYPE_U1, "cond_flag"); + + CHECK(!I->conditional_mod || + !jay_is_null(I->cond_flag) || + I->op == JAY_OPCODE_CSEL); + + /* These assumptions are baked into the definition of broadcast_flag and + * required to ensure correctness with the lane masking. + */ + CHECK(!I->broadcast_flag || + (!jay_is_null(I->cond_flag) && + jay_is_null(I->dst) && + I->cond_flag.file == FLAG && + (I->op == JAY_OPCODE_CMP || I->op == JAY_OPCODE_MOV))); + + /* Standard modifiers only allowed on some instructions */ + CHECK(!I->conditional_mod || opinfo->cmod || I->op == JAY_OPCODE_CSEL); + CHECK(!I->saturate || opinfo->sat); + + unsigned num_srcs = I->num_srcs; + + if (I->predication) { + CHECK(num_srcs >= I->predication); + + if (jay_inst_has_default(I)) { + CHECK(jay_inst_get_default(I)->file == I->dst.file); + } + + CHECK(jay_is_flag(*jay_inst_get_predicate(I))); + CHECK(!jay_is_null(*jay_inst_get_predicate(I))); + + num_srcs -= I->predication; + } + + if (validate->post_ra) { + CHECK(jay_simd_width_logical(validate->func->shader, I) > 0); + CHECK(jay_simd_width_physical(validate->func->shader, I) > 0); + } + + /* Number of sources should match for our opcode. If opinfo->num_srcs + * is zero, then it may actually take a variable number of sources. + */ + CHECK(num_srcs == opinfo->num_srcs || opinfo->num_srcs == 0); + + for (unsigned s = 0; s < num_srcs; s++) { + if (jay_is_ssa(I->src[s]) && !jay_is_null(I->src[s])) { + unsigned expected = get_src_words(validate, I, s); + unsigned words = jay_num_values(I->src[s]); + if (I->op != JAY_OPCODE_SEND || s < 2) { + CHECK(expected == words); + } + + validate_flagness(validate, I->src[s], jay_src_type(I, s), "source"); + } + + CHECK(!I->src[s].negate || jay_has_src_mods(I, s)); + } + + switch (I->op) { + case JAY_OPCODE_SEL: + CHECK(jay_is_flag(I->src[2]) && "SEL src[2] (selector) must be a flag"); + break; + case JAY_OPCODE_SWAP: + CHECK(I->src[0].file == I->src[1].file && "SWAP files must match"); + break; + default: + break; + } +} + +static void +jay_validate_function(struct validate_state *validate) +{ + validate->defs = BITSET_CALLOC(validate->func->ssa_alloc); + validate->files = + calloc(validate->func->ssa_alloc, sizeof(validate->files[0])); + + jay_foreach_block(validate->func, block) { + validate->block = block; + validate->I = NULL; + + CHECK(block->successors[0] || !block->successors[1]); + + /* Post-RA we can remove physical jumps though they exist logically */ + if (block->successors[1] && !validate->post_ra) { + CHECK(jay_block_ending_jump(block) != NULL); + } + + /* If a block has multiple successors, and one of them has multiple + * predecessors, then we've detected a critical edge. 
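+ * A critical edge has no block of its own in which per-edge copies for phis + * could be placed, so such edges must have been split before this point.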
+ */ + if (jay_num_successors(block) > 1 && !validate->post_ra) { + jay_foreach_successor(block, succ) { + if (jay_num_predecessors(succ) > 1) { + chirp(validate, "Critical edge (B%u -> B%u) is not allowed", + block->index, succ->index); + } + } + } + + validate->block_state = 0; + jay_foreach_inst_in_block(block, inst) { + validate_inst(validate, inst); + } + } + + /* Validate that there are no dead phis. RA relies on this. */ + if (!validate->post_ra) { + jay_foreach_block(validate->func, block) { + jay_foreach_phi_src_in_block(block, phi) { + CHECK(BITSET_TEST(validate->defs, jay_phi_src_index(phi))); + } + } + } + + free(validate->defs); + free(validate->files); +} + +void +jay_validate(jay_shader *s, const char *when) +{ + struct validate_state validate = { .when = when, .post_ra = s->post_ra }; + + jay_foreach_function(s, f) { + validate.func = f; + jay_validate_function(&validate); + } + + if (validate.failed) { + fprintf(stderr, "jay shader that failed validation:\n"); + jay_print(stderr, s); + abort(); + } +} + +#endif diff --git a/src/intel/compiler/jay/jay_validate_ra.c b/src/intel/compiler/jay/jay_validate_ra.c new file mode 100644 index 00000000000..02bd20b57bd --- /dev/null +++ b/src/intel/compiler/jay/jay_validate_ra.c @@ -0,0 +1,217 @@ +/* + * Copyright 2026 Intel Corporation + * Copyright 2024 Alyssa Rosenzweig + * SPDX-License-Identifier: MIT + */ + +#include "util/ralloc.h" +#include "jay_ir.h" +#include "jay_opcodes.h" +#include "jay_private.h" + +/* Validatation doesn't make sense in release builds */ +#ifndef NDEBUG + +struct regfile { + /* For each register in each file, records the SSA index currently stored + * in that register (or zero if undefined contents). + */ + uint32_t *r[JAY_NUM_SSA_FILES]; + + /* Size of each register file */ + size_t n[JAY_NUM_SSA_FILES]; +}; + +static uint32_t * +reg(struct regfile *rf, enum jay_file file, uint32_t reg) +{ + /* FLAG and UFLAG share their registers. TODO: Rework? 
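+ * They name the same physical flag registers, so a definition through one + * file must be visible to uses through the other; remap so both index a + * single backing array.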
*/ + if (file == UFLAG) { + file = FLAG; + } + + assert(file < JAY_NUM_SSA_FILES); + assert(reg < rf->n[file]); + return &rf->r[file][reg]; +} + +static uint32_t * +def_reg(struct regfile *rf, jay_def x, uint32_t component) +{ + return reg(rf, x.file, x.reg + component); +} + +static void +print_regfile(struct regfile *rf, FILE *fp) +{ + fprintf(fp, "regfile: \n"); + jay_foreach_ssa_file(file) { + for (unsigned i = 0; i < rf->n[file]; ++i) { + uint32_t v = *reg(rf, file, i); + const char *prefixes = "ruf"; /* XXX: share with jay_print */ + + if (v) { + fprintf(fp, " %c%u = %u\n", prefixes[file], i, v); + } + } + } + fprintf(fp, "\n"); +} + +static bool +validate_src(struct jay_partition *partition, + jay_inst *I, + unsigned s, + struct regfile *rf, + jay_def def) +{ + jay_foreach_comp(def, c) { + uint32_t actual = *def_reg(rf, def, c); + + if (def.file == GPR) { + assert(jay_gpr_to_stride(partition, def.reg) == + jay_gpr_to_stride(partition, def.reg + c)); + } + + if (actual == 0 || actual != jay_channel(def, c)) { + fprintf(stderr, "invalid RA for source %u, channel %u.\n", s, c); + + fprintf(stderr, "expected index %u but", jay_channel(def, c)); + if (actual) + fprintf(stderr, " got index %u\n", actual); + else + fprintf(stderr, " register is undefined\n"); + + jay_print_inst(stderr, I); + print_regfile(rf, stderr); + return false; + } + } + + return true; +} + +static bool +validate_block(jay_function *func, jay_block *block, struct regfile *blocks) +{ + struct regfile *rf = &blocks[block->index]; + bool success = true; + + /* Pathological shaders can end up with loop headers that have only a + * single predecessor and act like normal blocks. Validate them as such, + * since RA treats them as such implicitly. Affects: + * + * dEQP-VK.graphicsfuzz.spv-stable-mergesort-dead-code + */ + bool loop_header = block->loop_header && jay_num_predecessors(block) > 1; + + /* Initialize the register file based on predecessors. */ + /* Initialize with the exit state of any one predecessor */ + jay_block *first_pred = jay_first_predecessor(block); + if (first_pred) { + struct regfile *pred_rf = &blocks[first_pred->index]; + + jay_foreach_ssa_file(f) { + memcpy(rf->r[f], pred_rf->r[f], rf->n[f] * sizeof(uint32_t)); + } + } + + /* TODO: Handle loop header validation better */ + if (!loop_header) { + /* Intersect with the other predecessor. If a register has different + * values coming in from each block, it is considered undefined at the + * start of the block. 
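+ * For example, if one predecessor exits with SSA index 5 in r3 and the other + * exits with index 7 there, a use of r3 in this block matches neither, so r3 + * is treated as undefined.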
+ */ + jay_foreach_predecessor(block, pred) { + struct regfile *pred_rf = &blocks[(*pred)->index]; + + jay_foreach_ssa_file(file) { + for (unsigned r = 0; r < rf->n[file]; ++r) { + if (*reg(rf, file, r) != *reg(pred_rf, file, r)) { + *reg(rf, file, r) = 0; + } + } + } + } + } + + jay_foreach_inst_in_block(block, I) { + /* Validate sources */ + jay_foreach_ssa_src(I, s) { + if (jay_channel(I->src[s], 0) != JAY_SENTINEL) { + success &= + validate_src(&func->shader->partition, I, s, rf, I->src[s]); + } + } + + /* Record destinations */ + jay_foreach_dst(I, dst) { + if (jay_channel(dst, 0) != JAY_SENTINEL) { + jay_foreach_comp(dst, c) { + *def_reg(rf, dst, c) = jay_channel(dst, c); + + if (dst.file == GPR) { + struct jay_partition *p = &func->shader->partition; + assert(jay_gpr_to_stride(p, dst.reg) == + jay_gpr_to_stride(p, dst.reg + c)); + } + } + } + } + + if (I->op == JAY_OPCODE_MOV && + jay_channel(I->dst, 0) == JAY_SENTINEL && + jay_is_ssa(I->src[0]) && + jay_channel(I->src[0], 0) == JAY_SENTINEL) { + + /* Lowered live range splits don't have SSA associated, handle + * directly at the register level. + */ + assert(jay_num_values(I->dst) == jay_num_values(I->src[0])); + + jay_foreach_comp(I->dst, c) { + *def_reg(rf, I->dst, c) = *def_reg(rf, I->src[0], c); + } + } else if (I->op == JAY_OPCODE_SWAP) { + assert(jay_num_values(I->src[0]) == jay_num_values(I->src[1])); + + jay_foreach_comp(I->src[0], c) { + SWAP(*def_reg(rf, I->src[0], c), *def_reg(rf, I->src[1], c)); + } + } + } + + return success; +} + +void +jay_validate_ra(jay_function *func) +{ + bool succ = true; + linear_ctx *lin_ctx = linear_context(func->shader); + struct regfile *blocks = + linear_zalloc_array(lin_ctx, struct regfile, func->num_blocks); + + jay_foreach_block(func, block) { + struct regfile *b = &blocks[block->index]; + assert(block->index < func->num_blocks); + + jay_foreach_ssa_file(file) { + b->n[file] = jay_num_regs(func->shader, file); + b->r[file] = linear_zalloc_array(lin_ctx, uint32_t, b->n[file]); + } + } + + jay_foreach_block(func, block) { + succ &= validate_block(func, block, blocks); + } + + if (!succ) { + jay_print_func(stderr, func); + UNREACHABLE("invalid RA"); + } + + linear_free_context(lin_ctx); +} + +#endif /* NDEBUG */ diff --git a/src/intel/compiler/jay/meson.build b/src/intel/compiler/jay/meson.build new file mode 100644 index 00000000000..e9c47ada78c --- /dev/null +++ b/src/intel/compiler/jay/meson.build @@ -0,0 +1,109 @@ +# Copyright 2017 Intel Corporation +# SPDX-License-Identifier: MIT + +jay_opcodes = custom_target( + input : ['jay_opcodes_gen.py'], + output : ['jay_opcodes.c', 'jay_opcodes.h'], + command : [prog_python, '@INPUT@', '--code', '@OUTPUT0@', '--header', '@OUTPUT1@'], + depend_files : files('jay_opcodes.py'), +) + +idep_jay_opcodes_h = declare_dependency( + sources : [jay_opcodes[1]], + include_directories : include_directories('.'), +) + +jay_extra_info_h = custom_target( + input : ['jay_extra_info.h.py'], + output : 'jay_extra_info.h', + command : [prog_python, '@INPUT@', '@OUTPUT@'], + depend_files : files('jay_opcodes.py'), +) + +idep_jay_extra_info_h = declare_dependency( + sources : [jay_extra_info_h], + include_directories : include_directories('.'), +) + +jay_builder_opcodes_h = custom_target( + input : 'jay_builder_opcodes.h.py', + output : 'jay_builder_opcodes.h', + command : [prog_python, '@INPUT@', '@OUTPUT@'], + depend_files : files('jay_opcodes.py'), +) + +idep_jay_builder_opcodes_h = declare_dependency( + sources : [jay_builder_opcodes_h], + include_directories : 
include_directories('.'), +) + +jay_nir_algebraic = custom_target( + 'jay_nir_algebraic.c', + input : ['jay_nir_algebraic.py'], + output : 'jay_nir_algebraic.c', + command : [prog_python, '@INPUT@', '@OUTPUT@', '-p', dir_compiler_nir] , + depend_files : nir_algebraic_depends, +) + +libintel_compiler_jay_files = files( + 'jay.h', + 'jay_assign_flags.c', + 'jay_from_nir.c', + 'jay_ir.h', + 'jay_liveness.c', + 'jay_lower_post_ra.c', + 'jay_lower_pre_ra.c', + 'jay_lower_scoreboard.c', + 'jay_lower_spill.c', + 'jay_opt_dead_code.c', + 'jay_opt_control_flow.c', + 'jay_opt_propagate.c', + 'jay_print.c', + 'jay_private.h', + 'jay_repair_ssa.c', + 'jay_register_allocate.c', + 'jay_simd_width.c', + 'jay_spill.c', + 'jay_to_binary.c', + 'jay_validate.c', + 'jay_validate_ra.c', +) + +libintel_compiler_jay = static_library( + 'intel_compiler_jay', + [libintel_compiler_jay_files, jay_nir_algebraic, jay_opcodes[0]], + include_directories : [inc_include, inc_src, inc_intel], + c_args : [no_override_init_args, '-Wno-c23-extensions', '-Wno-array-bounds'], + gnu_symbol_visibility : 'hidden', + dependencies : [idep_nir_headers, idep_jay_opcodes_h, idep_jay_builder_opcodes_h, idep_jay_extra_info_h, idep_mesautil, idep_intel_dev], + build_by_default : false, +) + +idep_intel_compiler_jay = declare_dependency( + link_with : [libintel_compiler_jay], + dependencies : [ + idep_nir, + idep_vtn, + ], +) + +if with_tests + test( + 'jay_tests', + executable( + 'jay_tests', + files( + 'test/test-lower-post-ra.cpp', + 'test/test-optimizer.cpp', + 'test/test-repair-ssa.cpp', + ), + c_args : [c_msvc_compat_args, no_override_init_args], + gnu_symbol_visibility : 'hidden', + include_directories : [inc_include, inc_src, inc_intel], + dependencies: [idep_gtest, idep_nir, idep_jay_opcodes_h, idep_jay_builder_opcodes_h, idep_jay_extra_info_h, idep_mesautil, idep_intel_dev], + link_with : [libintel_compiler_jay], + ), + suite : ['intel'], + protocol : 'gtest', + ) +endif diff --git a/src/intel/compiler/jay/register-file.md b/src/intel/compiler/jay/register-file.md new file mode 100644 index 00000000000..b2053ccf348 --- /dev/null +++ b/src/intel/compiler/jay/register-file.md @@ -0,0 +1,57 @@ +# Glossary + +**lane**: A single work-item. + +**subgroup**: A collection of 8, 16, or 32 lanes executing in lockstep. +Avoid using the term _thread_ as it is ambiguous. + +**uniform**: A value that has the same value in every active lane of a subgroup. +Sometimes called _convergent_. Opposite of "non-uniform". + +**non-uniform**: A value that may have different values in different active +lanes within a subgroup. Sometimes called _divergent_. Opposite of "uniform". + +**GPR**: General-purpose register, a single non-uniform value viewed from the +perspective of a single lane. This is a 'virtual' or 'logical' register within +the SIMT programming model. It does not represent a physical machine +register. For that, see "GRF". + +**UGPR**: Uniform general purpose register, a single uniform value. This is +again a virtual or logical register. + +**GRF**: A physical Intel GPU register. On Xe2+, a GRF is 512-bits. On older +platforms, a GRF is 256-bits. Depending on the platform and the SIMD width, +different numbers of GRFs required to store a single GPR, and different numbers +of UGPRs fit into a single GRF. In SIMD32 mode on Xe2, 1 GPR requires 2 GRFs, +and 16 UGPRs fit into 1 GRF. + +**scalar**: A single value from the perspective of a single lane; a single GPR +or UGPR. Note that a scalar may be either uniform or non-uniform. 
Opposite of +"vector". + +**vector**: A collection of multiple values from the perspective of a single +lane. All scalars within the vector must be identically be GPRs or UGPRs. + +# Introduction + +Jay separates the logical register files (GPR and UGPR) from the +unified physical register file. We assign registers independently for each +logical file, and then post-RA we remap to physical GRFs. This simplifies RA. + +We decide a static GPR/UGPR split up front. Ideally, we'd just use the +first N registers for GPRs and the rest for UGPRs, or something like +that. Unfortunately, several hardware issues complicate this scheme... + +# End-of-thread SENDs + +End-of-thread SENDs require their source is in r112-r127. As their source will +always be per-thread, we want to make sure these are GPRs. + +# Payloads + +At the start of each thread, the register file is preloaded with a payload. +Parts of the payload act like UGPRs, parts act like GPRs, and parts act like... +something weird and in between. To minimize copying, we want to assign UGPRs to +the UGPR parts of the payload and GPRs to the GPR parts. As for the weird cases, +we model them as UGPR vectors and use special opcodes (lowered late to +regioning) to unpack to GPRs for normal handling. diff --git a/src/intel/compiler/jay/test/jay_test.h b/src/intel/compiler/jay/test/jay_test.h new file mode 100644 index 00000000000..43cc48b87ef --- /dev/null +++ b/src/intel/compiler/jay/test/jay_test.h @@ -0,0 +1,141 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_private.h" +#include "shader_enums.h" + +static inline jay_block * +jay_test_block(jay_function *f) +{ + jay_block *blk = jay_new_block(f); + list_addtail(&blk->link, &f->blocks); + return blk; +} + +/* Helper to generate a jay_builder suitable for creating test instructions */ +static inline jay_builder * +jay_test_builder(void *memctx) +{ + jay_shader *s = jay_new_shader(memctx, MESA_SHADER_COMPUTE); + jay_function *f = jay_new_function(s); + s->partition.base8 = 8; + + struct intel_device_info *devinfo = + rzalloc(memctx, struct intel_device_info); + s->devinfo = devinfo; + s->dispatch_width = 32; + + unsigned verx10 = 200; + devinfo->verx10 = verx10; + devinfo->ver = verx10 / 10; + assert(devinfo->ver > 0); + + /* We'll use low indices for test values */ + f->ssa_alloc = 10; + + jay_builder *b = rzalloc(memctx, jay_builder); + *b = jay_init_builder(f, jay_after_block(jay_test_block(f))); + return b; +} + +/* Helper to compare for logical equality of instructions. Need to compare the + * pointers, then compare raw data. + */ +static inline bool +jay_inst_equal(jay_inst *A, jay_inst *B) +{ + /* Check the plain old data portion of jay_inst. */ + unsigned header = sizeof(struct list_head); + if (memcmp((uint8_t *) A + header, (uint8_t *) B + header, + sizeof(jay_inst) - header)) + return false; + + /* All of the sizes are plain data. They match, so do a deep compare. 
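+ * (num_srcs and the opcode-specific info size are covered by the plain-data + * compare above, so both instructions agree on the length computed below.)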
*/ + size_t size = (A->num_srcs * sizeof(jay_def)) + jay_inst_info_size(A); + return !memcmp(A->src, B->src, size); +} + +static inline bool +jay_block_equal(jay_block *A, jay_block *B) +{ + if (list_length(&A->instructions) != list_length(&B->instructions)) + return false; + + list_pair_for_each_entry(jay_inst, I, J, &A->instructions, &B->instructions, + link) { + if (!jay_inst_equal(I, J)) { + return false; + } + } + + return true; +} + +static inline bool +jay_function_equal(jay_function *A, jay_function *B) +{ + if (list_length(&A->blocks) != list_length(&B->blocks)) + return false; + + list_pair_for_each_entry(jay_block, blockA, blockB, &A->blocks, &B->blocks, + link) { + if (!jay_block_equal(blockA, blockB)) + return false; + } + + return true; +} + +static inline bool +jay_shader_equal(jay_shader *A, jay_shader *B) +{ + if (list_length(&A->functions) != list_length(&B->functions)) + return false; + + list_pair_for_each_entry(jay_function, functionA, functionB, &A->functions, + &B->functions, link) { + if (!jay_function_equal(functionA, functionB)) + return false; + } + + return true; +} + +#define ASSERT_SHADER_EQUAL(A, B) \ + if (!jay_shader_equal(A, B)) { \ + ADD_FAILURE(); \ + fprintf(stderr, "Pass produced unexpected results"); \ + fprintf(stderr, " Actual:\n"); \ + jay_print(stderr, A); \ + fprintf(stderr, " Expected:\n"); \ + jay_print(stderr, B); \ + fprintf(stderr, "\n"); \ + } + +#define INSTRUCTION_CASE_GEN(instr, expected, pass, validate) \ + do { \ + jay_builder *A = jay_test_builder(mem_ctx); \ + jay_builder *B = jay_test_builder(mem_ctx); \ + { \ + jay_builder *b = A; \ + instr; \ + } \ + if (validate) \ + jay_validate(A->shader, "test setup"); \ + { \ + jay_builder *b = B; \ + expected; \ + } \ + JAY_PASS(A->shader, pass); \ + ASSERT_SHADER_EQUAL(A->shader, B->shader); \ + } while (0) + +#define INSTRUCTION_CASE(instr, expected, pass) \ + INSTRUCTION_CASE_GEN(instr, expected, pass, true) diff --git a/src/intel/compiler/jay/test/test-lower-post-ra.cpp b/src/intel/compiler/jay/test/test-lower-post-ra.cpp new file mode 100644 index 00000000000..209d944f347 --- /dev/null +++ b/src/intel/compiler/jay/test/test-lower-post-ra.cpp @@ -0,0 +1,82 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_test.h" + +#include + +#define CASE(instr, expected) \ + INSTRUCTION_CASE( \ + { \ + A->shader->post_ra = true; \ + instr; \ + }, \ + { \ + B->shader->post_ra = true; \ + expected; \ + }, \ + jay_lower_post_ra) + +#define PRE jay_add_predicate_else +#define POST jay_add_predicate +#define CFLAG jay_set_cond_flag + +#define NEGCASE(x) CASE(x, x) + +class LowerPostRA : public testing::Test { + protected: + LowerPostRA() + { + mem_ctx = ralloc_context(NULL); + + x = jay_bare_reg(GPR, 1); + y = jay_bare_reg(GPR, 2); + z = jay_bare_reg(GPR, 3); + u4 = jay_bare_reg(UGPR, 4); + f0 = jay_bare_reg(FLAG, 0); + f1 = jay_bare_reg(FLAG, 1); + f2 = jay_bare_reg(FLAG, 2); + } + + ~LowerPostRA() + { + ralloc_free(mem_ctx); + } + + jay_inst *I; + void *mem_ctx; + jay_def x, y, z, u4, f0, f1, f2, nul = jay_null(); +}; + +TEST_F(LowerPostRA, Tied) +{ + CASE(PRE(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), f0, z), + POST(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), f0)); + + CASE(PRE(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), jay_negate(f0), z), + POST(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), jay_negate(f0))); +} + +TEST_F(LowerPostRA, InsertMove) +{ + CASE(PRE(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), f0, x), { + POST(b, jay_MOV(b, z, 
x), jay_negate(f0)); + POST(b, jay_ADD(b, JAY_TYPE_U32, z, x, y), f0); + }); +} + +TEST_F(LowerPostRA, RewriteToSel) +{ + CASE(PRE(b, jay_MOV(b, z, y), f0, x), + jay_SEL(b, JAY_TYPE_U32, z, x, y, jay_negate(f0))); +} + +TEST_F(LowerPostRA, CopyUGPR) +{ + NEGCASE(jay_MOV(b, x, u4)); + NEGCASE(jay_MOV(b, u4, x)); +} diff --git a/src/intel/compiler/jay/test/test-optimizer.cpp b/src/intel/compiler/jay/test/test-optimizer.cpp new file mode 100644 index 00000000000..739a2d15610 --- /dev/null +++ b/src/intel/compiler/jay/test/test-optimizer.cpp @@ -0,0 +1,312 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "util/lut.h" +#include "jay_builder.h" +#include "jay_ir.h" +#include "jay_private.h" +#include "jay_test.h" + +#include + +static void +jay_optimize_and_dce(jay_shader *shader) +{ + JAY_PASS(shader, jay_opt_propagate_forwards); + JAY_PASS(shader, jay_opt_propagate_backwards); + JAY_PASS(shader, jay_opt_dead_code); +} + +#define CASE(instr, expected) \ + INSTRUCTION_CASE( \ + { \ + instr; \ + jay_UNIT_TEST_u32(b, out); \ + }, \ + { \ + expected; \ + jay_UNIT_TEST_u32(b, out); \ + }, \ + jay_optimize_and_dce) + +#define NEGCASE(instr) CASE(instr, instr) +#define UNIT jay_UNIT_TEST_u32 + +#define NEG(x) jay_negate(x) + +#define MOV(T, src0) \ + ({ \ + jay_def dst = jay_alloc_def(b, GPR, 1); \ + jay_MODIFIER(b, T, dst, src0); \ + dst; \ + }) + +class Optimizer : public testing::Test { + protected: + Optimizer() + { + mem_ctx = ralloc_context(NULL); + + out = jay_scalar(GPR, 8); + wx = jay_scalar(TEST_FILE, 1); + wy = jay_scalar(TEST_FILE, 1); + wz = jay_scalar(TEST_FILE, 1); + } + + ~Optimizer() + { + ralloc_free(mem_ctx); + } + + void *mem_ctx; + + jay_def out, wx, wy, wz; +}; + +static enum jay_type float_types[] = { + JAY_TYPE_F16, + JAY_TYPE_F32, +}; + +TEST_F(Optimizer, Copyprop) +{ + CASE(jay_ADD(b, JAY_TYPE_U32, out, wx, jay_MOV_u32(b, wy)), + jay_ADD(b, JAY_TYPE_U32, out, wx, wy)); + + CASE(jay_ADD(b, JAY_TYPE_U32, out, wx, jay_MOV_u32(b, wy)), + jay_ADD(b, JAY_TYPE_U32, out, wx, wy)); +} + +TEST_F(Optimizer, FusedNeg) +{ + for (unsigned i = 0; i < ARRAY_SIZE(float_types); ++i) { + enum jay_type T = float_types[i]; + + CASE(jay_ADD(b, T, out, wx, MOV(T, NEG(wy))), + jay_ADD(b, T, out, wx, NEG(wy))); + + CASE(jay_MUL(b, T, out, MOV(T, NEG(wy)), NEG(wx)), + jay_MUL(b, T, out, NEG(wy), NEG(wx))); + + CASE(jay_MAD(b, T, out, MOV(T, NEG(wy)), wz, NEG(MOV(T, NEG(wx)))), + jay_MAD(b, T, out, NEG(wy), wz, wx)); + } +} + +TEST_F(Optimizer, SELToFloat) +{ + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_LT, flag, 3, x); + jay_SEL(b, JAY_TYPE_U32, out, wx, MOV(JAY_TYPE_F32, NEG(wy)), flag); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_LT, flag, 3, x); + jay_SEL(b, JAY_TYPE_F32, out, wx, NEG(wy), flag); + }); +} + +TEST_F(Optimizer, FusedNot) +{ + CASE(jay_BFN(b, out, wx, jay_NOT_u32(b, wy), 0, UTIL_LUT3(a & b)), + jay_BFN(b, out, wx, wy, 0, UTIL_LUT3(a & ~b))); + + CASE(jay_AND(b, JAY_TYPE_U32, out, wx, jay_NOT_u32(b, wy)), + jay_AND(b, JAY_TYPE_U32, out, wx, jay_negate(wy))); + + CASE(jay_XOR(b, JAY_TYPE_U32, out, jay_NOT_u32(b, wx), wy), + jay_XOR(b, JAY_TYPE_U32, out, jay_negate(wx), wy)); + + CASE(jay_OR(b, JAY_TYPE_U32, out, jay_NOT_u32(b, wx), jay_NOT_u32(b, wy)), + jay_OR(b, 
JAY_TYPE_U32, out, jay_negate(wx), jay_negate(wy))); +} + +TEST_F(Optimizer, NegativeFusedFneg) +{ + for (unsigned i = 0; i < ARRAY_SIZE(float_types); ++i) { + enum jay_type T = float_types[i]; + NEGCASE(jay_ADD(b, JAY_TYPE_U32, out, wx, MOV(T, NEG(wy)))); + NEGCASE(jay_ADD(b, JAY_TYPE_S32, out, wx, MOV(T, NEG(wy)))); + } +} + +/* TODO: test fneg with f64 */ + +TEST_F(Optimizer, FusedSat) +{ + for (unsigned i = 0; i < ARRAY_SIZE(float_types); ++i) { + enum jay_type T = float_types[i]; + + CASE( + { + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, T, x, wx, MOV(T, NEG(wy))); + jay_MODIFIER(b, T, out, x)->saturate = true; + }, + { jay_ADD(b, T, out, wx, NEG(wy))->saturate = true; }); + + CASE( + { + jay_def x = jay_alloc_def(b, GPR, 1); + jay_MUL(b, T, x, wx, MOV(T, NEG(wy))); + jay_MODIFIER(b, T, out, x)->saturate = true; + }, + { jay_MUL(b, T, out, wx, NEG(wy))->saturate = true; }); + + CASE( + { + jay_def x = jay_alloc_def(b, GPR, 1); + jay_MAX(b, T, x, wx, MOV(T, NEG(wy)))->saturate = true; + jay_MODIFIER(b, T, out, x)->saturate = true; + }, + { jay_MAX(b, T, out, wx, NEG(wy))->saturate = true; }); + } +} + +TEST_F(Optimizer, InverseBallotPropagate) +{ + CASE( + { + jay_def x = jay_alloc_def(b, UGPR, 1); + jay_def f = jay_alloc_def(b, FLAG, 1); + jay_ADD(b, JAY_TYPE_U32, x, wx, wy); + jay_MOV(b, f, x); + jay_SEL(b, JAY_TYPE_U32, out, wx, wy, f); + }, + { + UNUSED jay_def x = jay_alloc_def(b, UGPR, 1); + jay_def f = jay_alloc_def(b, FLAG, 1); + jay_ADD(b, JAY_TYPE_U32, f, wx, wy); + jay_SEL(b, JAY_TYPE_U32, out, wx, wy, f); + }); +} + +TEST_F(Optimizer, GtZero) +{ + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_LT, flag, 0, x); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_inst *add = jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_set_conditional_mod(b, add, flag, JAY_CONDITIONAL_GT); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); +} + +TEST_F(Optimizer, MultipleCmp) +{ + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def flag2 = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_LT, flag, 0, x); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_GT, flag2, 0, x); + jay_SEL(b, JAY_TYPE_U32, out, x, jay_SEL_u32(b, x, 123, flag), flag2); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def flag2 = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_inst *add = jay_ADD(b, JAY_TYPE_S32, x, wx, NEG(wy)); + jay_set_conditional_mod(b, add, flag, JAY_CONDITIONAL_GT); + jay_CMP(b, JAY_TYPE_S32, JAY_CONDITIONAL_GT, flag2, 0, x); + jay_SEL(b, JAY_TYPE_U32, out, x, jay_SEL_u32(b, x, 123, flag), flag2); + }); +} + +TEST_F(Optimizer, TypeNeutralConditionalMods) +{ + enum jay_conditional_mod mods[] = { + JAY_CONDITIONAL_NE, + JAY_CONDITIONAL_EQ, + }; + + for (unsigned i = 0; i < 2; ++i) { + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_BFN(b, x, wx, wy, wz, UTIL_LUT3(a & b & c)); + jay_CMP(b, JAY_TYPE_S32, mods[i], flag, x, 0); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_inst *bfn3 = jay_BFN(b, x, wx, wy, wz, UTIL_LUT3(a & b & c)); + jay_set_conditional_mod(b, bfn3, flag, 
mods[i]); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); + + CASE( + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_AND(b, JAY_TYPE_U32, x, wx, wy); + jay_CMP(b, JAY_TYPE_S32, mods[i], flag, x, 0); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }, + { + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_inst *an = jay_AND(b, JAY_TYPE_U32, x, wx, wy); + jay_set_conditional_mod(b, an, flag, mods[i]); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); + } +} + +TEST_F(Optimizer, SignednessMismatchConditionalMods) +{ + enum jay_conditional_mod mods[] = { + JAY_CONDITIONAL_LE, + JAY_CONDITIONAL_GT, + }; + + for (unsigned i = 0; i < 2; ++i) { + NEGCASE({ + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_BFN(b, x, wx, wy, wz, UTIL_LUT3(a & b & c)); + jay_CMP(b, JAY_TYPE_S32, mods[i], flag, x, 0); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); + } +} + +TEST_F(Optimizer, FloatMismatchConditionalMods) +{ + enum jay_conditional_mod mods[] = { + JAY_CONDITIONAL_NAN, + JAY_CONDITIONAL_EQ, + JAY_CONDITIONAL_NE, + JAY_CONDITIONAL_LT, + }; + + for (unsigned i = 0; i < 2; ++i) { + NEGCASE({ + jay_def flag = jay_alloc_def(b, FLAG, 1); + jay_def x = jay_alloc_def(b, GPR, 1); + jay_BFN(b, x, wx, wy, wz, UTIL_LUT3(a & b & c)); + jay_CMP(b, JAY_TYPE_F32, mods[i], flag, x, 0); + jay_SEL(b, JAY_TYPE_U32, out, x, 123, flag); + }); + } +} diff --git a/src/intel/compiler/jay/test/test-repair-ssa.cpp b/src/intel/compiler/jay/test/test-repair-ssa.cpp new file mode 100644 index 00000000000..8d117746eee --- /dev/null +++ b/src/intel/compiler/jay/test/test-repair-ssa.cpp @@ -0,0 +1,213 @@ +/* + * Copyright 2026 Intel Corporation + * Copyright 2024 Alyssa Rosenzweig + * Copyright 2022 Collabora, Ltd. + * SPDX-License-Identifier: MIT + */ + +#include "jay_builder.h" +#include "jay_builder_opcodes.h" +#include "jay_ir.h" +#include "jay_test.h" + +#include + +JAY_DEFINE_FUNCTION_PASS(pass, jay_repair_ssa) + +#define CASE(instr) \ + INSTRUCTION_CASE_GEN( \ + { \ + UNUSED bool repaired = false; \ + b->func->ssa_alloc = 1; \ + instr \ + }, \ + { \ + UNUSED bool repaired = true; \ + b->func->ssa_alloc = 1; \ + instr \ + }, \ + pass, false) + +class RepairSSA : public testing::Test { + protected: + RepairSSA() + { + mem_ctx = ralloc_context(NULL); + } + + ~RepairSSA() + { + ralloc_free(mem_ctx); + } + + void *mem_ctx; +}; + +static jay_def +jay_phi_2(jay_builder *b, jay_block *p1, jay_def v1, jay_block *p2, jay_def v2) +{ + assert(v2.file == v1.file || jay_is_null(v2)); + jay_def idx = jay_alloc_def(b, v1.file, 1); + jay_PHI_DST(b, idx); + jay_cursor saved = b->cursor; + + b->cursor = jay_after_block(p1); + jay_PHI_SRC_u32(b, v1, jay_index(idx)); + + b->cursor = jay_after_block(p2); + jay_PHI_SRC_u32(b, jay_is_null(v2) ? 
idx : v2, jay_index(idx)); + + b->cursor = saved; + return idx; +} + +TEST_F(RepairSSA, Local) +{ + CASE({ + jay_def x = jay_MOV_u32(b, 0xcafe); + jay_def y = jay_MOV_u32(b, 0xefac); + + if (repaired) { + jay_UNIT_TEST(b, jay_ADD_f32(b, y, x)); + } else { + jay_ADD(b, JAY_TYPE_F32, x, y, x); + jay_UNIT_TEST(b, x); + } + }); +} + +/* A + * / \ + * B C + * \ / + * D + */ +TEST_F(RepairSSA, IfElse) +{ + CASE({ + jay_block *A = jay_first_block(b->func); + jay_block *B = jay_test_block(b->func); + jay_block *C = jay_test_block(b->func); + jay_block *D = jay_test_block(b->func); + + jay_block_add_successor(A, B); + jay_block_add_successor(A, C); + + jay_block_add_successor(B, D); + jay_block_add_successor(C, D); + + b->cursor = jay_after_block(A); + jay_IF(b); + + b->cursor = jay_after_block(B); + jay_def x = jay_MOV_u32(b, 0xcafe); + jay_def y = jay_MOV_u32(b, 0xbade); + + b->cursor = jay_after_block(C); + jay_ELSE(b); + jay_def x2 = repaired ? jay_alloc_def(b, UGPR, 1) : x; + jay_MOV(b, x2, 0xefac); + jay_def y2 = jay_MOV_u32(b, 0xbaee); + jay_ENDIF(b); + + b->cursor = jay_after_block(D); + jay_def y3 = jay_phi_2(b, B, y, C, y2); + if (repaired) + x = jay_phi_2(b, B, x, C, x2); + + jay_UNIT_TEST(b, jay_ADD_f32(b, x, y3)); + }); +} + +/* + * H + * | + * A---| + * / \ | + * B C | + * | / | + * | D---- + * | + * |-E + */ +TEST_F(RepairSSA, Loop) +{ + CASE({ + jay_block *H = jay_first_block(b->func); + jay_block *A = jay_test_block(b->func); + jay_block *B = jay_test_block(b->func); + jay_block *C = jay_test_block(b->func); + jay_block *D = jay_test_block(b->func); + jay_block *E = jay_test_block(b->func); + + jay_block_add_successor(H, A); + jay_block_add_successor(A, B); + jay_block_add_successor(A, C); + jay_block_add_successor(B, E); + jay_block_add_successor(C, D); + jay_block_add_successor(D, A); + + A->loop_header = true; + + b->cursor = jay_after_block(H); + jay_def x = jay_MOV_u32(b, 0xcafe); + + b->cursor = jay_after_block(A); + jay_def x_in = repaired ? jay_alloc_def(b, UGPR, 1) : x; + jay_def x_out = repaired ? 
jay_alloc_def(b, UGPR, 1) : x; + if (repaired) { + jay_PHI_DST(b, x_in); + } + jay_IF(b); + + b->cursor = jay_after_block(H); + if (repaired) { + jay_PHI_SRC_u32(b, x, jay_index(x_in)); + } + + b->cursor = jay_after_block(B); + jay_BREAK(b); + + b->cursor = jay_after_block(D); + jay_ADD(b, JAY_TYPE_U32, x_out, x_in, 1); + if (repaired) { + jay_PHI_SRC_u32(b, x_out, jay_index(x_in)); + } + jay_WHILE(b); + + b->cursor = jay_after_block(E); + jay_UNIT_TEST(b, x_in); + }); +} + +/* Same setup as IfElse */ +TEST_F(RepairSSA, TrivialPhisOptimized) +{ + CASE({ + jay_block *A = jay_first_block(b->func); + jay_block *B = jay_test_block(b->func); + jay_block *C = jay_test_block(b->func); + jay_block *D = jay_test_block(b->func); + + jay_block_add_successor(A, B); + jay_block_add_successor(A, C); + + jay_block_add_successor(B, D); + jay_block_add_successor(C, D); + + b->cursor = jay_after_block(A); + jay_def x = jay_MOV_u32(b, 0xcafe); + jay_IF(b); + + b->cursor = jay_after_block(C); + jay_ELSE(b); + jay_ENDIF(b); + + b->cursor = jay_after_block(D); + if (repaired) { + b->func->ssa_alloc++; + } + + jay_UNIT_TEST(b, jay_ADD_f32(b, x, x)); + }); +} diff --git a/src/intel/compiler/meson.build b/src/intel/compiler/meson.build index cdfdd00d5f8..0a2c0c1f66a 100644 --- a/src/intel/compiler/meson.build +++ b/src/intel/compiler/meson.build @@ -35,6 +35,7 @@ brw_device_sha1_gen_src = custom_target('brw_device_sha1_gen.c', command : [prog_python, '@INPUT0@', '--out', '@OUTPUT@']) subdir('brw') +subdir('jay') if with_intel_elk subdir('elk')