mesa/src/intel/compiler/jay/jay_liveness.c

Ignoring revisions in .git-blame-ignore-revs. Click here to bypass and see the normal blame view.

229 lines
6.8 KiB
C
Raw Normal View History

intel: add Jay Jay is a new SSA-based compiler for Intel GPUs. This is an early work-in-progress. It isn't ready to ship, but we'd like to move development in tree rather than rebasing the world every week. Please don't bother testing yet - we know the status and we're working on it! Jay's design is similar to other modern NIR backends, particularly ACO, NAK and AGX. It is fully SSA, deconstructing phis after RA. We use a Colombet register allocator similar to NAK, allowing us to handle Intel's complex register regioning restrictions in a straightforward way. Spilling logical registers is straightforward with Braun-Hack. Thanks to the SSA-based design, the entire backend is essentially linear time, regardless of register pressure, addressing brw's excessive compile time when especially spilling with brw. In this current early draft, we support a limited subset of all three APIs on Xe2. A lot works and a lot doesn't. The core compiler is there (spilling, scoreboarding, SIMD32, etc should more or less work), but there are details to fill in for both performance and correctness. We essentially pass conformance on OpenGL ES 3.0 and OpenCL 3.0, and we're busy iterating on Vulkan. Likewise, additional hardware support will come down the line. There's nothing fundamentally Xe2-specific here. I just have a Lunarlake laptop on my desk, Ken has a Battlemage card, and we had to pick _something_ as the first target. Co-authored-by: Kenneth Graunke <kenneth@whitecape.org> Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40835>
2025-11-27 17:57:25 -05:00
/*
* Copyright 2026 Intel Corporation
* SPDX-License-Identifier: MIT
*/
#include "util/bitset.h"
#include "util/macros.h"
#include "util/sparse_bitset.h"
#include "util/u_math.h"
#include "util/u_worklist.h"
#include "jay_ir.h"
#include "jay_opcodes.h"
#include "jay_private.h"
/* LiveIn = GEN + (LiveOut - KILL) */
static void
update_liveness_for_inst(BITSET_WORD *dead_defs,
struct u_sparse_bitset *live_in,
jay_inst *I)
{
/* No destination is live-in before the instruction, but any destination not
* live-in after is immediately dead.
*/
jay_foreach_dst_index(I, _, def) {
if (u_sparse_bitset_test(live_in, def)) {
u_sparse_bitset_clear(live_in, def);
} else {
BITSET_SET(dead_defs, def);
}
}
if (I->op == JAY_OPCODE_PHI_SRC) {
/* Phi sources do not require last-use bits. */
jay_foreach_src_index(I, src_idx, comp, index) {
u_sparse_bitset_set(live_in, index);
}
} else {
BITSET_ZERO(I->last_use);
unsigned last_use_i = 0;
jay_foreach_src_index(I, s, comp, index) {
/* If the source is not live after this instruction, but becomes
* live at this instruction, this is the last use.
*/
if (!u_sparse_bitset_test(live_in, index)) {
assert(last_use_i < JAY_NUM_LAST_USE_BITS);
BITSET_SET(I->last_use, last_use_i);
}
u_sparse_bitset_set(live_in, index);
++last_use_i;
}
}
}
/**
* Calculate liveness information for SSA values.
*
* This populates the jay_block::live_in/live_out bitsets and last_use flags.
*/
void
jay_compute_liveness(jay_function *f)
{
u_worklist worklist;
u_worklist_init(&worklist, f->num_blocks, NULL);
ralloc_free(f->dead_defs);
f->dead_defs = BITSET_RZALLOC(f, f->ssa_alloc);
jay: introduce a physical control flow graph Consider: u0 = foo() if (divergent) { u0 = bar() r0 = baz(u0) } else { r0 = quux(u0) } Logically, this is fine, there is no interference between bar() and u0. But physically, both sides of the if execute so the bar() write to u0 overwrites the variable the else reads. So this is a miscompile. The solution is to model the extra edges in the physical control flow graph, which lives next to the existing logical control flow graph. Liveness for UGPRs now follows the physical CFG, while liveness for GPRs continues to follow the logical CFG. That models the interference properly, while still allowing phis to work as before (since phis writing UGPRs follow uniform bits of control flow that are necessarily critical edge free for the same reason the logical CFG is). Because our RA copies shuffled registers back at block ends (following Colombet), there's no issue with live range splits here (unlike aco which inserts phis for this case and then needs to worry about critical edges around those phis). There might still be an extremely-challenging-to-hit bug here with UGPR spilling which I need to think more about. It might be fine as-is? Not convinced though. But this is big enough and strictly less broken than what we have right now and the full solution will build on this, so here we are. Fixes artefating in SuperTuxKart and Celestia knows what else. Totals: Instrs: 2770938 -> 2771269 (+0.01%); split: -0.00%, +0.02% CodeSize: 40133712 -> 40138480 (+0.01%); split: -0.01%, +0.02% Totals from 158 (5.97% of 2647) affected shaders: Instrs: 514523 -> 514854 (+0.06%); split: -0.02%, +0.09% CodeSize: 7603040 -> 7607808 (+0.06%); split: -0.03%, +0.09% Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41215>
2026-04-22 10:53:46 -04:00
BITSET_WORD *uniform = BITSET_CALLOC(f->ssa_alloc);
intel: add Jay Jay is a new SSA-based compiler for Intel GPUs. This is an early work-in-progress. It isn't ready to ship, but we'd like to move development in tree rather than rebasing the world every week. Please don't bother testing yet - we know the status and we're working on it! Jay's design is similar to other modern NIR backends, particularly ACO, NAK and AGX. It is fully SSA, deconstructing phis after RA. We use a Colombet register allocator similar to NAK, allowing us to handle Intel's complex register regioning restrictions in a straightforward way. Spilling logical registers is straightforward with Braun-Hack. Thanks to the SSA-based design, the entire backend is essentially linear time, regardless of register pressure, addressing brw's excessive compile time when especially spilling with brw. In this current early draft, we support a limited subset of all three APIs on Xe2. A lot works and a lot doesn't. The core compiler is there (spilling, scoreboarding, SIMD32, etc should more or less work), but there are details to fill in for both performance and correctness. We essentially pass conformance on OpenGL ES 3.0 and OpenCL 3.0, and we're busy iterating on Vulkan. Likewise, additional hardware support will come down the line. There's nothing fundamentally Xe2-specific here. I just have a Lunarlake laptop on my desk, Ken has a Battlemage card, and we had to pick _something_ as the first target. Co-authored-by: Kenneth Graunke <kenneth@whitecape.org> Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40835>
2025-11-27 17:57:25 -05:00
jay_foreach_block(f, block) {
u_sparse_bitset_free(&block->live_out);
u_sparse_bitset_init(&block->live_out, f->ssa_alloc, block);
jay_worklist_push_head(&worklist, block);
}
jay: introduce a physical control flow graph Consider: u0 = foo() if (divergent) { u0 = bar() r0 = baz(u0) } else { r0 = quux(u0) } Logically, this is fine, there is no interference between bar() and u0. But physically, both sides of the if execute so the bar() write to u0 overwrites the variable the else reads. So this is a miscompile. The solution is to model the extra edges in the physical control flow graph, which lives next to the existing logical control flow graph. Liveness for UGPRs now follows the physical CFG, while liveness for GPRs continues to follow the logical CFG. That models the interference properly, while still allowing phis to work as before (since phis writing UGPRs follow uniform bits of control flow that are necessarily critical edge free for the same reason the logical CFG is). Because our RA copies shuffled registers back at block ends (following Colombet), there's no issue with live range splits here (unlike aco which inserts phis for this case and then needs to worry about critical edges around those phis). There might still be an extremely-challenging-to-hit bug here with UGPR spilling which I need to think more about. It might be fine as-is? Not convinced though. But this is big enough and strictly less broken than what we have right now and the full solution will build on this, so here we are. Fixes artefating in SuperTuxKart and Celestia knows what else. Totals: Instrs: 2770938 -> 2771269 (+0.01%); split: -0.00%, +0.02% CodeSize: 40133712 -> 40138480 (+0.01%); split: -0.01%, +0.02% Totals from 158 (5.97% of 2647) affected shaders: Instrs: 514523 -> 514854 (+0.06%); split: -0.02%, +0.09% CodeSize: 7603040 -> 7607808 (+0.06%); split: -0.03%, +0.09% Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41215>
2026-04-22 10:53:46 -04:00
jay_foreach_inst_in_func(f, _, I) {
jay_foreach_dst_index(I, dst, index) {
if (jay_is_uniform(dst)) {
BITSET_SET(uniform, index);
}
}
}
intel: add Jay Jay is a new SSA-based compiler for Intel GPUs. This is an early work-in-progress. It isn't ready to ship, but we'd like to move development in tree rather than rebasing the world every week. Please don't bother testing yet - we know the status and we're working on it! Jay's design is similar to other modern NIR backends, particularly ACO, NAK and AGX. It is fully SSA, deconstructing phis after RA. We use a Colombet register allocator similar to NAK, allowing us to handle Intel's complex register regioning restrictions in a straightforward way. Spilling logical registers is straightforward with Braun-Hack. Thanks to the SSA-based design, the entire backend is essentially linear time, regardless of register pressure, addressing brw's excessive compile time when especially spilling with brw. In this current early draft, we support a limited subset of all three APIs on Xe2. A lot works and a lot doesn't. The core compiler is there (spilling, scoreboarding, SIMD32, etc should more or less work), but there are details to fill in for both performance and correctness. We essentially pass conformance on OpenGL ES 3.0 and OpenCL 3.0, and we're busy iterating on Vulkan. Likewise, additional hardware support will come down the line. There's nothing fundamentally Xe2-specific here. I just have a Lunarlake laptop on my desk, Ken has a Battlemage card, and we had to pick _something_ as the first target. Co-authored-by: Kenneth Graunke <kenneth@whitecape.org> Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40835>
2025-11-27 17:57:25 -05:00
while (!u_worklist_is_empty(&worklist)) {
/* Pop in reverse order since liveness is a backwards pass */
jay_block *block = jay_worklist_pop_head(&worklist);
/* Update its liveness information:
* 1. Assume everything liveout from this block was live_in
* 2. Clear live_in for anything defined in this block
*/
u_sparse_bitset_free(&block->live_in);
intel: add Jay Jay is a new SSA-based compiler for Intel GPUs. This is an early work-in-progress. It isn't ready to ship, but we'd like to move development in tree rather than rebasing the world every week. Please don't bother testing yet - we know the status and we're working on it! Jay's design is similar to other modern NIR backends, particularly ACO, NAK and AGX. It is fully SSA, deconstructing phis after RA. We use a Colombet register allocator similar to NAK, allowing us to handle Intel's complex register regioning restrictions in a straightforward way. Spilling logical registers is straightforward with Braun-Hack. Thanks to the SSA-based design, the entire backend is essentially linear time, regardless of register pressure, addressing brw's excessive compile time when especially spilling with brw. In this current early draft, we support a limited subset of all three APIs on Xe2. A lot works and a lot doesn't. The core compiler is there (spilling, scoreboarding, SIMD32, etc should more or less work), but there are details to fill in for both performance and correctness. We essentially pass conformance on OpenGL ES 3.0 and OpenCL 3.0, and we're busy iterating on Vulkan. Likewise, additional hardware support will come down the line. There's nothing fundamentally Xe2-specific here. I just have a Lunarlake laptop on my desk, Ken has a Battlemage card, and we had to pick _something_ as the first target. Co-authored-by: Kenneth Graunke <kenneth@whitecape.org> Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40835>
2025-11-27 17:57:25 -05:00
u_sparse_bitset_dup(&block->live_in, &block->live_out);
jay_foreach_inst_in_block_rev(block, inst) {
update_liveness_for_inst(f->dead_defs, &block->live_in, inst);
}
/* Propagate block->live_in[] to the live_out[] of predecessors. Since
* phis are split, they are handled naturally without special cases.
*
* The physical control flow graph is a subset of the logical control flow
* graph. So, edges that are in both can use the fast merge, and other
* edges are physical-only and need to merge only UGPRs.
intel: add Jay Jay is a new SSA-based compiler for Intel GPUs. This is an early work-in-progress. It isn't ready to ship, but we'd like to move development in tree rather than rebasing the world every week. Please don't bother testing yet - we know the status and we're working on it! Jay's design is similar to other modern NIR backends, particularly ACO, NAK and AGX. It is fully SSA, deconstructing phis after RA. We use a Colombet register allocator similar to NAK, allowing us to handle Intel's complex register regioning restrictions in a straightforward way. Spilling logical registers is straightforward with Braun-Hack. Thanks to the SSA-based design, the entire backend is essentially linear time, regardless of register pressure, addressing brw's excessive compile time when especially spilling with brw. In this current early draft, we support a limited subset of all three APIs on Xe2. A lot works and a lot doesn't. The core compiler is there (spilling, scoreboarding, SIMD32, etc should more or less work), but there are details to fill in for both performance and correctness. We essentially pass conformance on OpenGL ES 3.0 and OpenCL 3.0, and we're busy iterating on Vulkan. Likewise, additional hardware support will come down the line. There's nothing fundamentally Xe2-specific here. I just have a Lunarlake laptop on my desk, Ken has a Battlemage card, and we had to pick _something_ as the first target. Co-authored-by: Kenneth Graunke <kenneth@whitecape.org> Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40835>
2025-11-27 17:57:25 -05:00
*/
jay_foreach_predecessor(block, p, UGPR) {
bool progress = false;
jay: introduce a physical control flow graph Consider: u0 = foo() if (divergent) { u0 = bar() r0 = baz(u0) } else { r0 = quux(u0) } Logically, this is fine, there is no interference between bar() and u0. But physically, both sides of the if execute so the bar() write to u0 overwrites the variable the else reads. So this is a miscompile. The solution is to model the extra edges in the physical control flow graph, which lives next to the existing logical control flow graph. Liveness for UGPRs now follows the physical CFG, while liveness for GPRs continues to follow the logical CFG. That models the interference properly, while still allowing phis to work as before (since phis writing UGPRs follow uniform bits of control flow that are necessarily critical edge free for the same reason the logical CFG is). Because our RA copies shuffled registers back at block ends (following Colombet), there's no issue with live range splits here (unlike aco which inserts phis for this case and then needs to worry about critical edges around those phis). There might still be an extremely-challenging-to-hit bug here with UGPR spilling which I need to think more about. It might be fine as-is? Not convinced though. But this is big enough and strictly less broken than what we have right now and the full solution will build on this, so here we are. Fixes artefating in SuperTuxKart and Celestia knows what else. Totals: Instrs: 2770938 -> 2771269 (+0.01%); split: -0.00%, +0.02% CodeSize: 40133712 -> 40138480 (+0.01%); split: -0.01%, +0.02% Totals from 158 (5.97% of 2647) affected shaders: Instrs: 514523 -> 514854 (+0.06%); split: -0.02%, +0.09% CodeSize: 7603040 -> 7607808 (+0.06%); split: -0.03%, +0.09% Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41215>
2026-04-22 10:53:46 -04:00
if (jay_cfg_has_edge(*p, block, GPR)) {
progress = u_sparse_bitset_merge(&(*p)->live_out, &block->live_in);
} else {
jay: introduce a physical control flow graph Consider: u0 = foo() if (divergent) { u0 = bar() r0 = baz(u0) } else { r0 = quux(u0) } Logically, this is fine, there is no interference between bar() and u0. But physically, both sides of the if execute so the bar() write to u0 overwrites the variable the else reads. So this is a miscompile. The solution is to model the extra edges in the physical control flow graph, which lives next to the existing logical control flow graph. Liveness for UGPRs now follows the physical CFG, while liveness for GPRs continues to follow the logical CFG. That models the interference properly, while still allowing phis to work as before (since phis writing UGPRs follow uniform bits of control flow that are necessarily critical edge free for the same reason the logical CFG is). Because our RA copies shuffled registers back at block ends (following Colombet), there's no issue with live range splits here (unlike aco which inserts phis for this case and then needs to worry about critical edges around those phis). There might still be an extremely-challenging-to-hit bug here with UGPR spilling which I need to think more about. It might be fine as-is? Not convinced though. But this is big enough and strictly less broken than what we have right now and the full solution will build on this, so here we are. Fixes artefating in SuperTuxKart and Celestia knows what else. Totals: Instrs: 2770938 -> 2771269 (+0.01%); split: -0.00%, +0.02% CodeSize: 40133712 -> 40138480 (+0.01%); split: -0.01%, +0.02% Totals from 158 (5.97% of 2647) affected shaders: Instrs: 514523 -> 514854 (+0.06%); split: -0.02%, +0.09% CodeSize: 7603040 -> 7607808 (+0.06%); split: -0.03%, +0.09% Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41215>
2026-04-22 10:53:46 -04:00
U_SPARSE_BITSET_FOREACH_SET(&block->live_in, i) {
if (BITSET_TEST(uniform, i)) {
jay: introduce a physical control flow graph Consider: u0 = foo() if (divergent) { u0 = bar() r0 = baz(u0) } else { r0 = quux(u0) } Logically, this is fine, there is no interference between bar() and u0. But physically, both sides of the if execute so the bar() write to u0 overwrites the variable the else reads. So this is a miscompile. The solution is to model the extra edges in the physical control flow graph, which lives next to the existing logical control flow graph. Liveness for UGPRs now follows the physical CFG, while liveness for GPRs continues to follow the logical CFG. That models the interference properly, while still allowing phis to work as before (since phis writing UGPRs follow uniform bits of control flow that are necessarily critical edge free for the same reason the logical CFG is). Because our RA copies shuffled registers back at block ends (following Colombet), there's no issue with live range splits here (unlike aco which inserts phis for this case and then needs to worry about critical edges around those phis). There might still be an extremely-challenging-to-hit bug here with UGPR spilling which I need to think more about. It might be fine as-is? Not convinced though. But this is big enough and strictly less broken than what we have right now and the full solution will build on this, so here we are. Fixes artefating in SuperTuxKart and Celestia knows what else. Totals: Instrs: 2770938 -> 2771269 (+0.01%); split: -0.00%, +0.02% CodeSize: 40133712 -> 40138480 (+0.01%); split: -0.01%, +0.02% Totals from 158 (5.97% of 2647) affected shaders: Instrs: 514523 -> 514854 (+0.06%); split: -0.02%, +0.09% CodeSize: 7603040 -> 7607808 (+0.06%); split: -0.03%, +0.09% Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41215>
2026-04-22 10:53:46 -04:00
progress |= !u_sparse_bitset_test(&(*p)->live_out, i);
u_sparse_bitset_set(&(*p)->live_out, i);
}
}
}
jay: introduce a physical control flow graph Consider: u0 = foo() if (divergent) { u0 = bar() r0 = baz(u0) } else { r0 = quux(u0) } Logically, this is fine, there is no interference between bar() and u0. But physically, both sides of the if execute so the bar() write to u0 overwrites the variable the else reads. So this is a miscompile. The solution is to model the extra edges in the physical control flow graph, which lives next to the existing logical control flow graph. Liveness for UGPRs now follows the physical CFG, while liveness for GPRs continues to follow the logical CFG. That models the interference properly, while still allowing phis to work as before (since phis writing UGPRs follow uniform bits of control flow that are necessarily critical edge free for the same reason the logical CFG is). Because our RA copies shuffled registers back at block ends (following Colombet), there's no issue with live range splits here (unlike aco which inserts phis for this case and then needs to worry about critical edges around those phis). There might still be an extremely-challenging-to-hit bug here with UGPR spilling which I need to think more about. It might be fine as-is? Not convinced though. But this is big enough and strictly less broken than what we have right now and the full solution will build on this, so here we are. Fixes artefating in SuperTuxKart and Celestia knows what else. Totals: Instrs: 2770938 -> 2771269 (+0.01%); split: -0.00%, +0.02% CodeSize: 40133712 -> 40138480 (+0.01%); split: -0.01%, +0.02% Totals from 158 (5.97% of 2647) affected shaders: Instrs: 514523 -> 514854 (+0.06%); split: -0.02%, +0.09% CodeSize: 7603040 -> 7607808 (+0.06%); split: -0.03%, +0.09% Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41215>
2026-04-22 10:53:46 -04:00
if (progress) {
jay_worklist_push_tail(&worklist, *p);
intel: add Jay Jay is a new SSA-based compiler for Intel GPUs. This is an early work-in-progress. It isn't ready to ship, but we'd like to move development in tree rather than rebasing the world every week. Please don't bother testing yet - we know the status and we're working on it! Jay's design is similar to other modern NIR backends, particularly ACO, NAK and AGX. It is fully SSA, deconstructing phis after RA. We use a Colombet register allocator similar to NAK, allowing us to handle Intel's complex register regioning restrictions in a straightforward way. Spilling logical registers is straightforward with Braun-Hack. Thanks to the SSA-based design, the entire backend is essentially linear time, regardless of register pressure, addressing brw's excessive compile time when especially spilling with brw. In this current early draft, we support a limited subset of all three APIs on Xe2. A lot works and a lot doesn't. The core compiler is there (spilling, scoreboarding, SIMD32, etc should more or less work), but there are details to fill in for both performance and correctness. We essentially pass conformance on OpenGL ES 3.0 and OpenCL 3.0, and we're busy iterating on Vulkan. Likewise, additional hardware support will come down the line. There's nothing fundamentally Xe2-specific here. I just have a Lunarlake laptop on my desk, Ken has a Battlemage card, and we had to pick _something_ as the first target. Co-authored-by: Kenneth Graunke <kenneth@whitecape.org> Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40835>
2025-11-27 17:57:25 -05:00
}
}
}
#ifndef NDEBUG
jay_block *first_block = jay_first_block(f);
jay_block *last_block = list_last_entry(&f->blocks, jay_block, link);
assert(u_sparse_bitset_count(&first_block->live_in) == 0 && "invariant");
assert(u_sparse_bitset_count(&last_block->live_out) == 0 && "invariant");
#endif
u_worklist_fini(&worklist);
jay: introduce a physical control flow graph Consider: u0 = foo() if (divergent) { u0 = bar() r0 = baz(u0) } else { r0 = quux(u0) } Logically, this is fine, there is no interference between bar() and u0. But physically, both sides of the if execute so the bar() write to u0 overwrites the variable the else reads. So this is a miscompile. The solution is to model the extra edges in the physical control flow graph, which lives next to the existing logical control flow graph. Liveness for UGPRs now follows the physical CFG, while liveness for GPRs continues to follow the logical CFG. That models the interference properly, while still allowing phis to work as before (since phis writing UGPRs follow uniform bits of control flow that are necessarily critical edge free for the same reason the logical CFG is). Because our RA copies shuffled registers back at block ends (following Colombet), there's no issue with live range splits here (unlike aco which inserts phis for this case and then needs to worry about critical edges around those phis). There might still be an extremely-challenging-to-hit bug here with UGPR spilling which I need to think more about. It might be fine as-is? Not convinced though. But this is big enough and strictly less broken than what we have right now and the full solution will build on this, so here we are. Fixes artefating in SuperTuxKart and Celestia knows what else. Totals: Instrs: 2770938 -> 2771269 (+0.01%); split: -0.00%, +0.02% CodeSize: 40133712 -> 40138480 (+0.01%); split: -0.01%, +0.02% Totals from 158 (5.97% of 2647) affected shaders: Instrs: 514523 -> 514854 (+0.06%); split: -0.02%, +0.09% CodeSize: 7603040 -> 7607808 (+0.06%); split: -0.03%, +0.09% Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/41215>
2026-04-22 10:53:46 -04:00
free(uniform);
intel: add Jay Jay is a new SSA-based compiler for Intel GPUs. This is an early work-in-progress. It isn't ready to ship, but we'd like to move development in tree rather than rebasing the world every week. Please don't bother testing yet - we know the status and we're working on it! Jay's design is similar to other modern NIR backends, particularly ACO, NAK and AGX. It is fully SSA, deconstructing phis after RA. We use a Colombet register allocator similar to NAK, allowing us to handle Intel's complex register regioning restrictions in a straightforward way. Spilling logical registers is straightforward with Braun-Hack. Thanks to the SSA-based design, the entire backend is essentially linear time, regardless of register pressure, addressing brw's excessive compile time when especially spilling with brw. In this current early draft, we support a limited subset of all three APIs on Xe2. A lot works and a lot doesn't. The core compiler is there (spilling, scoreboarding, SIMD32, etc should more or less work), but there are details to fill in for both performance and correctness. We essentially pass conformance on OpenGL ES 3.0 and OpenCL 3.0, and we're busy iterating on Vulkan. Likewise, additional hardware support will come down the line. There's nothing fundamentally Xe2-specific here. I just have a Lunarlake laptop on my desk, Ken has a Battlemage card, and we had to pick _something_ as the first target. Co-authored-by: Kenneth Graunke <kenneth@whitecape.org> Signed-off-by: Alyssa Rosenzweig <alyssa.rosenzweig@intel.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/40835>
2025-11-27 17:57:25 -05:00
}
/*
* Calculate the register demand for each SSA file using the previously
* calculated liveness analysis. SSA makes this exact in linear-time.
*/
void
jay_calculate_register_demands(jay_function *func)
{
enum jay_file *files = calloc(func->ssa_alloc, sizeof(enum jay_file));
BITSET_WORD *killed = BITSET_CALLOC(func->ssa_alloc);
unsigned *max_demand = func->demand;
memset(max_demand, 0, sizeof(func->demand));
jay_foreach_inst_in_func(func, block, I) {
jay_foreach_dst_index(I, def, index) {
files[index] = def.file;
}
}
jay_foreach_block(func, block) {
unsigned demands[JAY_NUM_SSA_FILES] = {};
/* Everything live-in. */
U_SPARSE_BITSET_FOREACH_SET(&block->live_in, i) {
++demands[files[i]];
}
jay_foreach_ssa_file(f) {
max_demand[f] = MAX2(demands[f], max_demand[f]);
}
jay_foreach_inst_in_block(block, I) {
/* We must have enough register file space for the register payload */
if (I->op == JAY_OPCODE_PRELOAD) {
uint32_t max = jay_preload_reg(I) + jay_num_values(I->dst);
max_demand[I->dst.file] = MAX2(max_demand[I->dst.file], max);
}
/* Collect source values to kill */
jay_foreach_killed(I, s, c) {
BITSET_SET(killed, jay_channel(I->src[s], c));
}
/* Make destinations live */
jay_foreach_dst(I, d) {
demands[d.file] += util_next_power_of_two(jay_num_values(d));
}
/* Update maximum demands */
jay_foreach_ssa_file(f) {
max_demand[f] = MAX2(demands[f], max_demand[f]);
}
/* Dead destinations are those written by the instruction but killed
* immediately after the instruction finishes.
*/
jay_foreach_dst_index(I, d, index) {
if (BITSET_TEST(func->dead_defs, index)) {
assert(demands[d.file] > 0);
--demands[d.file];
}
}
jay_foreach_dst(I, d) {
unsigned n = jay_num_values(d);
demands[d.file] -= util_next_power_of_two(n) - n;
}
/* Late-kill sources */
jay_foreach_killed(I, s, c) {
uint32_t index = jay_channel(I->src[s], c);
if (BITSET_TEST(killed, index)) {
BITSET_CLEAR(killed, index);
assert(demands[I->src[s].file] > 0);
--demands[I->src[s].file];
}
}
if (jay_debug & JAY_DBG_PRINTDEMAND) {
printf("(LA) [G:%u\tU:%u] ", demands[GPR], demands[UGPR]);
jay_print_inst(stdout, I);
}
}
}
free(files);
free(killed);
}