From 4cb67cb07ae53dd47b261ce6a236899bf1cf7ea6 Mon Sep 17 00:00:00 2001 From: Kenneth Graunke Date: Wed, 2 Oct 2024 15:42:37 -0700 Subject: [PATCH] intel/brw: Use whole 512-bit registers in constant combining on Xe2 Xe2 increased the register size from 256 bits to 512 bits. So we can store 32 16-bit values in a register, rather than 16 values. Prior to this patch, we hadn't updated the pass, so the second half of each of our registers was unused. Backport-to: 24.2 Reviewed-by: Ian Romanick Part-of: --- .../compiler/brw_fs_combine_constants.cpp | 52 +++++++++++-------- 1 file changed, 30 insertions(+), 22 deletions(-) diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp index c2a8dcdbc05..02f4c4ad127 100644 --- a/src/intel/compiler/brw_fs_combine_constants.cpp +++ b/src/intel/compiler/brw_fs_combine_constants.cpp @@ -1144,31 +1144,35 @@ struct register_allocation { /** * Mask of currently available slots in this register. * - * Each register is 16, 16-bit slots. Allocations require 1, 2, or 4 slots - * for word, double-word, or quad-word values, respectively. + * Each register is 16 (32 on Xe2), 16-bit slots. Allocations require 1, + * 2, or 4 slots for word, double-word, or quad-word values, respectively. 
*/ - uint16_t avail; + uint32_t avail; }; static brw_reg -allocate_slots(struct register_allocation *regs, unsigned num_regs, +allocate_slots(const intel_device_info *devinfo, + struct register_allocation *regs, unsigned num_regs, unsigned bytes, unsigned align_bytes, brw::simple_allocator &alloc) { assert(bytes == 2 || bytes == 4 || bytes == 8); assert(align_bytes == 2 || align_bytes == 4 || align_bytes == 8); + const unsigned slots_per_reg = + REG_SIZE * reg_unit(devinfo) / sizeof(uint16_t); + const unsigned words = bytes / 2; const unsigned align_words = align_bytes / 2; - const uint16_t mask = (1U << words) - 1; + const uint32_t mask = (1U << words) - 1; for (unsigned i = 0; i < num_regs; i++) { - for (unsigned j = 0; j <= (16 - words); j += align_words) { - const uint16_t x = regs[i].avail >> j; + for (unsigned j = 0; j <= (slots_per_reg - words); j += align_words) { + const uint32_t x = regs[i].avail >> j; if ((x & mask) == mask) { if (regs[i].nr == UINT_MAX) - regs[i].nr = alloc.allocate(1); + regs[i].nr = alloc.allocate(reg_unit(devinfo)); regs[i].avail &= ~(mask << j); @@ -1184,16 +1188,17 @@ allocate_slots(struct register_allocation *regs, unsigned num_regs, } static void -deallocate_slots(struct register_allocation *regs, unsigned num_regs, +deallocate_slots(const struct intel_device_info *devinfo, + struct register_allocation *regs, unsigned num_regs, unsigned reg_nr, unsigned subreg_offset, unsigned bytes) { assert(bytes == 2 || bytes == 4 || bytes == 8); assert(subreg_offset % 2 == 0); - assert(subreg_offset + bytes <= 32); + assert(subreg_offset + bytes <= REG_SIZE * reg_unit(devinfo)); const unsigned words = bytes / 2; const unsigned offset = subreg_offset / 2; - const uint16_t mask = ((1U << words) - 1) << offset; + const uint32_t mask = ((1U << words) - 1) << offset; for (unsigned i = 0; i < num_regs; i++) { if (regs[i].nr == reg_nr) { @@ -1206,7 +1211,8 @@ deallocate_slots(struct register_allocation *regs, unsigned num_regs, } static void 
-parcel_out_registers(struct imm *imm, unsigned len, const bblock_t *cur_block, +parcel_out_registers(const intel_device_info *devinfo, + struct imm *imm, unsigned len, const bblock_t *cur_block, struct register_allocation *regs, unsigned num_regs, brw::simple_allocator &alloc) { @@ -1229,10 +1235,10 @@ parcel_out_registers(struct imm *imm, unsigned len, const bblock_t *cur_block, for (unsigned i = 0; i < len; i++) { if (imm[i].block == cur_block && imm[i].used_in_single_block == used_in_single_block) { - const brw_reg reg = allocate_slots(regs, num_regs, - imm[i].size, - get_alignment_for_imm(&imm[i]), - alloc); + const brw_reg reg = allocate_slots(devinfo, regs, num_regs, + imm[i].size, + get_alignment_for_imm(&imm[i]), + alloc); imm[i].nr = reg.nr; imm[i].subreg_offset = reg.offset; @@ -1242,8 +1248,8 @@ parcel_out_registers(struct imm *imm, unsigned len, const bblock_t *cur_block, for (unsigned i = 0; i < len; i++) { if (imm[i].block == cur_block && imm[i].used_in_single_block) { - deallocate_slots(regs, num_regs, imm[i].nr, imm[i].subreg_offset, - imm[i].size); + deallocate_slots(devinfo, regs, num_regs, imm[i].nr, + imm[i].subreg_offset, imm[i].size); } } } @@ -1470,14 +1476,16 @@ brw_fs_opt_combine_constants(fs_visitor &s) struct register_allocation *regs = (struct register_allocation *) calloc(table.len, sizeof(regs[0])); + const unsigned all_avail = devinfo->ver >= 20 ? 
0xffffffff : 0xffff; + for (int i = 0; i < table.len; i++) { regs[i].nr = UINT_MAX; - regs[i].avail = 0xffff; + regs[i].avail = all_avail; } foreach_block(block, s.cfg) { - parcel_out_registers(table.imm, table.len, block, regs, table.len, - s.alloc); + parcel_out_registers(devinfo, table.imm, table.len, block, regs, + table.len, s.alloc); } free(regs); @@ -1566,7 +1574,7 @@ brw_fs_opt_combine_constants(fs_visitor &s) struct brw_reg imm_reg = build_imm_reg_for_copy(imm); /* Ensure we have enough space in the register to copy the immediate */ - assert(reg.offset + brw_type_size_bytes(imm_reg.type) * width <= REG_SIZE); + assert(reg.offset + brw_type_size_bytes(imm_reg.type) * width <= REG_SIZE * reg_unit(devinfo)); ibld.MOV(retype(reg, imm_reg.type), imm_reg); }