diff --git a/src/intel/compiler/jay/jay_from_nir.c b/src/intel/compiler/jay/jay_from_nir.c index e325918be7d..b908e41a518 100644 --- a/src/intel/compiler/jay/jay_from_nir.c +++ b/src/intel/compiler/jay/jay_from_nir.c @@ -2522,11 +2522,6 @@ setup_vertex_payload(struct nir_to_jay_state *nj, struct payload_builder *p) { nj->payload.urb_handle = read_payload(p, GPR); - /* XXX: This is a hack to line up with the partition chosen in RA. This whole - * thing needs an overhaul. Need to think harder about partitioning. - */ - p->offsets[GPR] += 7; - setup_payload_dispatch_start(nj, p); setup_payload_push(nj, p); @@ -2605,22 +2600,52 @@ setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p) fs->bary[i] = read_vector_payload(p, GPR, 2); } - if (nj->s->prog_data->fs.uses_src_depth) { - fs->coord.z = read_payload(p, GPR); + struct { + bool cond; + jay_def *def; + } split_gprs[] = { + { nj->s->prog_data->fs.uses_src_depth, &fs->coord.z }, + { nj->s->prog_data->fs.uses_src_w, &fs->coord.w }, + { nj->s->prog_data->fs.uses_sample_mask, &fs->coverage_mask }, + }; + + unsigned extra_gpr = + split_gprs[0].cond + split_gprs[1].cond + split_gprs[2].cond; + bool odd = extra_gpr & 1; + + for (unsigned i = 0; i < ARRAY_SIZE(split_gprs); ++i) { + if (split_gprs[i].cond) { + extra_gpr -= 1; + + /* Pad out to GPR alignment by reading the last split GPR as two UGPR + * halves and zipping them together below. This lets us construct a + * valid partition with minimal copying. 
+ */ + if (extra_gpr == 0 && jay_grf_per_gpr(nj->s) == 2 && odd) { + *split_gprs[i].def = + read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s)); + } else { + *split_gprs[i].def = read_payload(p, GPR); + } + } } - if (nj->s->prog_data->fs.uses_src_w) { - fs->coord.w = read_payload(p, GPR); - } - - if (nj->s->prog_data->fs.uses_sample_mask) { - fs->coverage_mask = read_payload(p, GPR); - } + assert(extra_gpr == 0); if (nj->s->prog_data->fs.uses_pos_offset) { fs->sample_pos = read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s)); } + nj->s->payload_ugprs = p->offsets[UGPR]; + + jay_def split[3] = { jay_null() }; + for (unsigned i = 0; i < ARRAY_SIZE(split_gprs); ++i) { + if (!jay_is_null(*split_gprs[i].def) && + (*split_gprs[i].def).file == UGPR) { + split[i] = read_vector_payload(p, UGPR, jay_ugpr_per_grf(nj->s)); + } + } + setup_payload_dispatch_start(nj, p); setup_payload_push(nj, p); @@ -2639,6 +2664,13 @@ setup_fragment_payload(struct nir_to_jay_state *nj, struct payload_builder *p) } } + for (unsigned i = 0; i < ARRAY_SIZE(split_gprs); ++i) { + if (!jay_is_null(split[i]) && split_gprs[i].def->file == UGPR) { + *(split_gprs[i].def) = + jay_ZIP_UGPR16_u32(&nj->bld, *split_gprs[i].def, split[i]); + } + } + if (nj->s->prog_data->fs.uses_src_xy) { jay_def t = jay_alloc_def(&nj->bld, GPR, 1); jay_def lo = jay_extract_range(nj->payload.u0, 10, 4); @@ -2675,7 +2707,6 @@ jay_insert_payload_swizzle(jay_shader *s) jay_builder b = jay_init_builder(func, jay_before_function(func)); unsigned size = s->payload_gprs; - assert(s->partition.blocks[GPR][0].start == 1); /* Odd: copy both halves to contiguous pair after payload */ for (unsigned i = 0; i < (size / 2); ++i) { @@ -2936,6 +2967,8 @@ jay_compile(const struct intel_device_info *devinfo, if (debug) { fprintf(stdout, "Jay shader (post-RA):\n\n"); jay_print(stdout, s); + + jay_print_partition(&s->partition); } struct jay_shader_bin *bin = diff --git a/src/intel/compiler/jay/jay_ir.h b/src/intel/compiler/jay/jay_ir.h index 
18f3e411780..1e92e2ec6bc 100644 --- a/src/intel/compiler/jay/jay_ir.h +++ b/src/intel/compiler/jay/jay_ir.h @@ -745,35 +745,66 @@ jay_stride_to_bits(enum jay_stride s) return 16 << s; } -#define JAY_PARTITION_BLOCKS (3) +#define jay_foreach_ra_file(file) \ + for (enum jay_file file = 0; file < JAY_NUM_RA_FILES; ++file) + +#define JAY_PARTITION_BLOCKS (6) + +enum jay_block_type { + JAY_BLOCK_NORMAL, + + /** A block suitable for EOT sends */ + JAY_BLOCK_EOT, + + /** A block reserved for post-RA spill lowering */ + JAY_BLOCK_SPILL, + + JAY_BLOCK_TYPES, +}; struct jay_register_block { - uint16_t start, len; + /** First GRF mapped by this block */ + uint16_t start_grf; + + /** First GPR/UGPR mapped by this block */ + uint16_t start_gpr; + + /** Length of this block in GPRs/UGPRs */ + uint16_t len_gpr; + + /** For GPR blocks, stride of GPRs in this block. */ + enum jay_stride stride; + + /** Special feature of the block */ + enum jay_block_type type:2; }; +static_assert(sizeof(struct jay_register_block) == 8, "packed"); struct jay_partition { - /** Consecutive ranges of GRFs in GPR/UGPRs. */ - struct jay_register_block blocks[JAY_NUM_GRF_FILES][JAY_PARTITION_BLOCKS]; + struct jay_register_block blocks[JAY_NUM_RA_FILES][JAY_PARTITION_BLOCKS]; + unsigned nr_blocks[JAY_NUM_RA_FILES]; /** Number of GPR/UGPRs per GRF, times 16. For example, 16 encodes SIMD16 - * 32-bit GPRs on Xe2 (1 GRF = 1 GPR). 256 encodes UGPRs (1 GRF = 16 UGPRs). - * 8 encodes SIMD32 32-bit GPRs on Xe2 (2 GRF = 1 GPR). + * 32-bit GPRs on Xe2 (1 GRF = 1 GPR). 256 encodes UGPRs (1 GRF = 16 + * UGPRs). 8 encodes SIMD32 32-bit GPRs on Xe2 (2 GRF = 1 GPR). */ - unsigned units_x16[JAY_NUM_GRF_FILES]; - - /** Base GPR for each stride. The file is partitioned (4, 8, 2, 4=EOT). 
*/ - unsigned base8, base2, base_eot; - - /** Region of the UGPR partition suitable for large UGPR vectors */ - struct jay_register_block large_ugpr_block; + unsigned units_x16[JAY_NUM_RA_FILES]; }; -static inline enum jay_stride -jay_gpr_to_stride(const struct jay_partition *p, unsigned reg) +static inline struct jay_register_block +jay_lookup_block(const struct jay_partition *p, + unsigned reg, + enum jay_file file) { - return (reg < p->base8 || reg >= p->base_eot) ? JAY_STRIDE_4 : - reg >= p->base2 ? JAY_STRIDE_2 : - JAY_STRIDE_8; + for (unsigned i = 0; i < p->nr_blocks[file]; ++i) { + struct jay_register_block B = p->blocks[file][i]; + + if (reg >= B.start_gpr && reg < B.start_gpr + B.len_gpr) { + return B; + } + } + + UNREACHABLE("invalid reg"); } /** @@ -786,7 +817,7 @@ typedef struct jay_shader { union brw_any_prog_data *prog_data; unsigned spills, fills; unsigned scratch_size; - unsigned payload_gprs, push_grfs; + unsigned payload_gprs, payload_ugprs, push_grfs; /** * Ralloc linear context. Since we don't typically free as we go, @@ -1051,7 +1082,7 @@ static inline enum jay_stride jay_def_stride(const jay_shader *shader, jay_def x) { assert(x.file == GPR); - return jay_gpr_to_stride(&shader->partition, x.reg); + return jay_lookup_block(&shader->partition, x.reg, GPR).stride; } /* Represents an allocated register number with file in the top 3 bits. 
*/ diff --git a/src/intel/compiler/jay/jay_lower_spill.c b/src/intel/compiler/jay/jay_lower_spill.c index 9bd265dcaf0..56762bf1a72 100644 --- a/src/intel/compiler/jay/jay_lower_spill.c +++ b/src/intel/compiler/jay/jay_lower_spill.c @@ -51,9 +51,26 @@ jay_lower_spill(jay_function *func) { jay_builder b = jay_init_builder(func, jay_before_function(func)); - /* We reserve the top UGPRs for spilling by ABI */ - unsigned ugpr_reservation = func->shader->num_regs[UGPR]; - assert(util_is_aligned(ugpr_reservation, func->shader->dispatch_width)); + /* We reserved a block of UGPRs for our use */ + signed ugpr_reservation = -1, gpr2 = -1; + for (unsigned i = 0; i < func->shader->partition.nr_blocks[GPR]; ++i) { + struct jay_register_block B = func->shader->partition.blocks[GPR][i]; + + if (B.stride == JAY_STRIDE_2) { + gpr2 = B.start_gpr; + } + } + + for (unsigned i = 0; i < func->shader->partition.nr_blocks[UGPR]; ++i) { + struct jay_register_block B = func->shader->partition.blocks[UGPR][i]; + + if (B.type == JAY_BLOCK_SPILL) { + ugpr_reservation = B.start_gpr; + } + } + + assert(ugpr_reservation >= 0 && "must have reserved something"); + assert(gpr2 >= 0 && "must have a stride-2 gpr"); jay_def sp = jay_bare_reg(UGPR, ugpr_reservation); sp.num_values_m1 = func->shader->dispatch_width - 1; @@ -80,7 +97,7 @@ jay_lower_spill(jay_function *func) jay_SHR(&b, JAY_TYPE_U32, ADDRESS_REG, tmpu, 4); /* We use a 32-bit strided stack: SP = scratch + (lane ID * 4) */ - jay_def tmp2 = jay_bare_reg(GPR, func->shader->partition.base2); + jay_def tmp2 = jay_bare_reg(GPR, gpr2); jay_LANE_ID_8(&b, tmp2); for (unsigned i = 8; i < b.shader->dispatch_width; i *= 2) { jay_LANE_ID_EXPAND(&b, tmp2, tmp2, i); diff --git a/src/intel/compiler/jay/jay_partition.c b/src/intel/compiler/jay/jay_partition.c new file mode 100644 index 00000000000..6d489f42170 --- /dev/null +++ b/src/intel/compiler/jay/jay_partition.c @@ -0,0 +1,239 @@ +/* + * Copyright 2026 Intel Corporation + * SPDX-License-Identifier: MIT 
+ */ + +#include "jay_ir.h" +#include "jay_private.h" + +/* + * jay_partition_grf partitions the register file for the entire shader, + * satisfying functional and performance rules. The partition is specified in a + * convenient form within this file, as a flat array of jay_partition_builder + * structs, which build_partition translates to the more complicated + * jay_partition structs. + * + * All functions must share the same partition for correctness with non-uniform + * function calls. For unlinked library functions, we must use the ABI + * partition (TODO). + */ +struct jay_partition_builder { + enum jay_file file; + enum jay_stride stride; + signed len_grf; + enum jay_block_type type; +}; + +static void +build_partition(jay_shader *shader, struct jay_partition_builder *b, unsigned n) +{ + unsigned base_grf = 0, base_gpr[JAY_NUM_RA_FILES] = { 0 }; + struct jay_partition *p = &shader->partition; + + *p = (struct jay_partition) { + .units_x16[UGPR] = jay_ugpr_per_grf(shader) * 16, + .units_x16[GPR] = 16 / jay_grf_per_gpr(shader), + .units_x16[MEM] = 16 / jay_grf_per_gpr(shader), + }; + + for (unsigned i = 0; i < n; ++i) { + if (b[i].len_grf) { + enum jay_file file = b[i].file; + unsigned len_gpr = (b[i].len_grf * p->units_x16[file]) / 16; + bool grf = file < JAY_NUM_GRF_FILES; + assert(p->nr_blocks[file] < JAY_PARTITION_BLOCKS); + + p->blocks[file][p->nr_blocks[file]++] = (struct jay_register_block) { + .start_grf = grf ? 
base_grf : 0, + .start_gpr = base_gpr[file], + .len_gpr = len_gpr, + .stride = b[i].stride, + .type = b[i].type, + }; + + /* Every file numbers its own registers; only GRF files consume GRFs */ + base_gpr[file] += len_gpr; + if (grf) + base_grf += b[i].len_grf; + } + } + + /* Validate the well formedness of the partition we built above */ + BITSET_DECLARE(regs, JAY_NUM_PHYS_GRF) = { 0 }; + + for (enum jay_file file = 0; file < JAY_NUM_GRF_FILES; ++file) { + for (unsigned blk = 0; blk < p->nr_blocks[file]; ++blk) { + struct jay_register_block B = p->blocks[file][blk]; + unsigned len_grf = (B.len_gpr * 16) / p->units_x16[file]; + + assert(len_grf > 0 && "no empty partitions"); + assert(B.start_grf + len_grf <= JAY_NUM_PHYS_GRF && "GRF file size"); + assert(!BITSET_TEST_COUNT(regs, B.start_grf, len_grf) && "uniqueness"); + + BITSET_SET_COUNT(regs, B.start_grf, len_grf); + } + } + + assert(BITSET_COUNT(regs) == JAY_NUM_PHYS_GRF && "all GRFs mapped"); +} + +void +jay_partition_grf(jay_shader *shader) +{ + /* Calculate the maximum register demand across all functions in the shader. + * We will use this to choose a good partition. + */ + unsigned demand[JAY_NUM_GRF_FILES] = { 0 }; + + jay_foreach_function(shader, f) { + jay_compute_liveness(f); + jay_calculate_register_demands(f); + + demand[GPR] = MAX2(demand[GPR], f->demand[GPR]); + demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]); + } + + /* We must have enough register file space for the register payload, plus the + * reserved UGPRs in the case we spill. That UGPR interferes with everything + * we preload so it needs to be reserved specially here for the worst case. + */ + jay_foreach_preload(jay_shader_get_entrypoint(shader), I) { + unsigned end = jay_preload_reg(I) + jay_num_values(I->dst); + unsigned extra = I->dst.file == UGPR ? 
shader->dispatch_width : 0; + assert(I->dst.file < JAY_NUM_GRF_FILES); + demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra); + } + + /* Determine a good GPR/UGPR split informed by the demand calculation */ + unsigned grf_per_gpr = jay_grf_per_gpr(shader); + unsigned ugpr_per_grf = jay_ugpr_per_grf(shader); + unsigned uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf); + + /* We must have enough for SIMD1 images (TODO: Check if this actually + * applies. Or if we could eliminate this with smarter partitioning even.) + */ + unsigned min_ugprs = 16; + min_ugprs = MAX2(min_ugprs, 256); + + /* TODO: We could partition more cleverly */ + uniform_grfs = align(uniform_grfs, grf_per_gpr); + uniform_grfs = CLAMP(uniform_grfs, DIV_ROUND_UP(min_ugprs, ugpr_per_grf), + 128 - (32 * grf_per_gpr)); + unsigned nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs; + + /* Check the split */ + assert((uniform_grfs * ugpr_per_grf) >= min_ugprs); + assert(nonuniform_grfs >= 32 * grf_per_gpr); + assert((uniform_grfs + nonuniform_grfs) == JAY_NUM_PHYS_GRF); + + /* Set the targets for the virtual register file accordingly */ + shader->num_regs[GPR] = nonuniform_grfs / grf_per_gpr; + shader->num_regs[UGPR] = uniform_grfs * ugpr_per_grf; + + unsigned spill_reservation = 0, mem_slots = 0; + + /* Spilling requires reserving UGPRs for the lowered SENDs */ + if (demand[GPR] > jay_gpr_limit(shader)) { + spill_reservation = shader->dispatch_width / ugpr_per_grf; + + /* This should be an acceptable upper limit since we assign memory tightly + * thanks to the usual SSA allocator guarantees. 
+ */ + mem_slots = demand[GPR] * grf_per_gpr; + shader->num_regs[MEM] = demand[GPR]; + } + + unsigned payload_4[2] = { 0, 0 }, payload_u[2] = { grf_per_gpr, 0 }; + unsigned eot_u = 0, eot_4 = 0; + + if (shader->stage == MESA_SHADER_VERTEX) { + payload_4[0] = 1; + payload_4[1] = shader->prog_data->vue.urb_read_length * 8; + payload_u[1] = shader->push_grfs; + eot_4 = 16; + } else if (shader->stage == MESA_SHADER_FRAGMENT) { + /* The SIMD32 fragment payload splits GPRs into low and high GRFs, with + * UGPRs mixed in between. jay_insert_payload_swizzle deals with this and + * swizzles things appropriately, we just need the partition to have two + * separate GPR block with a UGPR block in between. That requires the + * number of GPRs in the payload to be even. + */ + assert(util_is_aligned(shader->payload_gprs, grf_per_gpr) && + "payload constraint"); + + payload_4[0] = shader->payload_gprs; + payload_u[1] = (shader->payload_ugprs / ugpr_per_grf) - payload_u[0]; + payload_4[1] = grf_per_gpr == 2 ? 
shader->payload_gprs : 0; + eot_4 = 14; + eot_u = 1; + } else { + eot_u = 1; + } + + unsigned special_u = payload_u[0] + payload_u[1] + spill_reservation + eot_u; + unsigned special_4 = payload_4[0] + payload_4[1] + eot_4; + + /* TODO: Make the stride partition smarter */ + unsigned grf_8 = 8 * grf_per_gpr; + unsigned grf_2 = 8; + + struct jay_partition_builder blocks[] = { + /* Stage-specific payload */ + { UGPR, 0, payload_u[0] }, + { GPR, JAY_STRIDE_4, payload_4[0] }, + { UGPR, 0, payload_u[1] }, + { GPR, JAY_STRIDE_4, payload_4[1] }, + + /* General registers */ + { UGPR, 0, uniform_grfs - special_u }, + { GPR, JAY_STRIDE_4, nonuniform_grfs - (special_4 + grf_8 + grf_2) }, + { GPR, JAY_STRIDE_8, grf_8 }, + { GPR, JAY_STRIDE_2, grf_2 }, + + /* Spilling registers */ + { UGPR, 0, spill_reservation, JAY_BLOCK_SPILL }, + { MEM, JAY_STRIDE_4, mem_slots }, + + /* EOT */ + { UGPR, 0, eot_u, JAY_BLOCK_EOT }, + { GPR, JAY_STRIDE_4, eot_4, JAY_BLOCK_EOT }, + }; + + build_partition(shader, blocks, ARRAY_SIZE(blocks)); + + /* By construction of our partition, the entire GRF is used. */ + shader->prog_data->base.grf_used = JAY_NUM_PHYS_GRF; +} + +#define ANSI_END "\033[0m" +#define ANSI_BOLD "\033[1m" +#define ANSI_ITALIC "\033[3m" + +void +jay_print_partition(struct jay_partition *p) +{ + jay_foreach_ra_file(file) { + if (p->nr_blocks[file]) { + const char *files[JAY_NUM_RA_FILES] = { "GPR", "UGPR", "MEM" }; + printf("%s" ANSI_BOLD " GRF %s%s" ANSI_END "\n", + file ? "\n" : "", files[file], file == GPR ? 
" Stride" : ""); + } + + for (unsigned b = 0; b < p->nr_blocks[file]; ++b) { + struct jay_register_block B = p->blocks[file][b]; + unsigned len_grf = (B.len_gpr * 16) / p->units_x16[file]; + + printf(" %3u…%-3u %3u…%-3u", B.start_grf, B.start_grf + len_grf - 1, + B.start_gpr, B.start_gpr + B.len_gpr - 1); + + if (file == GPR) { + printf(" %u-bit", jay_stride_to_bits(B.stride)); + } + + const char *types[JAY_BLOCK_TYPES] = { "", " EOT", " Spill" }; + printf(ANSI_ITALIC "%s" ANSI_END "\n", types[B.type]); + } + } + + printf("\n"); +} diff --git a/src/intel/compiler/jay/jay_private.h b/src/intel/compiler/jay/jay_private.h index 78276fe78d8..1ce8638346e 100644 --- a/src/intel/compiler/jay/jay_private.h +++ b/src/intel/compiler/jay/jay_private.h @@ -39,6 +39,7 @@ void jay_calculate_register_demands(jay_function *f); void jay_spill(jay_function *func, unsigned limit); void jay_partition_grf(jay_shader *shader); +void jay_print_partition(struct jay_partition *p); void jay_register_allocate(jay_shader *s); void jay_assign_flags(jay_shader *s); void jay_assign_accumulators(jay_shader *s); @@ -86,6 +87,16 @@ struct jay_shader_bin *jay_to_binary(jay_shader *s, size_t const_data_size, bool debug); +static inline unsigned +jay_gpr_limit(jay_shader *shader) +{ + /* If testing spilling, set limit tightly. */ + bool test = (jay_debug & JAY_DBG_SPILL); + test &= shader->stage != MESA_SHADER_VERTEX; + + return test ? 13 : shader->num_regs[GPR]; +} + #ifdef __cplusplus } /* extern C */ #endif diff --git a/src/intel/compiler/jay/jay_register_allocate.c b/src/intel/compiler/jay/jay_register_allocate.c index b3b85dfbca4..dd04344c77c 100644 --- a/src/intel/compiler/jay/jay_register_allocate.c +++ b/src/intel/compiler/jay/jay_register_allocate.c @@ -15,7 +15,6 @@ #include "jay_ir.h" #include "jay_opcodes.h" #include "jay_private.h" -#include "shader_enums.h" /** * Register allocation for Jay shaders. @@ -39,9 +38,6 @@ * Finally, we deconstruct SSA. 
*/ -#define jay_foreach_ra_file(file) \ - for (enum jay_file file = 0; file < JAY_NUM_RA_FILES; ++file) - #define jay_foreach_ra_src(I, s) \ jay_foreach_src(I, s) \ if (I->src[s].file < JAY_NUM_RA_FILES && !jay_is_null(I->src[s])) @@ -186,7 +182,11 @@ struct affinity { */ bool eot:1; - /** If true, this UGPR needs full GRF alignment */ + /** + * If align is nonzero, this SSA def should be assigned to a register of the + * form (k * align) + align_offs for some integer k. In other words, align is + * the alignment of the whole vector and align_offs is this def's channel. + */ unsigned align :5; unsigned align_offs:4; unsigned nr :4; @@ -283,15 +283,16 @@ def_from_reg(jay_reg r) return jay_bare_reg(r_file(r), r_reg(r)); } +struct jay_roundrobin { + unsigned block, gpr; +}; + typedef struct jay_ra_state { /** Size of each register file */ unsigned num_regs[JAY_NUM_RA_FILES]; - /** Counter for roundrobin register allocation */ - unsigned roundrobin[JAY_NUM_RA_FILES]; - - /** First GPR that may be used for EOT sends */ - unsigned eot_offs; + /** Partition-aware counters for roundrobin register allocation */ + struct jay_roundrobin roundrobin[JAY_NUM_RA_FILES][JAY_NUM_STRIDES]; /** Phi coalescing data structure */ struct phi_web_node *phi_web; @@ -691,11 +692,12 @@ try_find_free_reg(jay_ra_state *ra, unsigned except, bool stride4) { + struct jay_partition *p = &ra->b.shader->partition; + unsigned i; BITSET_FOREACH_SET(i, ra->available_regs[file], ra->num_regs[file]) { if (i != except && - (!stride4 || - jay_gpr_to_stride(&ra->b.shader->partition, i) == JAY_STRIDE_4)) { + (!stride4 || jay_lookup_block(p, i, GPR).stride == JAY_STRIDE_4)) { return make_reg(file, i); } } @@ -732,6 +734,124 @@ find_temp_regs(jay_ra_state *ra) }; } +static void +pick_regs_from_block(jay_ra_state *ra, + enum jay_file file, + unsigned size, + unsigned alignment, + jay_inst *I, + jay_def var, + bool is_src, + struct jay_register_block block, + unsigned block_cost, + struct affinity affinity, + 
unsigned *best_cost, + unsigned *best_reg, + unsigned first) +{ + bool must_tie = I->op == JAY_OPCODE_LANE_ID_EXPAND; + must_tie &= !is_src; + + /* Cross-lane access cannot be SIMD split if the source/destination registers + * overlap, but as long as we don't tie those destinations, we're ok. + */ + bool may_tie = !jay_is_shuffle_like(I); + + first = align(first, alignment); + for (unsigned i = first; i + size <= block.len_gpr; i += alignment) { + unsigned r = block.start_gpr + i; + + unsigned cost = block_cost; + bool tied = !is_src && BITSET_TEST(ra->killed[file], r); + + if (tied ? !may_tie : + (must_tie || BITSET_TEST_COUNT(ra->pinned[file], r, size))) + continue; + + /* Try to tie predicated default values, otherwise post-RA lowering needs + * to insert a predicated-MOV or SEL. + */ + if (I->predication == JAY_PREDICATED_DEFAULT && !is_src) + cost += jay_inst_get_default(I)->reg != r; + + /* If there are stricter alignment requirements later, model the cost of + * inserting copies for that. + */ + if (affinity.align && + (i < affinity.align_offs || + !util_is_aligned(i - affinity.align_offs, affinity.align))) + cost += size; + + if (affinity.repr == jay_channel(var, 0)) { + /* If we are the collect representative but the final collect won't + * actually be usable, the whole vector will need to be copied. + */ + if (i < affinity.offset || !util_is_aligned(i - affinity.offset, 4)) { + cost += affinity.nr; + } + } else if (affinity.repr) { + /* If we are used for a collect but not in the right place, we will + * similarly insert copies. 
+ */ + if (ra->reg_for_index[affinity.repr] != NO_REG && + r_reg(ra->reg_for_index[affinity.repr]) != r - affinity.offset) { + + cost++; + } + } + + for (unsigned c = 0; c < size; ++c) { + unsigned j = r + c; + + /* If the register is unavailable, account for the cost of shuffling */ + if (!BITSET_TEST(ra->available_regs[file], j) && !tied) { + bool live_out = u_sparse_bitset_test(&ra->block->live_out, + ra->index_for_reg[file][j]); + cost += 1 + live_out; + } + + /* Model the cost of shuffling for phis */ + if (c < jay_num_values(var)) { + struct phi_web_node *phi_web = + &ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, c))]; + if (phi_web->reg != NO_REG && r_reg(phi_web->reg) != j) { + cost += 2; + } + } + + /* Choosing this register will pin it, leaving it unavailable to later + * smaller sources which will need a move. + */ + cost += BITSET_TEST(ra->sources[file], j); + } + + if (cost < *best_cost) { + *best_cost = cost; + *best_reg = r; + + /* If we find something with 0 cost, we are guaranteed to pick this + * register, so terminate early. This speeds up the search. + */ + if (cost == 0) { + return; + } + } + } +} + +static bool +is_block_compatible(struct jay_register_block block, + enum jay_file file, + enum jay_stride min_stride, + enum jay_stride max_stride, + bool eot) +{ + return block.type != JAY_BLOCK_SPILL && + (file != GPR || + (min_stride <= block.stride && block.stride <= max_stride)) && + (!eot || block.type == JAY_BLOCK_EOT); +} + static unsigned pick_regs(jay_ra_state *ra, enum jay_file file, @@ -744,38 +864,17 @@ pick_regs(jay_ra_state *ra, bool is_src) { struct jay_partition *partition = &ra->b.shader->partition; - unsigned first = 0, end = ra->num_regs[file]; - bool must_tie = I->op == JAY_OPCODE_LANE_ID_EXPAND; - must_tie &= !is_src; - - /* Cross-lane access cannot be SIMD split if the source/destination registers - * overlap, but as long as we don't tie those destinations, we're ok. 
- */ - bool may_tie = !jay_is_shuffle_like(I); - - /* Ensure we do not cross partitions */ - if (file == UGPR && size > 16) { - first = partition->large_ugpr_block.start; - end = partition->large_ugpr_block.start + partition->large_ugpr_block.len; - } else if (file == GPR && size > 1 && ra->b.shader->payload_gprs < 8) { - first = align(ra->b.shader->payload_gprs, MAX2(size, alignment)); - } - - /* Sources used by end-of-thread sends must be at the end of the file */ - if (I->op == JAY_OPCODE_SEND && jay_send_eot(I)) { - first = ra->eot_offs; - } + bool eot = I->op == JAY_OPCODE_SEND && jay_send_eot(I); /* If possible, keep sources in place to avoid shuffles. */ if (is_src && jay_channel(var, 0) != 0) { unsigned cur = r_reg(ra->reg_for_index[jay_channel(var, 0)]); - enum jay_stride stride = jay_gpr_to_stride(partition, cur); + struct jay_register_block block = jay_lookup_block(partition, cur, file); if (!BITSET_TEST_COUNT(ra->pinned[file], cur, size) && - util_is_aligned(cur, alignment) && - cur >= first && - cur + size <= end && - (file != GPR || (min_stride <= stride && stride <= max_stride))) { + util_is_aligned(cur - block.start_gpr, alignment) && + is_block_compatible(block, file, min_stride, max_stride, eot) && + cur + size <= (block.start_gpr + block.len_gpr)) { return cur; } } @@ -786,120 +885,74 @@ pick_regs(jay_ra_state *ra, ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, 0))].affinity; assert(alignment >= size && "alignment must be a multiple of size"); - unsigned nr = DIV_ROUND_UP((end + 1 - size - first), alignment); - unsigned roundrobin = (ra->roundrobin[file]) % nr; - unsigned rr_al = roundrobin * alignment, nr_al = nr * alignment; - /* Heuristic: Advance the roundrobin by a whole vector if we are the - * representative. This leaves us registers for the rest of the vector. + /* We select registers roundrobin. This has several benefits: + * + * 1. 
Easier coalescing since we are less likely statistically to allocate + * a register that a future instruction has an affinity. + * + * 2. More freedom for post-RA scheduling thanks to fewer dependencies. + * + * 3. Less stalling due to SWSB annotations from register reuse. */ - ra->roundrobin[file] += - affinity.repr == jay_channel(var, 0) ? MAX2(size, affinity.nr) : size; + enum jay_stride stride = file == GPR ? min_stride : 0; + struct jay_roundrobin *rr = &ra->roundrobin[file][stride]; + unsigned nr_blocks = partition->nr_blocks[file]; - for (unsigned i = rr_al; i < rr_al + nr_al; i += alignment) { - /* We select registers roundrobin. This has several benefits: - * - * 1. Easier coalescing since we are less likely statistically to allocate - * a register that a future instruction has an affinity. - * - * 2. More freedom for post-RA scheduling thanks to fewer dependencies. - * - * 3. Less stalling due to SWSB annotations from register reuse. - */ - unsigned r = first + (i >= nr_al ? (i - nr_al) : i); - assert(r >= first && r + size <= end); + /* Make sure we use the optimal stride for roundrobin RA */ + if (file == GPR) { + while (partition->blocks[GPR][rr->block].stride != stride) { + rr->block = (rr->block + 1 == nr_blocks) ? 0 : rr->block + 1; + } + } - unsigned cost = 0; - bool tied = !is_src && BITSET_TEST(ra->killed[file], r); - enum jay_stride stride = - file == GPR ? jay_gpr_to_stride(partition, r) : min_stride; + unsigned last_b_ = rr->block + nr_blocks; + for (unsigned b_ = rr->block; b_ <= last_b_ && best_cost > 0; ++b_) { + unsigned b = b_ >= nr_blocks ? (b_ - nr_blocks) : b_; + assert(b < nr_blocks); - if ((tied ? !may_tie : - (must_tie || BITSET_TEST_COUNT(ra->pinned[file], r, size))) || - !(min_stride <= stride && stride <= max_stride)) - continue; + struct jay_register_block block = partition->blocks[file][b]; - /* Try to tie predicated default values, otherwise post-RA lowering needs - * to insert a predicated-MOV or SEL. 
- */ - if (I->predication == JAY_PREDICATED_DEFAULT && !is_src) - cost += jay_inst_get_default(I)->reg != r; + if (is_block_compatible(block, file, min_stride, max_stride, eot)) { + unsigned r = b_ == rr->block ? rr->gpr : 0; - /* Assigning a stride that is too big may result in SIMDness splitting. - * Model that cost so we prefer packed registers. - */ - cost += stride - min_stride; - - /* If we are used for end-of-thread and it is not in the appropriate - * register, we will need to insert 1 copy per channel at the end. - */ - if (affinity.eot && r < ra->eot_offs) - cost += size; - - /* If there are stricter alignment requirements later, model the cost of - * inserting copies for that. - */ - if (affinity.align && - !util_is_aligned(r - affinity.align_offs, affinity.align)) - cost += size; - - if (affinity.repr == jay_channel(var, 0)) { - /* If we are the collect representative but the final collect won't - * actually be usable, the whole vector will need to be copied. - */ - if (!util_is_aligned(r - affinity.offset, 8) || - (affinity.eot && r - affinity.offset < ra->eot_offs)) { - cost += 8; + if (affinity.repr == jay_channel(var, 0) && b_ == rr->block) { + r += affinity.offset; } - } else if (affinity.repr) { - /* If we are used for a collect but not in the right place, we will - * similarly insert copies. - */ - if (ra->reg_for_index[affinity.repr] != NO_REG && - r_reg(ra->reg_for_index[affinity.repr]) != r - affinity.offset) { - cost += size; + /* Assigning a stride that is too big may result in SIMDness splitting. + * Model that cost so we prefer packed registers. + */ + unsigned block_cost = file == GPR ? block.stride - min_stride : 0; + + /* If we are used for end-of-thread and it is not in the appropriate + * register, we will need to insert 1 copy per channel at the end. 
+ */ + if (affinity.eot && block.type != JAY_BLOCK_EOT) { + block_cost += size; + } + + /* Consider only blocks that could be picked */ + if (best_cost > block_cost) { + pick_regs_from_block(ra, file, size, alignment, I, var, is_src, + block, block_cost, affinity, &best_cost, + &best_reg, r); } } + } - for (unsigned c = 0; c < size; ++c) { - unsigned i = r + c; + /* If we chose a register roundrobin (16 is determined experimentally), + * advance it, by a whole vector if we are the representative. rr->gpr is + * block-relative while best_reg is file-relative, so rebase on the block. + */ + unsigned rr_base = partition->blocks[file][rr->block].start_gpr; + if (best_reg >= rr_base + rr->gpr && best_reg <= rr_base + rr->gpr + 16) { + bool is_repr = affinity.repr == jay_channel(var, 0); + rr->gpr = (best_reg - rr_base) + MAX2(size, is_repr ? affinity.nr : 0); - /* If the register is unavailable, account for the cost of shuffling */ - if (!BITSET_TEST(ra->available_regs[file], i) && !tied) { - cost++; - - /* ..plus the cost of shuffling back. */ - if (u_sparse_bitset_test(&ra->block->live_out, - ra->index_for_reg[file][i])) - cost++; - } - - /* Model the cost of shuffling for phis */ - if (c < jay_num_values(var)) { - struct phi_web_node *phi_web = - &ra->phi_web[phi_web_find(ra->phi_web, jay_channel(var, c))]; - if (phi_web->reg != NO_REG && r_reg(phi_web->reg) != i) { - cost += 2; - } - } - - /* Choosing this register will pin it, leaving it unavailable to later - * smaller sources which will need a move. - */ - cost += BITSET_TEST(ra->sources[file], i); - } - - if (cost < best_cost) { - best_cost = cost; - best_reg = r; - - /* If we find something with 0 cost, we are guaranteed to pick this - * register, so terminate early. This speeds up the search. - */ - if (cost == 0) { - break; - } + if (rr->gpr >= partition->blocks[file][rr->block].len_gpr) { + rr->block = ((rr->block + 1) == nr_blocks) ? 
0 : (rr->block + 1); + rr->gpr = 0; } } @@ -1322,233 +1375,6 @@ insert_parallel_copies_for_phis(jay_function *f) free(phi_dsts); } -static struct jay_register_block -block_gpr_to_grf(struct jay_partition *p, enum jay_file file, unsigned block) -{ - assert(file == GPR || file == UGPR); - assert(((p->blocks[file][block].start * 16) % p->units_x16[file]) == 0); - assert(((p->blocks[file][block].len * 16) % p->units_x16[file]) == 0); - - return (struct jay_register_block) { - .start = (p->blocks[file][block].start * 16) / p->units_x16[file], - .len = (p->blocks[file][block].len * 16) / p->units_x16[file], - }; -} - -static void -print_partition(struct jay_partition *p) -{ - for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) { - for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) { - struct jay_register_block B = block_gpr_to_grf(p, f, b); - const char *file = f ? "UGPR" : "GPR"; - - if (B.len > 1) { - fprintf(stderr, "%s: %u-%u\n", file, B.start, B.start + B.len - 1); - } else if (B.len == 1) { - fprintf(stderr, "%s: %u\n", file, B.start); - } - } - } - - fprintf(stderr, "\n"); -} - -/* - * Verify that a register partition is a bijective mapping of the GRF file. 
- */ -static void -validate_partition(struct jay_partition *p, - unsigned stride4_header_size, - unsigned nonuniform_gprs) -{ - BITSET_DECLARE(regs, JAY_NUM_PHYS_GRF) = { 0 }; - - for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) { - for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) { - struct jay_register_block B = block_gpr_to_grf(p, f, b); - if (B.len) { - assert(B.start + B.len <= JAY_NUM_PHYS_GRF && "GRF file size"); - assert(!BITSET_TEST_COUNT(regs, B.start, B.len) && "uniqueness"); - - BITSET_SET_COUNT(regs, B.start, B.len); - } - } - } - - for (unsigned i = 0; i < JAY_NUM_PHYS_GRF; ++i) { - assert(BITSET_TEST(regs, i) && "all GRFs mapped"); - } - - assert(p->large_ugpr_block.len && "partition must have a large UGPR block"); - assert(p->base2 >= p->base8 && p->base_eot >= p->base2 && "monotonic"); - assert(p->base8 >= stride4_header_size && "header is big enough"); - assert(p->base_eot + p->units_x16[GPR] <= nonuniform_gprs && "EOT fits"); - assert(util_is_aligned(p->base8, 8) && "so vectors don't cross"); - assert(util_is_aligned(p->base2, 8) && "so vectors don't cross"); - assert(util_is_aligned(p->base_eot, 8) && "so vectors don't cross"); -} - -static void -build_partition(jay_shader *shader, unsigned *blocks, unsigned n) -{ - unsigned base = 0; - unsigned ugpr_base = 0; - struct jay_partition *p = &shader->partition; - - *p = (struct jay_partition) { - .units_x16[UGPR] = jay_ugpr_per_grf(shader) * 16, - .units_x16[GPR] = 16 / jay_grf_per_gpr(shader), - }; - - for (unsigned i = 0; i < n; ++i) { - enum jay_file file = (i & 1) ? 
GPR : UGPR; - unsigned file_i = i >> 1; - - p->blocks[file][file_i].start = (base * p->units_x16[file]) / 16; - p->blocks[file][file_i].len = (blocks[i] * p->units_x16[file]) / 16; - - if (file == UGPR && blocks[i] >= 8) { - p->large_ugpr_block = (struct jay_register_block) { - .start = (ugpr_base * p->units_x16[file]) / 16, - .len = p->blocks[file][file_i].len, - }; - } - - base += blocks[i]; - if (file == UGPR) { - ugpr_base += blocks[i]; - } - } -} - -static unsigned -gpr_limit(jay_shader *shader) -{ - /* If testing spilling, set limit tightly. */ - bool test = (jay_debug & JAY_DBG_SPILL); - test &= shader->stage != MESA_SHADER_VERTEX; - - return test ? 13 : shader->num_regs[GPR]; -} - -/* - * Partition the register file for the entire shader. All functions must - * share the same partition for correctness with non-uniform function calls. - * For unlinked library functions, we must use the ABI partition (TODO). - */ -void -jay_partition_grf(jay_shader *shader) -{ - /* Calculate the maximum register demand across all functions in the shader. - * We will use this to choose a good partition. - */ - struct jay_partition *p = &shader->partition; - unsigned demand[JAY_NUM_GRF_FILES] = { 0 }; - - jay_foreach_function(shader, f) { - jay_compute_liveness(f); - jay_calculate_register_demands(f); - - demand[GPR] = MAX2(demand[GPR], f->demand[GPR]); - demand[UGPR] = MAX2(demand[UGPR], f->demand[UGPR]); - } - - /* We must have enough register file space for the register payload, plus the - * reserved UGPRs in the case we spill. That UGPR interferes with everything - * we preload so it needs to be reserved specially here for the worst case. - */ - jay_foreach_preload(jay_shader_get_entrypoint(shader), I) { - unsigned end = jay_preload_reg(I) + jay_num_values(I->dst); - unsigned extra = I->dst.file == UGPR ? 
shader->dispatch_width : 0; - assert(I->dst.file < JAY_NUM_GRF_FILES); - demand[I->dst.file] = MAX2(demand[I->dst.file], end + extra); - } - - /* Determine a good GPR/UGPR split informed by the demand calculation */ - unsigned ugpr_per_grf = jay_ugpr_per_grf(shader); - unsigned uniform_grfs = DIV_ROUND_UP(demand[UGPR], ugpr_per_grf); - - /* We must have enough for SIMD1 images (TODO: Check if this actually - * applies. Or if we could eliminate this with smarter partitioning even.) - */ - unsigned min_ugprs = 16; - min_ugprs = MAX2(min_ugprs, 256); - - unsigned grf_block_alignment = 8 * jay_grf_per_gpr(shader); /* max_vec */ - - /* TODO: We could partition more cleverly */ - uniform_grfs = CLAMP(align(uniform_grfs, grf_block_alignment), - DIV_ROUND_UP(min_ugprs, ugpr_per_grf), - 128 - (32 * jay_grf_per_gpr(shader))); - unsigned nonuniform_grfs = JAY_NUM_PHYS_GRF - uniform_grfs; - - /* Check the split */ - assert((uniform_grfs * ugpr_per_grf) >= min_ugprs); - assert(nonuniform_grfs >= 32 * jay_grf_per_gpr(shader)); - assert((uniform_grfs + nonuniform_grfs) == JAY_NUM_PHYS_GRF); - - /* Partition GRFs between GPR & UGPR */ - unsigned stride4_header_size = 0; - - if (shader->stage == MESA_SHADER_VERTEX) { - unsigned attrib_grfs = shader->prog_data->vue.urb_read_length * 8; - unsigned blocks[] = { - 1, /* UGPR: g0 */ - 8, /* GPR: URB output handle */ - shader->push_grfs, /* UGPR: Push constants */ - attrib_grfs, /* GPR: Vertex inputs */ - uniform_grfs - (blocks[0] + blocks[2]), /* UGPR: * */ - nonuniform_grfs - (blocks[1] + blocks[3]), /* GPR: * and EOT */ - }; - - build_partition(shader, blocks, ARRAY_SIZE(blocks)); - stride4_header_size = blocks[1] + blocks[3]; - } else if (shader->stage == MESA_SHADER_FRAGMENT) { - unsigned len0 = jay_grf_per_gpr(shader); - unsigned payload_grfs = shader->payload_gprs * len0; - - unsigned blocks[] = { - len0, /* UGPR: g0 (and maybe g1) */ - payload_grfs, /* GPR: Barycentrics */ - uniform_grfs - len0, /* UGPR: Dispatch (eg push 
constants) & general */ - nonuniform_grfs - payload_grfs, /* GPR: General & EOT */ - }; - build_partition(shader, blocks, ARRAY_SIZE(blocks)); - stride4_header_size = blocks[1]; - } else { - unsigned blocks[] = { uniform_grfs - 4, nonuniform_grfs, 4 }; - build_partition(shader, blocks, ARRAY_SIZE(blocks)); - } - - /* TODO: Make the stride partition smarter */ - unsigned nonuniform_gprs = nonuniform_grfs / jay_grf_per_gpr(shader); - unsigned eot_gprs = 16 / jay_grf_per_gpr(shader); - p->base8 = ROUND_DOWN_TO(nonuniform_gprs - (16 + eot_gprs), 8) + 0; - p->base2 = 8 + p->base8; - p->base_eot = 8 + p->base2; - - // print_partition(p); - validate_partition(p, stride4_header_size, nonuniform_gprs); - - /* By construction of our partition, the entire GRF is used. */ - shader->prog_data->base.grf_used = JAY_NUM_PHYS_GRF; - - /* Set the targets for the virtual register file accordingly */ - for (unsigned f = 0; f < JAY_NUM_GRF_FILES; ++f) { - for (unsigned b = 0; b < JAY_PARTITION_BLOCKS; ++b) { - shader->num_regs[f] += p->blocks[f][b].len; - } - } - - /* This should be an acceptable upper limit since we assign memory tightly - * thanks to the usual SSA allocator guarantees. - */ - if (demand[GPR] > gpr_limit(shader)) { - shader->num_regs[MEM] = demand[GPR]; - } -} - static void jay_register_allocate_function(jay_function *f) { @@ -1556,15 +1382,10 @@ jay_register_allocate_function(jay_function *f) jay_ra_state ra = { .b.shader = shader, .b.func = f }; /* Spill as needed to fit within the limits. 
*/ - unsigned limit = gpr_limit(f->shader); + unsigned limit = jay_gpr_limit(f->shader); bool spilled = f->demand[GPR] > limit; if (spilled) { - /* Spilling requires reserving UGPRs for spilling */ - unsigned reservation = f->shader->dispatch_width; - f->shader->num_regs[UGPR] -= reservation; - f->shader->partition.large_ugpr_block.len -= reservation; - jay_spill(f, limit); jay_validate(f->shader, "spilling"); jay_compute_liveness(f); @@ -1588,17 +1409,6 @@ jay_register_allocate_function(jay_function *f) typed_memcpy(ra.num_regs, shader->num_regs, JAY_NUM_RA_FILES); - /* The end of the register file is allowed for end-of-thread messages. - * Calculate the offset in GPRs. Compute shaders have this as UGPRs while - * fragment shaders have this as GPRs. - */ - if (mesa_shader_stage_is_compute(shader->stage)) { - ra.eot_offs = ROUND_DOWN_TO(ra.num_regs[UGPR], jay_ugpr_per_grf(shader)) - - jay_ugpr_per_grf(shader); - } else { - ra.eot_offs = ra.num_regs[GPR] - (16 / jay_grf_per_gpr(shader)); - } - linear_ctx *lin_ctx = linear_context(shader); ra.reg_for_index = linear_alloc_array(lin_ctx, jay_reg, f->ssa_alloc); diff --git a/src/intel/compiler/jay/jay_to_binary.c b/src/intel/compiler/jay/jay_to_binary.c index 3af3fbd71b5..8615e252fb6 100644 --- a/src/intel/compiler/jay/jay_to_binary.c +++ b/src/intel/compiler/jay/jay_to_binary.c @@ -10,7 +10,6 @@ #include "compiler/brw/brw_eu_inst.h" #include "compiler/brw/brw_reg.h" #include "compiler/brw/brw_reg_type.h" -#include "dev/intel_debug.h" #include "util/macros.h" #include "util/u_dynarray.h" #include "util/u_math.h" @@ -42,45 +41,29 @@ to_brw_reg_type(enum jay_type type) /* clang-format on */ } -static inline unsigned -to_def_grf_16(struct jay_partition *p, jay_def d) -{ - unsigned count = jay_num_values(d); - if (count == 0 || !(d.file == GPR || d.file == UGPR)) { - return d.reg; - } - - unsigned base = 0; - for (unsigned i = 0; i < JAY_PARTITION_BLOCKS; ++i) { - unsigned offset = d.reg - base; - - if (offset < 
p->blocks[d.file][i].len) { - assert(offset + count <= p->blocks[d.file][i].len && - "vectors must not cross partition boundaries"); - - return (p->blocks[d.file][i].start + offset) * 2 + d.hi; - } - - base += p->blocks[d.file][i].len; - } - - UNREACHABLE("virtual register must be in a block"); -} - static inline brw_reg -to_brw_reg(jay_function *f, - const jay_inst *I, - signed idx, - unsigned simd_offs, - bool force_hi) +to_brw_reg( + jay_function *f, const jay_inst *I, signed idx, unsigned simd_offs, bool hi) { bool is_dest = idx < 0; enum jay_type type = is_dest ? I->type : jay_src_type(I, idx); jay_def d = is_dest ? I->dst : I->src[idx]; - d.hi |= force_hi; + hi |= d.hi; struct brw_reg R; - unsigned reg = to_def_grf_16(&f->shader->partition, d), offset_B = 0; + unsigned reg = d.reg, count = jay_num_values(d); + unsigned offset_B = 0, grf = 0; + assert(!hi || d.file == GPR); + + if (count && (d.file == GPR || d.file == UGPR)) { + struct jay_register_block block = + jay_lookup_block(&f->shader->partition, d.reg, d.file); + + grf = block.start_grf; + reg -= block.start_gpr; + + assert(reg + count <= block.len_gpr && "must not cross partitions"); + } if (jay_is_imm(d)) { /* Immediates have size restrictions but can zero extend */ @@ -95,13 +78,13 @@ to_brw_reg(jay_function *f, } else if (jay_is_null(d)) { R = brw_null_reg(); } else if (d.file == UGPR || d.file == UACCUM) { - unsigned phys_reg = (reg >> 1) / 8; - offset_B = ((reg >> 1) % 8) * 4; + grf += (reg / jay_ugpr_per_grf(f->shader)); + offset_B = (reg % jay_ugpr_per_grf(f->shader)) * 4; if (d.file == UGPR) { - R = brw_ud1_grf(phys_reg, 0); + R = brw_ud1_grf(grf * 2, 0); } else { - R = brw_ud1_reg(ARF, BRW_ARF_ACCUMULATOR + (phys_reg * 2), 0); + R = brw_ud1_reg(ARF, BRW_ARF_ACCUMULATOR + (grf * 2), 0); } /* Handle 3-src restrictions and vectorized uniform code. 
*/ @@ -140,22 +123,22 @@ to_brw_reg(jay_function *f, unsigned stride_bits = jay_stride_to_bits(def_stride); unsigned simd_width = jay_simd_width_physical(f->shader, I); - unsigned phys_reg; if (def_stride == JAY_STRIDE_2) { - /* Bit 0 selects between lo/hi halves of the GPR */ - phys_reg = (reg / 2) * jay_grf_per_gpr(f->shader); - offset_B = (reg & 1) * 2 * f->shader->dispatch_width; + /* Select between lo/hi halves of the GPR */ + grf += reg * jay_grf_per_gpr(f->shader); + offset_B = hi ? 2 * f->shader->dispatch_width : 0; } else { - /* Low bits are an offset in 2-byte words into the GRF */ + /* Treat low bits as an offset in 2-byte words into the GRF */ + unsigned r = (reg * 2) + hi; unsigned mask = BITFIELD_MASK(stride_bits / 32); - phys_reg = ((reg & ~mask) / 2) * jay_grf_per_gpr(f->shader); - offset_B = (reg & mask) * 2; + grf += ((r & ~mask) / 2) * jay_grf_per_gpr(f->shader); + offset_B = (r & mask) * 2; } if (d.file == GPR) { - R = xe2_vec8_grf(phys_reg, 0); + R = xe2_vec8_grf(grf, 0); } else { - R = brw_vecn_reg(8, ARF, BRW_ARF_ACCUMULATOR + (phys_reg * 2), 0); + R = brw_vecn_reg(8, ARF, BRW_ARF_ACCUMULATOR + grf, 0); } R = byte_offset(R, simd_offs * simd_width * stride_bits / 8); @@ -524,8 +507,13 @@ emit(struct brw_codegen *p, case JAY_OPCODE_SHUFFLE: { struct brw_reg a0 = brw_address_reg(0); - unsigned grf_16 = to_def_grf_16(&f->shader->partition, I->src[0]); - unsigned offset_B = grf_16 * 2 * f->shader->dispatch_width; + assert(I->src[0].file == GPR && jay_num_values(I->src[0]) == 1); + struct jay_register_block block = + jay_lookup_block(&f->shader->partition, I->src[0].reg, GPR); + + unsigned offset_B = + (block.start_grf * 64) + + ((I->src[0].reg - block.start_gpr) * 4 * f->shader->dispatch_width); brw_ADD(p, a0, subscript(SRC(1), BRW_TYPE_UW, 0), brw_imm_uw(offset_B)); brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), BRW_TYPE_UD)); diff --git a/src/intel/compiler/jay/meson.build b/src/intel/compiler/jay/meson.build index a367de71380..d4f5f1a0353 
100644 --- a/src/intel/compiler/jay/meson.build +++ b/src/intel/compiler/jay/meson.build @@ -62,6 +62,7 @@ libintel_compiler_jay_files = files( 'jay_opt_dead_code.c', 'jay_opt_predicate.c', 'jay_opt_propagate.c', + 'jay_partition.c', 'jay_print.c', 'jay_private.h', 'jay_prog_data.c', diff --git a/src/intel/compiler/jay/test/jay_test.h b/src/intel/compiler/jay/test/jay_test.h index 43cc48b87ef..086129396fe 100644 --- a/src/intel/compiler/jay/test/jay_test.h +++ b/src/intel/compiler/jay/test/jay_test.h @@ -8,7 +8,6 @@ #include #include "jay_builder.h" #include "jay_ir.h" -#include "jay_private.h" #include "shader_enums.h" static inline jay_block * @@ -25,13 +24,17 @@ jay_test_builder(void *memctx) { jay_shader *s = jay_new_shader(memctx, MESA_SHADER_COMPUTE); jay_function *f = jay_new_function(s); - s->partition.base8 = 8; struct intel_device_info *devinfo = rzalloc(memctx, struct intel_device_info); s->devinfo = devinfo; s->dispatch_width = 32; + s->partition.blocks[GPR][s->partition.nr_blocks[GPR]++] = { + .len_gpr = 32, + .stride = JAY_STRIDE_4, + }; + unsigned verx10 = 200; devinfo->verx10 = verx10; devinfo->ver = verx10 / 10;