diff --git a/src/panfrost/bifrost/bi_pack.c b/src/panfrost/bifrost/bi_pack.c index df3f09030fb..2bac35d5e55 100644 --- a/src/panfrost/bifrost/bi_pack.c +++ b/src/panfrost/bifrost/bi_pack.c @@ -463,7 +463,7 @@ static struct bi_packed_tuple bi_pack_tuple(bi_clause *clause, bi_tuple *tuple, bi_tuple *prev, bool first_tuple, gl_shader_stage stage) { bi_assign_slots(tuple, prev); - bi_assign_fau_idx(clause, tuple); + tuple->regs.fau_idx = tuple->fau_idx; tuple->regs.first_instruction = first_tuple; bi_flip_slots(&tuple->regs); @@ -509,36 +509,54 @@ bi_pack_tuple(bi_clause *clause, bi_tuple *tuple, bi_tuple *prev, bool first_tup return packed; } -/* Packs the next two constants as a dedicated constant quadword at the end of - * the clause, returning the number packed. There are two cases to consider: - * - * Case #1: Branching is not used. For a single constant copy the upper nibble - * over, easy. - * - * Case #2: Branching is used. For a single constant, it suffices to set the - * upper nibble to 4 and leave the latter constant 0, which matches what the - * blob does. - * - * Extending to multiple constants is considerably more tricky and left for - * future work. +/* A block contains at most one PC-relative constant, from a terminal branch. + * Find the last instruction and if it is a relative branch, fix up the + * PC-relative constant to contain the absolute offset. This occurs at pack + * time instead of schedule time because the number of quadwords between each + * block is not known until after all other passes have finished. */ -static unsigned -bi_pack_constants(bi_context *ctx, bi_clause *clause, - unsigned word_idx, bool ec0_packed, +static void +bi_assign_branch_offset(bi_context *ctx, bi_block *block) +{ + if (list_is_empty(&block->clauses)) + return; + + bi_clause *clause = list_last_entry(&block->clauses, bi_clause, link); + bi_instr *br = bi_last_instr_in_clause(clause); + + if (!br->branch_target) + return; + + /* Put it in the high place */ + int32_t qwords = bi_block_offset(ctx, clause, br->branch_target); + int32_t bytes = qwords * 16; + + /* Copy so we can toy with the sign without undefined behaviour */ + uint32_t raw = 0; + memcpy(&raw, &bytes, sizeof(raw)); + + /* Clear off top bits for A1/B1 bits */ + raw &= ~0xF0000000; + + /* Put in top 32-bits */ + assert(clause->pcrel_idx < 8); + clause->constants[clause->pcrel_idx] |= ((uint64_t) raw) << 32ull; +} + +static void +bi_pack_constants(unsigned tuple_count, uint64_t *constants, + unsigned word_idx, unsigned constant_words, bool ec0_packed, struct util_dynarray *emission) { unsigned index = (word_idx << 1) + ec0_packed; - /* After these two, are we done? Determines tag */ - bool done = clause->constant_count <= (index + 2); - - /* Is the constant we're packing for a branch? */ - bool branches = clause->branch_constant && done; + /* Do more constants follow */ + bool more = (word_idx + 1) < constant_words; /* Indexed first by tuple count and second by constant word number, * indicates the position in the clause */ - unsigned pos[8][3] = { + unsigned pos_lookup[8][3] = { { 0 }, { 1 }, { 3 }, @@ -549,57 +567,20 @@ bi_pack_constants(bi_context *ctx, bi_clause *clause, { 9, 12 } }; - /* Compute branch offset instead of a dummy 0 */ - bool terminal_branch = true; - - if (branches) { - bi_instr *br = clause->tuples[clause->tuple_count - 1].add; - assert(br && br->branch_target); - - if (!bi_is_terminal_block(br->branch_target)) { - /* Put it in the high place */ - int32_t qwords = bi_block_offset(ctx, clause, br->branch_target); - int32_t bytes = qwords * 16; - - /* Copy so we get proper sign behaviour */ - uint32_t raw = 0; - memcpy(&raw, &bytes, sizeof(raw)); - - /* Clear off top bits for the magic bits */ - raw &= ~0xF0000000; - terminal_branch = false; - - /* Put in top 32-bits */ - clause->constants[index + 0] = ((uint64_t) raw) << 32ull; - } - } - - uint64_t hi = clause->constants[index + 0] >> 60ull; + /* Compute the pos, and check everything is reasonable */ + assert((tuple_count - 1) < 8); + assert(word_idx < 3); + unsigned pos = pos_lookup[tuple_count - 1][word_idx]; + assert(pos != 0 || (tuple_count == 1 && word_idx == 0)); struct bifrost_fmt_constant quad = { - .pos = pos[clause->tuple_count - 1][word_idx], /* TODO */ - .tag = done ? BIFROST_FMTC_FINAL : BIFROST_FMTC_CONSTANTS, - .imm_1 = clause->constants[index + 0] >> 4, - .imm_2 = ((hi < 8) ? (hi << 60ull) : 0) >> 4, + .pos = pos, + .tag = more ? BIFROST_FMTC_CONSTANTS : BIFROST_FMTC_FINAL, + .imm_1 = constants[index + 0] >> 4, + .imm_2 = constants[index + 1] >> 4, }; - if (branches && !terminal_branch) { - /* Branch offsets are less than 60-bits so this should work at - * least for now */ - quad.imm_1 |= (4ull << 60ull) >> 4; - assert (hi == 0); - } - - /* XXX: On G71, Connor observed that the difference of the top 4 bits - * of the second constant with the first must be less than 8, otherwise - * we have to swap them. On G52, I'm able to reproduce a similar issue - * but with a different workaround (modeled above with a single - * constant, unclear how to workaround for multiple constants.) Further - * investigation needed. Possibly an errata. XXX */ - util_dynarray_append(emission, struct bifrost_fmt_constant, quad); - - return 2; } static inline uint8_t @@ -800,9 +781,6 @@ bi_pack_clause(bi_context *ctx, bi_clause *clause, struct util_dynarray *emission, gl_shader_stage stage, bool tdd) { - /* TODO After the deadline lowering */ - bi_lower_cubeface2(ctx, &clause->tuples[0]); - struct bi_packed_tuple ins[8] = { 0 }; for (unsigned i = 0; i < clause->tuple_count; ++i) { @@ -857,8 +835,8 @@ bi_pack_clause(bi_context *ctx, bi_clause *clause, /* Pack the remaining constants */ for (unsigned pos = 0; pos < constant_quads; ++pos) { - bi_pack_constants(ctx, clause, pos, ec0_packed, - emission); + bi_pack_constants(clause->tuple_count, clause->constants, + pos, constant_quads, ec0_packed, emission); } } @@ -909,6 +887,8 @@ bi_pack(bi_context *ctx, struct util_dynarray *emission) bi_foreach_block(ctx, _block) { bi_block *block = (bi_block *) _block; + bi_assign_branch_offset(ctx, block); + /* Passthrough the first clause of where we're branching to for * the last clause of the block (the clause with the branch) */ diff --git a/src/panfrost/bifrost/bi_ra.c b/src/panfrost/bifrost/bi_ra.c index ed2b1c3de71..a4f1fe81444 100644 --- a/src/panfrost/bifrost/bi_ra.c +++ b/src/panfrost/bifrost/bi_ra.c @@ -266,11 +266,14 @@ bi_spill_dest(bi_builder *b, bi_index index, bi_index temp, uint32_t offset, { b->cursor = bi_after_clause(clause); - bi_instr *st = bi_store_to(b, channels * 32, bi_null(), - temp, bi_imm_u32(offset), bi_zero(), BI_SEG_TL); + /* setup FAU as [offset][0] */ + bi_instr *st = bi_store_to(b, channels * 32, bi_null(), temp, + bi_passthrough(BIFROST_SRC_FAU_LO), + bi_passthrough(BIFROST_SRC_FAU_HI), + BI_SEG_TL); bi_clause *singleton = bi_singleton(b->shader, st, block, 0, (1 << 0), - true); + offset, true); list_add(&singleton->link, &clause->link); b->shader->spills++; @@ -281,12 +284,14 @@ bi_fill_src(bi_builder *b, bi_index index, bi_index temp, uint32_t offset, bi_clause *clause, bi_block *block, unsigned channels) { b->cursor = bi_before_clause(clause); - bi_instr *ld = bi_load_to(b, channels * 32, temp, bi_imm_u32(offset), - bi_zero(), BI_SEG_TL); + bi_instr *ld = bi_load_to(b, channels * 32, temp, + bi_passthrough(BIFROST_SRC_FAU_LO), + bi_passthrough(BIFROST_SRC_FAU_HI), + BI_SEG_TL); ld->no_spill = true; bi_clause *singleton = bi_singleton(b->shader, ld, block, 0, - (1 << 0), true); + (1 << 0), offset, true); list_addtail(&singleton->link, &clause->link); b->shader->fills++; diff --git a/src/panfrost/bifrost/bi_schedule.c b/src/panfrost/bifrost/bi_schedule.c index 29efa23c228..46e31646134 100644 --- a/src/panfrost/bifrost/bi_schedule.c +++ b/src/panfrost/bifrost/bi_schedule.c @@ -239,6 +239,7 @@ bi_singleton(void *memctx, bi_instr *ins, bi_block *block, unsigned scoreboard_id, unsigned dependencies, + uint64_t combined_constant, bool osrb) { bi_clause *u = rzalloc(memctx, bi_clause); @@ -266,42 +267,14 @@ bi_singleton(void *memctx, bi_instr *ins, /* Let's be optimistic, we'll fix up later */ u->flow_control = BIFROST_FLOW_NBTB; - /* Build up a combined constant, count in 32-bit words */ - uint64_t combined_constant = 0; - unsigned constant_count = 0; + assert(!ins->branch_target); - bi_foreach_src(ins, s) { - if (ins->src[s].type != BI_INDEX_CONSTANT) continue; - unsigned value = ins->src[s].value; - - /* Allow fast zero */ - if (value == 0 && u->tuples[0].fma) continue; - - if (constant_count == 0) { - combined_constant = ins->src[s].value; - } else if (constant_count == 1) { - /* Allow reuse */ - if (combined_constant == value) - continue; - - combined_constant |= ((uint64_t) value) << 32ull; - } else { - /* No more room! */ - assert((combined_constant & 0xffffffff) == value || - (combined_constant >> 32ull) == value); - } - - constant_count++; - } - - if (ins->branch_target) - u->branch_constant = true; - - /* XXX: Investigate errors when constants are not used */ - if (constant_count || u->branch_constant || true) { + if (combined_constant) { /* Clause in 64-bit, above in 32-bit */ u->constant_count = 1; u->constants[0] = combined_constant; + u->tuples[0].fau_idx = bi_constant_field(0) | + (combined_constant & 0xF); } u->next_clause_prefetch = (ins->op != BI_OPCODE_JUMP); @@ -414,44 +387,6 @@ bi_reads_t(bi_instr *ins, unsigned src) } } -/* Eventually, we'll need a proper scheduling, grouping instructions - * into clauses and ordering/assigning grouped instructions to the - * appropriate FMA/ADD slots. Right now we do the dumbest possible - * thing just to have the scheduler stubbed out so we can focus on - * codegen */ - -void -bi_schedule(bi_context *ctx) -{ - bool is_first = true; - - bi_foreach_block(ctx, block) { - bi_block *bblock = (bi_block *) block; - - list_inithead(&bblock->clauses); - - bi_foreach_instr_in_block(bblock, ins) { - bi_clause *u = bi_singleton(ctx, ins, - bblock, 0, (1 << 0), - !is_first); - - is_first = false; - list_addtail(&u->link, &bblock->clauses); - } - - /* Back-to-back bit affects only the last clause of a block, - * the rest are implicitly true */ - - if (!list_is_empty(&bblock->clauses)) { - bi_clause *last_clause = list_last_entry(&bblock->clauses, bi_clause, link); - if (!bi_back_to_back(bblock)) - last_clause->flow_control = BIFROST_FLOW_NBTB_UNCONDITIONAL; - } - - bblock->scheduled = true; - } -} - /* Counts the number of 64-bit constants required by a clause. TODO: We * might want to account for merging, right now we overestimate, but * that's probably fine most of the time */ @@ -1427,6 +1362,16 @@ bi_schedule_block(bi_context *ctx, bi_block *block) bi_free_worklist(st); } +void +bi_schedule(bi_context *ctx) +{ + bi_foreach_block(ctx, block) { + bi_block *bblock = (bi_block *) block; + bi_schedule_block(ctx, bblock); + bi_opt_dead_code_eliminate(ctx, bblock, true); + } +} + #ifndef NDEBUG static bi_builder * diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c index 19830549472..0a8f299f983 100644 --- a/src/panfrost/bifrost/bifrost_compile.c +++ b/src/panfrost/bifrost/bifrost_compile.c @@ -1669,22 +1669,17 @@ bi_emit_cube_coord(bi_builder *b, bi_index coord, bi_index *face, bi_index *s, bi_index *t) { /* Compute max { |x|, |y|, |z| } */ - bi_index cubeface1 = bi_cubeface1(b, coord, + bi_instr *cubeface = bi_cubeface_to(b, bi_temp(b->shader), coord, bi_word(coord, 1), bi_word(coord, 2)); - - /* Calculate packed exponent / face / infinity. In reality this reads - * the destination from cubeface1 but that's handled by lowering */ - bi_instr *cubeface2 = bi_cubeface1_to(b, bi_temp(b->shader), coord, - bi_word(coord, 1), bi_word(coord, 2)); - cubeface2->op = BI_OPCODE_CUBEFACE2; /* XXX: DEEP VOODOO */ + cubeface->dest[1] = bi_temp(b->shader); /* Select coordinates */ bi_index ssel = bi_cube_ssel(b, bi_word(coord, 2), coord, - cubeface2->dest[0]); + cubeface->dest[1]); bi_index tsel = bi_cube_tsel(b, bi_word(coord, 1), bi_word(coord, 2), - cubeface2->dest[0]); + cubeface->dest[1]); /* The OpenGL ES specification requires us to transform an input vector * (x, y, z) to the coordinate, given the selected S/T: @@ -1700,7 +1695,7 @@ bi_emit_cube_coord(bi_builder *b, bi_index coord, * Take the reciprocal of max{x, y, z} */ - bi_index rcp = bi_frcp_f32(b, cubeface1); + bi_index rcp = bi_frcp_f32(b, cubeface->dest[0]); /* Calculate 0.5 * (1.0 / max{x, y, z}) */ bi_index fma1 = bi_fma_f32(b, rcp, bi_imm_f32(0.5f), bi_zero(), @@ -1722,7 +1717,7 @@ bi_emit_cube_coord(bi_builder *b, bi_index coord, * because the TEXS_CUBE and TEXC instructions expect the face index to * be at this position. */ - *face = cubeface2->dest[0]; + *face = cubeface->dest[1]; } /* Emits a cube map descriptor, returning lower 32-bits and putting upper diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h index 138d49d6f49..346ac86a72b 100644 --- a/src/panfrost/bifrost/compiler.h +++ b/src/panfrost/bifrost/compiler.h @@ -743,6 +743,7 @@ bi_singleton(void *memctx, bi_instr *ins, bi_block *block, unsigned scoreboard_id, unsigned dependencies, + uint64_t combined_constant, bool osrb); /* Liveness */