From 1f0370616a622c53de715611d2f9ba7dd8f22b26 Mon Sep 17 00:00:00 2001 From: Lars-Ivar Hesselberg Simonsen Date: Wed, 25 Mar 2026 16:04:01 +0100 Subject: [PATCH] pan: Centralize preload registers Rather than having preload registers hardcoded over multiple files, gather them in one place with an enum abstraction. This should simplify updates to the preload registers. Reviewed-by: Eric R. Smith Reviewed-by: Lorenzo Rossi Part-of: --- src/panfrost/compiler/bifrost/bi_ra.c | 57 +++++--- .../compiler/bifrost/bifrost_compile.c | 95 ++++++++----- src/panfrost/compiler/bifrost/compiler.h | 131 ++++++++++++++++++ .../compiler/bifrost/valhall/va_pack.c | 6 +- 4 files changed, 228 insertions(+), 61 deletions(-) diff --git a/src/panfrost/compiler/bifrost/bi_ra.c b/src/panfrost/compiler/bifrost/bi_ra.c index 990500a7ac4..e10ee92bfa3 100644 --- a/src/panfrost/compiler/bifrost/bi_ra.c +++ b/src/panfrost/compiler/bifrost/bi_ra.c @@ -321,8 +321,9 @@ bi_make_affinity(uint64_t clobber, unsigned count, bool split_file) static void bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, uint64_t preload_live, unsigned node_count, bool is_blend, - bool split_file, bool aligned_sr) + bool split_file, unsigned arch) { + bool aligned_sr = arch >= 9; bi_foreach_instr_in_block_rev(block, ins) { /* Mark all registers live after the instruction as * interfering with the destination */ @@ -383,8 +384,10 @@ bi_mark_interference(bi_block *block, struct lcra_state *l, uint8_t *live, } if (!is_blend && ins->op == BI_OPCODE_BLEND) { - /* Blend shaders might clobber r0-r15, r48. */ - uint64_t clobber = BITFIELD64_MASK(16) | BITFIELD64_BIT(48); + /* Blend shaders might clobber r0-r15, blend link reg. 
*/ + uint64_t clobber = + BITFIELD64_MASK(16) | + BITFIELD64_BIT(bi_preload_reg(BI_PRELOAD_BLEND_LINK, arch)); for (unsigned i = 0; i < node_count; ++i) { if (live[i]) @@ -410,7 +413,7 @@ bi_compute_interference(bi_context *ctx, struct lcra_state *l, bool full_regs) uint8_t *live = mem_dup(blk->live_out, ctx->ssa_alloc); bi_mark_interference(blk, l, live, blk->reg_live_out, ctx->ssa_alloc, - ctx->inputs->is_blend, !full_regs, ctx->arch >= 9); + ctx->inputs->is_blend, !full_regs, ctx->arch); free(live); } @@ -438,36 +441,43 @@ bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs) bi_foreach_dest(ins, d) l->affinity[ins->dest[d].value] = default_affinity; - /* Blend shaders expect the src colour to be in r0-r3 */ + /* Blend shaders expect the src colour to be in blend_src0_c0 + * through c3 */ if (ins->op == BI_OPCODE_BLEND && !ctx->inputs->is_blend) { assert(bi_is_ssa(ins->src[0])); - l->solutions[ins->src[0].value] = 0; + l->solutions[ins->src[0].value] = + bi_preload_reg(BI_PRELOAD_BLEND_SRC0_C0, ctx->arch); - /* Dual source blend input in r4-r7 */ + /* Dual source blend input in blend_src1_c0 through c3 */ if (bi_is_ssa(ins->src[4])) - l->solutions[ins->src[4].value] = 4; + l->solutions[ins->src[4].value] = + bi_preload_reg(BI_PRELOAD_BLEND_SRC1_C0, ctx->arch); - /* Writes to R48 */ + /* Writes to blend link */ if (!bi_is_null(ins->dest[0])) - l->solutions[ins->dest[0].value] = 48; + l->solutions[ins->dest[0].value] = + bi_preload_reg(BI_PRELOAD_BLEND_LINK, ctx->arch); } - /* Coverage mask writes stay in R60 */ + /* Coverage mask writes stay in the cumulative coverage reg */ if ((ins->op == BI_OPCODE_ATEST || ins->op == BI_OPCODE_ZS_EMIT) && !bi_is_null(ins->dest[0])) { - l->solutions[ins->dest[0].value] = 60; + l->solutions[ins->dest[0].value] = + bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch); } /* Experimentally, it seems coverage masks inputs to ATEST must - * be in R60. 
Otherwise coverage mask writes do not work with - * early-ZS with pixel-frequency-shading (this combination of - * settings is legal if depth/stencil writes are disabled). - * Allowing a FAU index also seems to work on Valhall, at least. + * be in the cumulative coverage reg. Otherwise coverage mask + * writes do not work with early-ZS with pixel-frequency-shading + * (this combination of settings is legal if depth/stencil + * writes are disabled). Allowing a FAU index also seems to + * work on Valhall, at least. */ if (ins->op == BI_OPCODE_ATEST) { assert(bi_is_ssa(ins->src[0]) || ins->src[0].type == BI_INDEX_FAU); if (bi_is_ssa(ins->src[0])) - l->solutions[ins->src[0].value] = 60; + l->solutions[ins->src[0].value] = + bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch); } } @@ -492,8 +502,10 @@ bi_allocate_registers(bi_context *ctx, bool *success, bool full_regs) if (ctx->inputs->is_blend) { /* We're allowed to coalesce the moves to these */ - affinity |= BITFIELD64_BIT(48); - affinity |= BITFIELD64_BIT(60); + affinity |= + BITFIELD64_BIT(bi_preload_reg(BI_PRELOAD_BLEND_LINK, ctx->arch)); + affinity |= BITFIELD64_BIT( + bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch)); } /* Try to coalesce */ @@ -595,14 +607,15 @@ bi_choose_spill_node(bi_context *ctx, struct lcra_state *l) bi_foreach_instr_global(ctx, ins) { bi_foreach_dest(ins, d) { /* Don't allow spilling coverage mask writes because the - * register preload logic assumes it will stay in R60. - * This could be optimized. + * register preload logic assumes it will stay in the + * cumulative coverage reg. This could be optimized. 
*/ if (ins->no_spill || ins->op == BI_OPCODE_ATEST || ins->op == BI_OPCODE_ZS_EMIT || (ins->op == BI_OPCODE_MOV_I32 && ins->src[0].type == BI_INDEX_REGISTER && - ins->src[0].value == 60)) { + ins->src[0].value == + bi_preload_reg(BI_PRELOAD_CUMULATIVE_COVERAGE, ctx->arch))) { BITSET_SET(no_spill, ins->dest[d].value); } } diff --git a/src/panfrost/compiler/bifrost/bifrost_compile.c b/src/panfrost/compiler/bifrost/bifrost_compile.c index 03d285a57ce..f38e4af223d 100644 --- a/src/panfrost/compiler/bifrost/bifrost_compile.c +++ b/src/panfrost/compiler/bifrost/bifrost_compile.c @@ -32,8 +32,9 @@ static void pan_stats_verbose(FILE *f, const char *prefix, bi_context *ctx, static bi_block *emit_cf_list(bi_context *ctx, struct exec_list *list); static bi_index -bi_preload(bi_builder *b, unsigned reg) +bi_preload(bi_builder *b, enum bi_preload val) { + unsigned reg = bi_preload_reg(val, b->shader->arch); if (bi_is_null(b->shader->preloaded[reg])) { /* Insert at the beginning of the shader */ bi_builder b_ = *b; @@ -50,7 +51,7 @@ static bi_index bi_coverage(bi_builder *b) { if (bi_is_null(b->shader->coverage)) - b->shader->coverage = bi_preload(b, 60); + b->shader->coverage = bi_preload(b, BI_PRELOAD_CUMULATIVE_COVERAGE); return b->shader->coverage; } @@ -63,20 +64,20 @@ bi_coverage(bi_builder *b) static inline bi_index bi_vertex_id(bi_builder *b) { - return bi_preload(b, (b->shader->arch >= 9) ? 60 : 61); + return bi_preload(b, BI_PRELOAD_VERTEX_ID); } static inline bi_index bi_instance_id(bi_builder *b) { - return bi_preload(b, (b->shader->arch >= 9) ? 
61 : 62); + return bi_preload(b, BI_PRELOAD_INSTANCE_ID); } static inline bi_index bi_draw_id(bi_builder *b) { assert(b->shader->arch >= 9); - return bi_preload(b, 62); + return bi_preload(b, BI_PRELOAD_DRAW_ID); } static void @@ -258,8 +259,9 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr) { switch (intr->intrinsic) { case nir_intrinsic_load_barycentric_centroid: + return bi_preload(b, BI_PRELOAD_CENTROID_ID); case nir_intrinsic_load_barycentric_sample: - return bi_preload(b, 61); + return bi_preload(b, BI_PRELOAD_SAMPLE_ID); /* Need to put the sample ID in the top 16-bits */ case nir_intrinsic_load_barycentric_at_sample: @@ -328,7 +330,8 @@ bi_varying_src0_for_barycentric(bi_builder *b, nir_intrinsic_instr *intr) case nir_intrinsic_load_barycentric_pixel: default: - return b->shader->arch >= 9 ? bi_preload(b, 61) : bi_dontcare(b); + return b->shader->arch >= 9 ? bi_preload(b, BI_PRELOAD_CENTROID_ID) + : bi_dontcare(b); } } @@ -550,7 +553,8 @@ bi_emit_lea_attr(bi_builder *b, nir_intrinsic_instr *intr) unsigned snap4 = 0x5E; uint32_t format = identity | (snap4 << 12) | (regfmt << 24); bi_collect_v3i32_to(b, bi_def_index(&intr->def), - bi_preload(b, 58), bi_preload(b, 59), + bi_preload(b, BI_PRELOAD_POS_RESULT_PTR_LO), + bi_preload(b, BI_PRELOAD_POS_RESULT_PTR_HI), bi_imm_u32(format)); return; } @@ -838,8 +842,8 @@ bi_load_sample_id_to(bi_builder *b, bi_index dst) * seem to read garbage (despite being architecturally defined * as zero), so use a 5-bit mask instead of 8-bits */ - bi_rshift_and_i32_to(b, dst, bi_preload(b, 61), bi_imm_u32(0x1f), - bi_imm_u8(16), false); + bi_rshift_and_i32_to(b, dst, bi_preload(b, BI_PRELOAD_SAMPLE_ID), + bi_imm_u32(0x1f), bi_imm_u8(16), false); } static bi_index @@ -872,12 +876,24 @@ static void bi_emit_load_blend_input(bi_builder *b, nir_intrinsic_instr *instr) { nir_io_semantics sem = nir_intrinsic_io_semantics(instr); - unsigned base = sem.dual_source_blend_index * 4; unsigned size = 
nir_alu_type_get_type_size(nir_intrinsic_dest_type(instr)); assert(size == 16 || size == 32); - bi_index srcs[] = {bi_preload(b, base + 0), bi_preload(b, base + 1), - bi_preload(b, base + 2), bi_preload(b, base + 3)}; + bi_index srcs[4]; + switch (sem.dual_source_blend_index) { + case 0: + srcs[0] = bi_preload(b, BI_PRELOAD_BLEND_SRC0_C0); + srcs[1] = bi_preload(b, BI_PRELOAD_BLEND_SRC0_C1); + srcs[2] = bi_preload(b, BI_PRELOAD_BLEND_SRC0_C2); + srcs[3] = bi_preload(b, BI_PRELOAD_BLEND_SRC0_C3); + break; + case 1: + srcs[0] = bi_preload(b, BI_PRELOAD_BLEND_SRC1_C0); + srcs[1] = bi_preload(b, BI_PRELOAD_BLEND_SRC1_C1); + srcs[2] = bi_preload(b, BI_PRELOAD_BLEND_SRC1_C2); + srcs[3] = bi_preload(b, BI_PRELOAD_BLEND_SRC1_C3); + break; + } bi_emit_collect_to(b, bi_def_index(&instr->def), srcs, size == 32 ? 4 : 2); } @@ -1759,7 +1775,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_cumulative_coverage_pan: - bi_mov_i32_to(b, dst, bi_preload(b, 60)); + bi_mov_i32_to(b, dst, bi_preload(b, BI_PRELOAD_CUMULATIVE_COVERAGE)); break; case nir_intrinsic_load_blend_descriptor_pan: { @@ -1841,16 +1857,15 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) } case nir_intrinsic_blend_return_pan: - /* Jump back to the fragment shader, return address is stored - * in r48 (see above). On Valhall, only jump if the address is - * nonzero. The check is free there and it implements the "jump - * to 0 terminates the blend shader" that's automatic on - * Bifrost. + /* Jump back to the fragment shader. On Valhall, only jump if the address + * is nonzero. The check is free there and it implements the "jump to 0 + * terminates the blend shader" that's automatic on Bifrost. 
*/ if (b->shader->arch >= 9) - bi_branchzi(b, bi_preload(b, 48), bi_preload(b, 48), BI_CMPF_NE); + bi_branchzi(b, bi_preload(b, BI_PRELOAD_BLEND_LINK), + bi_preload(b, BI_PRELOAD_BLEND_LINK), BI_CMPF_NE); else - bi_jump(b, bi_preload(b, 48)); + bi_jump(b, bi_preload(b, BI_PRELOAD_BLEND_LINK)); break; case nir_intrinsic_load_ubo: @@ -2008,7 +2023,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) case nir_intrinsic_load_pixel_coord: /* Vectorized load of the preloaded i16vec2 */ - bi_mov_i32_to(b, dst, bi_preload(b, 59)); + bi_mov_i32_to(b, dst, bi_preload(b, BI_PRELOAD_POSITION_XY)); break; case nir_intrinsic_load_texel_buf_conv_pan: @@ -2033,7 +2048,7 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_idvs_output_buf_index_pan: - bi_mov_i32_to(b, dst, bi_preload(b, 59)); + bi_mov_i32_to(b, dst, bi_preload(b, BI_PRELOAD_INTERNAL_ID)); break; case nir_intrinsic_lea_attr_pan: @@ -2067,8 +2082,9 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_sample_mask_in: - /* r61[0:15] contains the coverage bitmap */ - bi_u16_to_u32_to(b, dst, bi_half(bi_preload(b, 61), false)); + /* [0:15] contains the coverage bitmap */ + bi_u16_to_u32_to( + b, dst, bi_half(bi_preload(b, BI_PRELOAD_RASTERIZER_COVERAGE), false)); break; case nir_intrinsic_load_sample_mask: @@ -2080,12 +2096,12 @@ bi_emit_intrinsic(bi_builder *b, nir_intrinsic_instr *instr) break; case nir_intrinsic_load_primitive_id: - bi_mov_i32_to(b, dst, bi_preload(b, 57)); + bi_mov_i32_to(b, dst, bi_preload(b, BI_PRELOAD_PRIMITIVE_ID)); break; case nir_intrinsic_load_front_face: { - /* (r58 & 1) == 0 means primitive is front facing */ - bi_index primitive_facing = bi_preload(b, 58); + /* (primitive_flags & 1) == 0 means primitive is front facing */ + bi_index primitive_facing = bi_preload(b, BI_PRELOAD_PRIMITIVE_FLAGS); /* Starting with v11, there is more fields defined in the primitive flags */ if (b->shader->arch >= 11) 
/*
 * Symbolic names for every preloaded register value the compiler refers to,
 * grouped by the kind of shader that consumes them. Translate a name into a
 * register number with bi_preload_reg().
 */
enum bi_preload {
   /* Compute */
   BI_PRELOAD_LOCAL_ID_0,
   BI_PRELOAD_LOCAL_ID_1,
   BI_PRELOAD_LOCAL_ID_2,
   BI_PRELOAD_WORKGROUP_ID_0,
   BI_PRELOAD_WORKGROUP_ID_1,
   BI_PRELOAD_WORKGROUP_ID_2,
   BI_PRELOAD_GLOBAL_ID_0,
   BI_PRELOAD_GLOBAL_ID_1,
   BI_PRELOAD_GLOBAL_ID_2,

   /* Vertex */
   BI_PRELOAD_POS_RESULT_PTR_LO,
   BI_PRELOAD_POS_RESULT_PTR_HI,
   BI_PRELOAD_INTERNAL_ID,
   BI_PRELOAD_VERTEX_ID,
   BI_PRELOAD_INSTANCE_ID,
   BI_PRELOAD_DRAW_ID,
   BI_PRELOAD_VIEW_ID,

   /* Fragment */
   BI_PRELOAD_PRIMITIVE_ID,
   BI_PRELOAD_PRIMITIVE_FLAGS,
   BI_PRELOAD_POSITION_XY,
   BI_PRELOAD_CUMULATIVE_COVERAGE,
   BI_PRELOAD_RASTERIZER_COVERAGE,
   BI_PRELOAD_SAMPLE_ID,
   BI_PRELOAD_CENTROID_ID,
   BI_PRELOAD_FRAME_ARG,

   /* Blend */
   BI_PRELOAD_BLEND_SRC0_C0,
   BI_PRELOAD_BLEND_SRC0_C1,
   BI_PRELOAD_BLEND_SRC0_C2,
   BI_PRELOAD_BLEND_SRC0_C3,
   BI_PRELOAD_BLEND_SRC1_C0,
   BI_PRELOAD_BLEND_SRC1_C1,
   BI_PRELOAD_BLEND_SRC1_C2,
   BI_PRELOAD_BLEND_SRC1_C3,
   BI_PRELOAD_BLEND_LINK,
};

/*
 * Map a preload name to the number of the register that carries it.
 *
 * val:  which preloaded value is wanted.
 * arch: architecture major version; some values move between registers, or
 *       only exist, on one side of the arch 9 boundary (asserted below).
 *
 * Several names resolve to the same register because they occupy different
 * bit ranges of it; the per-case comments give the range where one applies.
 * The switch deliberately has no default so that a newly added enumerator
 * without a case is reported by the compiler.
 */
static inline unsigned
bi_preload_reg(enum bi_preload val, unsigned arch)
{
   switch (val) {
   /* Compute */
   case BI_PRELOAD_LOCAL_ID_0: /* bits [15;0] */
   case BI_PRELOAD_LOCAL_ID_1: /* bits [31;16] */
      return 55;
   case BI_PRELOAD_LOCAL_ID_2: /* bits [15;0] */
      return 56;
   case BI_PRELOAD_WORKGROUP_ID_0:
      return 57;
   case BI_PRELOAD_WORKGROUP_ID_1:
      return 58;
   case BI_PRELOAD_WORKGROUP_ID_2:
      return 59;
   case BI_PRELOAD_GLOBAL_ID_0:
      return 60;
   case BI_PRELOAD_GLOBAL_ID_1:
      return 61;
   case BI_PRELOAD_GLOBAL_ID_2:
      return 62;

   /* Vertex */
   case BI_PRELOAD_POS_RESULT_PTR_LO:
      assert(arch < 9);
      return 58;
   case BI_PRELOAD_POS_RESULT_PTR_HI:
      assert(arch < 9);
      return 59;
   case BI_PRELOAD_INTERNAL_ID:
      assert(arch >= 9);
      return 59;
   case BI_PRELOAD_VERTEX_ID:
      return (arch < 9) ? 61 : 60;
   case BI_PRELOAD_INSTANCE_ID:
      return (arch < 9) ? 62 : 61;
   case BI_PRELOAD_DRAW_ID:
      assert(arch >= 9);
      return 62;
   case BI_PRELOAD_VIEW_ID:
      assert(arch >= 9);
      return 63;

   /* Fragment */
   case BI_PRELOAD_PRIMITIVE_ID:
      return 57;
   case BI_PRELOAD_PRIMITIVE_FLAGS:
      return 58;
   case BI_PRELOAD_POSITION_XY:
      return 59;
   case BI_PRELOAD_CUMULATIVE_COVERAGE: /* bits [15;0] */
      return 60;
   case BI_PRELOAD_RASTERIZER_COVERAGE: /* bits [15;0] */
   case BI_PRELOAD_SAMPLE_ID:           /* bits [23;16] */
   case BI_PRELOAD_CENTROID_ID:         /* bits [31;24] */
      return 61;
   case BI_PRELOAD_FRAME_ARG: /* double reg */
      return 62;

   /* Blend */
   case BI_PRELOAD_BLEND_SRC0_C0:
      return 0;
   case BI_PRELOAD_BLEND_SRC0_C1:
      return 1;
   case BI_PRELOAD_BLEND_SRC0_C2:
      return 2;
   case BI_PRELOAD_BLEND_SRC0_C3:
      return 3;
   case BI_PRELOAD_BLEND_SRC1_C0:
      return 4;
   case BI_PRELOAD_BLEND_SRC1_C1:
      return 5;
   case BI_PRELOAD_BLEND_SRC1_C2:
      return 6;
   case BI_PRELOAD_BLEND_SRC1_C3:
      return 7;
   case BI_PRELOAD_BLEND_LINK:
      return 48;
   }

   UNREACHABLE("Non-handled BI_PRELOAD");
}