From 82a31a73395be97899a44e1eade710e544e40e27 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 6 May 2026 08:59:50 +0200 Subject: [PATCH 1/5] nir/lower_alu: fix lower_fminmax_signed_zero for denorms When both inputs are denorms, the bcsel picks the integer min/max result, which does not flush denorms and therefore might return the wrong result. Fixes OpenCL fmin/fmax on asahi. Fixes: d238d766c64 ("nir: add lower_fminmax_signed_zero") --- src/compiler/nir/nir_lower_alu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/src/compiler/nir/nir_lower_alu.c b/src/compiler/nir/nir_lower_alu.c index f22b789e7d5..7cad1e7f1a2 100644 --- a/src/compiler/nir/nir_lower_alu.c +++ b/src/compiler/nir/nir_lower_alu.c @@ -213,6 +213,7 @@ lower_alu_instr(nir_builder *b, nir_alu_instr *instr, UNUSED void *cb_data) /* Fallback on the emulation */ if (!lowered) { nir_def *iminmax = max ? nir_imax(b, s0, s1) : nir_imin(b, s0, s1); + iminmax = nir_fcanonicalize(b, iminmax); lowered = nir_bcsel(b, nir_feq(b, s0, s1), iminmax, fminmax); } From a5d80212f670bd41f31d09509b1fe98e56796d55 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 6 May 2026 12:04:43 +0200 Subject: [PATCH 2/5] asahi: fix dst range in buffer copy region agx_batch_writes_range takes the offset within the buffer, not the address Fixes: 4a3b905bb89 ("agx: move texture lowering into lib") --- src/gallium/drivers/asahi/agx_blit.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/gallium/drivers/asahi/agx_blit.c b/src/gallium/drivers/asahi/agx_blit.c index 12901f71241..5d770ef1ab7 100644 --- a/src/gallium/drivers/asahi/agx_blit.c +++ b/src/gallium/drivers/asahi/agx_blit.c @@ -619,11 +619,12 @@ agx_resource_copy_region(struct pipe_context *pctx, struct pipe_resource *dst, assert(dst->format == src->format); unsigned bs = util_format_get_blocksize(dst->format); unsigned size = bs * src_box->width; - uint64_t dst_addr = agx_map_gpu(agx_resource(dst)) + dstx * bs; + unsigned dst_offset = dstx * bs; + uint64_t dst_addr = agx_map_gpu(agx_resource(dst)) + dst_offset; uint64_t src_addr = agx_map_gpu(agx_resource(src)) + src_box->x * bs; agx_batch_reads(batch, agx_resource(src)); - agx_batch_writes_range(batch, agx_resource(dst), dst_addr, size); + agx_batch_writes_range(batch, agx_resource(dst), dst_offset, size); /* Use vectorized copies for as much of the buffer as possible. This requires * that dst, src, and size are all properly aligned. Failing to check for * alignment on the buffers causes subtle and hard-to-debug issues! From 44325c40a253395b463ebde85b288adbf691b430 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 6 May 2026 12:37:22 +0200 Subject: [PATCH 3/5] asahi: fix compute blitter for float16 image copies Fixes OpenCL CTS clCopyImage tests with CL_HALF_FLOAT --- src/gallium/drivers/asahi/agx_blit.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/gallium/drivers/asahi/agx_blit.c b/src/gallium/drivers/asahi/agx_blit.c index 5d770ef1ab7..1ea7a572abb 100644 --- a/src/gallium/drivers/asahi/agx_blit.c +++ b/src/gallium/drivers/asahi/agx_blit.c @@ -111,6 +111,13 @@ asahi_blit_compute_shader(struct pipe_context *ctx, struct asahi_blit_key *key) nir_iand(b, in_bounds, nir_ilt(b, logical_id_el_2d, dimensions_el_2d)); } + unsigned bit_size = 32; + nir_alu_type dst_type = nir_type_uint32; + if (util_format_is_float16(key->dst_format)) { + bit_size = 16; + dst_type = nir_type_float16; + } + nir_def *colour0, *colour1; nir_push_if(b, nir_ball(b, in_bounds)); { @@ -127,15 +134,15 @@ asahi_blit_compute_shader(struct pipe_context *ctx, struct asahi_blit_key *key) colour0 = nir_tex(b, coords_el_nd, .texture_index = 0, .sampler_index = 0, .backend_flags = AGX_TEXTURE_FLAG_NO_CLAMP, .dim = GLSL_SAMPLER_DIM_2D, .is_array = key->array, - .dest_type = nir_type_uint32); + .dest_type = dst_type); } nir_push_else(b, NULL); { /* For out-of-bounds pixels, copy in the destination */ colour1 = nir_image_load( - b, 4, 32, nir_imm_int(b, 0), nir_pad_vec4(b, image_pos_nd), zero, zero, + b, 4, bit_size, nir_imm_int(b, 0), nir_pad_vec4(b, image_pos_nd), zero, zero, .image_array = key->array, .image_dim = GLSL_SAMPLER_DIM_2D, - .access = ACCESS_IN_BOUNDS, .dest_type = nir_type_uint32); + .access = ACCESS_IN_BOUNDS, .dest_type = dst_type); } nir_pop_if(b, NULL); nir_def *color = nir_if_phi(b, colour0, colour1); From 28aa1d7fb80f17b19452bc717d35771d5e27754c Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 6 May 2026 17:11:58 +0200 Subject: [PATCH 4/5] asahi: move batch flushing into agx_launch_internal The CL CTS basic arraycopy test actually runs out of space there. --- src/gallium/drivers/asahi/agx_state.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index c913e022d94..c2784427e38 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -3003,7 +3003,6 @@ agx_launch_internal(struct agx_batch *batch, struct agx_grid grid, struct agx_context *ctx = batch->ctx; struct agx_device *dev = agx_device(ctx->base.screen); - /* TODO: Ensure space if we allow multiple kernels in a batch */ uint32_t *out = (uint32_t *)batch->cdm.current; out = agx_cdm_launch(out, dev->chip, grid, wg, launch, usc); @@ -3012,6 +3011,19 @@ agx_launch_internal(struct agx_batch *batch, struct agx_grid grid, batch->cdm.current = (void *)out; assert(batch->cdm.current <= batch->cdm.end && "Failed to reserve sufficient space in encoder"); + + /* If the next dispatch might overflow, flush now. TODO: If this is ever hit + * in practice, we can use CDM stream links. + */ + size_t dispatch_upper_bound = + AGX_CDM_LAUNCH_WORD_0_LENGTH + AGX_CDM_LAUNCH_WORD_1_LENGTH + + AGX_CDM_UNK_G14X_LENGTH + AGX_CDM_INDIRECT_LENGTH + + AGX_CDM_GLOBAL_SIZE_LENGTH + AGX_CDM_LOCAL_SIZE_LENGTH + + AGX_CDM_BARRIER_LENGTH; + + if (batch->cdm.current + dispatch_upper_bound >= batch->cdm.end) + agx_flush_batch_for_reason(ctx, batch, "CDM overfull"); + } void @@ -5408,18 +5420,6 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) agx_dirty_all(ctx); batch->uniforms.tables[AGX_SYSVAL_TABLE_GRID] = 0; - - /* If the next dispatch might overflow, flush now. TODO: If this is ever hit - * in practice, we can use CDM stream links. - */ - size_t dispatch_upper_bound = - AGX_CDM_LAUNCH_WORD_0_LENGTH + AGX_CDM_LAUNCH_WORD_1_LENGTH + - AGX_CDM_UNK_G14X_LENGTH + AGX_CDM_INDIRECT_LENGTH + - AGX_CDM_GLOBAL_SIZE_LENGTH + AGX_CDM_LOCAL_SIZE_LENGTH + - AGX_CDM_BARRIER_LENGTH; - - if (batch->cdm.current + dispatch_upper_bound >= batch->cdm.end) - agx_flush_batch_for_reason(ctx, batch, "CDM overfull"); } static void From a8e530766855db44d48e1126f049e098483df997 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Thu, 7 May 2026 16:33:49 +0200 Subject: [PATCH 5/5] asahi: fix fdiv lowering Fixes: d3adef31641 ("agx: defer nir_opt_idiv_const") --- src/asahi/compiler/agx_compile.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/asahi/compiler/agx_compile.c b/src/asahi/compiler/agx_compile.c index 8070e610700..e131bcf0721 100644 --- a/src/asahi/compiler/agx_compile.c +++ b/src/asahi/compiler/agx_compile.c @@ -3619,8 +3619,6 @@ agx_preprocess_nir(nir_shader *nir) nir_metadata_control_flow, NULL); NIR_PASS(_, nir, agx_nir_lower_subgroups); NIR_PASS(_, nir, nir_lower_all_phis_to_scalar); - NIR_PASS(_, nir, nir_shader_alu_pass, agx_nir_lower_fdiv, - nir_metadata_control_flow, NULL); /* After lowering, run through the standard suite of NIR optimizations. We * will run through the loop later, once we have the shader key, but if we @@ -3638,6 +3636,11 @@ agx_preprocess_nir(nir_shader *nir) }; NIR_PASS(_, nir, nir_lower_idiv, &idiv_options); + + /* Has to run after nir_lower_idiv */ + NIR_PASS(_, nir, nir_shader_alu_pass, agx_nir_lower_fdiv, + nir_metadata_control_flow, NULL); + NIR_PASS(_, nir, nir_opt_deref); NIR_PASS(_, nir, nir_lower_vars_to_ssa);