From 04f15cc71092f566cccc47ca29aead831bdfd634 Mon Sep 17 00:00:00 2001 From: Gert Wollny Date: Mon, 4 May 2026 23:26:47 +0200 Subject: [PATCH 1/4] r600/sfn: Add lowering of tess inner and outer default intrinsics These are UBO loads and so we do the lowering in nir. Signed-off-by: Gert Wollny --- src/gallium/drivers/r600/sfn/sfn_nir.cpp | 6 ++- .../drivers/r600/sfn/sfn_shader_tess.cpp | 39 +++++++++++++++++++ .../drivers/r600/sfn/sfn_shader_tess.h | 2 + 3 files changed, 46 insertions(+), 1 deletion(-) diff --git a/src/gallium/drivers/r600/sfn/sfn_nir.cpp b/src/gallium/drivers/r600/sfn/sfn_nir.cpp index 613204f4356..0a59584121d 100644 --- a/src/gallium/drivers/r600/sfn/sfn_nir.cpp +++ b/src/gallium/drivers/r600/sfn/sfn_nir.cpp @@ -22,6 +22,7 @@ #include "sfn_nir_lower_tex.h" #include "sfn_optimizer.h" #include "sfn_ra.h" +#include "sfn_shader_tess.h" #include "sfn_scheduler.h" #include "sfn_shader.h" #include "sfn_split_address_loads.h" @@ -869,8 +870,11 @@ r600_lower_and_optimize_nir(nir_shader *sh, NIR_PASS(_, sh, r600_lower_tess_io, static_cast(prim_type)); } - if (sh->info.stage == MESA_SHADER_TESS_CTRL) + if (sh->info.stage == MESA_SHADER_TESS_CTRL) { + NIR_PASS(_, sh, nir_lower_system_values); + NIR_PASS(_, sh, r600_lower_tess_level_default_to_ubo); NIR_PASS(_, sh, r600_append_tcs_TF_emission, (mesa_prim)key->tcs.prim_mode); + } if (sh->info.stage == MESA_SHADER_TESS_EVAL) { NIR_PASS(_, diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_tess.cpp b/src/gallium/drivers/r600/sfn/sfn_shader_tess.cpp index 16c7d0ccb0c..28f4f4a7c96 100644 --- a/src/gallium/drivers/r600/sfn/sfn_shader_tess.cpp +++ b/src/gallium/drivers/r600/sfn/sfn_shader_tess.cpp @@ -8,6 +8,8 @@ #include "sfn_instr_export.h" #include "sfn_shader_vs.h" +#include "sfn_nir.h" +#include "nir.h" #include @@ -245,4 +247,41 @@ TESShader::do_print_properties(std::ostream& os) const (void)os; } +class LowerTessLevelDefault : public NirLowerInstruction { + bool filter(const nir_instr *instr) const override 
+ { + if (instr->type != nir_instr_type_intrinsic) + return false; + + auto intr = nir_instr_as_intrinsic(instr); + return intr->intrinsic == nir_intrinsic_load_tess_level_inner_default || + intr->intrinsic == nir_intrinsic_load_tess_level_outer_default; + } + + nir_def *lower(nir_instr *instr) override + { + auto intr = nir_instr_as_intrinsic(instr); + + auto info_buffer = nir_imm_int(b, R600_BUFFER_INFO_CONST_BUFFER); + + switch (intr->intrinsic) { + case nir_intrinsic_load_tess_level_inner_default: + return nir_load_ubo(b, 2, 32, info_buffer, nir_imm_int(b, 16), + .range_base = 16, .range = 8); + case nir_intrinsic_load_tess_level_outer_default: + return nir_load_ubo(b, 4, 32, info_buffer, nir_imm_int(b, 0), + .range_base = 0, .range = 16); + default: + assert(0); + return nullptr; + } + } +}; + } // namespace r600 + +int r600_lower_tess_level_default_to_ubo(nir_shader *sh) +{ + return r600::LowerTessLevelDefault().run(sh); +} + diff --git a/src/gallium/drivers/r600/sfn/sfn_shader_tess.h b/src/gallium/drivers/r600/sfn/sfn_shader_tess.h index d932bf46c8e..62410ba69b2 100644 --- a/src/gallium/drivers/r600/sfn/sfn_shader_tess.h +++ b/src/gallium/drivers/r600/sfn/sfn_shader_tess.h @@ -83,4 +83,6 @@ private: } // namespace r600 +int r600_lower_tess_level_default_to_ubo(nir_shader *sh); + #endif // TCS_H From 1278a547f591a606a0d341df87652c34be7431db Mon Sep 17 00:00:00 2001 From: Gert Wollny Date: Mon, 4 May 2026 23:31:21 +0200 Subject: [PATCH 2/4] r600: replace TGSI TCS passthrough with NIR version We don't actually need to copy the vertex attributes because if no TCS shader was given by the user TES simply is pointed to the VS output in LDS that has the same layout the TCS shader would provide. 
v2: with the lowering of the relevant intrinsics in place use nir_create_passthrough_tcs_impl to create the passthrough shader (like suggested by Mareco and Emma) Signed-off-by: Gert Wollny --- src/gallium/drivers/r600/r600_state_common.c | 43 ++++++++------------ 1 file changed, 18 insertions(+), 25 deletions(-) diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index 74ba282bdab..a58796cfae0 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -19,9 +19,9 @@ #include "util/u_math.h" #include "tgsi/tgsi_parse.h" #include "tgsi/tgsi_scan.h" -#include "tgsi/tgsi_ureg.h" #include "nir.h" +#include "nir_builder.h" #include "nir/nir_to_tgsi_info.h" void r600_init_command_buffer(struct r600_command_buffer *cb, unsigned num_dw) @@ -1679,32 +1679,25 @@ static void r600_update_clip_state(struct r600_context *rctx, } } -static void r600_generate_fixed_func_tcs(struct r600_context *rctx) +/* The TCS passthrough shader only writes the tessellation levels, + * the IO doesn't need to be copied over, because TES gets handed the + * location of the VS outputs directly if this shader is used + * (see evergreen_setup_tess_constants) +*/ + +static struct r600_pipe_shader_selector * +r600_create_fixed_func_tcs_nir(struct r600_context *rctx) { - struct ureg_src const0, const1; - struct ureg_dst tessouter, tessinner; - struct ureg_program *ureg = ureg_create(MESA_SHADER_TESS_CTRL); + const struct nir_shader_compiler_options *options = + rctx->screen->b.b.nir_options[MESA_SHADER_TESS_CTRL]; - if (!ureg) - return; /* if we get here, we're screwed */ + struct pipe_shader_state state = { + .type = PIPE_SHADER_IR_NIR, + .ir.nir = nir_create_passthrough_tcs_impl(options, NULL, 0, 0) + }; - assert(!rctx->fixed_func_tcs_shader); - - ureg_DECL_constant2D(ureg, 0, 1, R600_BUFFER_INFO_CONST_BUFFER); - const0 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 0), - 
R600_BUFFER_INFO_CONST_BUFFER); - const1 = ureg_src_dimension(ureg_src_register(TGSI_FILE_CONSTANT, 1), - R600_BUFFER_INFO_CONST_BUFFER); - - tessouter = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSOUTER, 0); - tessinner = ureg_DECL_output(ureg, TGSI_SEMANTIC_TESSINNER, 0); - - ureg_MOV(ureg, tessouter, const0); - ureg_MOV(ureg, tessinner, const1); - ureg_END(ureg); - - rctx->fixed_func_tcs_shader = - ureg_create_shader_and_destroy(ureg, &rctx->b.b); + return (struct r600_pipe_shader_selector *) + rctx->b.b.create_tcs_state(&rctx->b.b, &state); } void r600_update_compressed_resource_state(struct r600_context *rctx, bool compute_only) @@ -1919,7 +1912,7 @@ static bool r600_update_derived_state(struct r600_context *rctx) UPDATE_SHADER(EG_HW_STAGE_HS, tcs); } else if (rctx->tes_shader) { if (!rctx->fixed_func_tcs_shader) { - r600_generate_fixed_func_tcs(rctx); + rctx->fixed_func_tcs_shader = r600_create_fixed_func_tcs_nir(rctx); if (!rctx->fixed_func_tcs_shader) return false; From 7bdc3d74d31b5f3ab626da19d4414c18a93ab182 Mon Sep 17 00:00:00 2001 From: Gert Wollny Date: Wed, 6 May 2026 00:24:27 +0200 Subject: [PATCH 3/4] r600/sfn: lower iadd3 to iadd(iadd) Just to make the query shader simpler to look at. 
Signed-off-by: Gert Wollny --- src/gallium/drivers/r600/sfn/sfn_nir_algebraic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/gallium/drivers/r600/sfn/sfn_nir_algebraic.py b/src/gallium/drivers/r600/sfn/sfn_nir_algebraic.py index efb3f16c0d5..0024863d9b7 100644 --- a/src/gallium/drivers/r600/sfn/sfn_nir_algebraic.py +++ b/src/gallium/drivers/r600/sfn/sfn_nir_algebraic.py @@ -49,6 +49,8 @@ lower_alu = [ (('seq', ('fadd', 'a', 'b'), 0.0), ('seq', 'a', ('fneg', 'b'))), (('sne', ('fadd', 'a', 'b'), 0.0), ('sne', 'a', ('fneg', 'b'))), + + (('iadd3', 'a', 'b', 'c'), ('iadd', 'a', ('iadd', 'b', 'c'))), ] From 6d10e79c265ac3c14a496ed3dff780655e7af392 Mon Sep 17 00:00:00 2001 From: Gert Wollny Date: Fri, 1 May 2026 18:55:27 +0200 Subject: [PATCH 4/4] r600: replace TGSI query shader with nir The commit was created with assistance from GitHub Copilot. v2: - remove a few useless helpers - rename some variables - use some more nir with immediate codes (Emma) Signed-off-by: Gert Wollny --- src/gallium/drivers/r600/r600_query.c | 482 ++++++++++++++++---------- 1 file changed, 305 insertions(+), 177 deletions(-) diff --git a/src/gallium/drivers/r600/r600_query.c b/src/gallium/drivers/r600/r600_query.c index 758050caade..5d95edc0bf6 100644 --- a/src/gallium/drivers/r600/r600_query.c +++ b/src/gallium/drivers/r600/r600_query.c @@ -10,7 +10,7 @@ #include "util/u_memory.h" #include "util/u_upload_mgr.h" #include "util/os_time.h" -#include "tgsi/tgsi_text.h" +#include "nir/nir_builder.h" #define R600_MAX_STREAMS 4 @@ -1353,6 +1353,253 @@ bool r600_query_hw_get_result(struct r600_common_context *rctx, return true; } +enum r600_query_result_shader_config { + R600_QUERY_READ_PREV = 1, + R600_QUERY_WRITE_CHAIN = 2, + R600_QUERY_WRITE_AVAIL = 4, + R600_QUERY_BOOL = 8, + R600_QUERY_ONE_DWORD = 16, + R600_QUERY_TIMESTAMP = 32, + R600_QUERY_RESULT64 = 64, + R600_QUERY_SIGNED32 = 128, + R600_QUERY_SO_OVERFLOW = 256, +}; + +struct r600_query_result_accum_vars { + nir_variable *accum; + 
nir_variable *result_available; +}; + +#define QUERY_RESULT_BUF nir_imm_int(b, 0) +#define QUERY_PREV_SUMMARY_BUF nir_imm_int(b, 1) +#define NEXT_SUMMARY_OR_USER_BUF nir_imm_int(b, 2) + +static void r600_query_result_load_one_dword(nir_builder *b, nir_def *fence_offset, + nir_def *result_offset, + const struct r600_query_result_accum_vars *vars) +{ + nir_def *fence_value = nir_load_ssbo(b, 1, 32, QUERY_RESULT_BUF, + nir_iadd(b, fence_offset, result_offset)); + nir_def *fence_mask = nir_inot(b, nir_ishr_imm(b, fence_value, 31)); + nir_store_var(b, vars->result_available, fence_mask, 0x1); + + nir_if *if_fence_set = nir_push_if(b, nir_ilt_imm(b, fence_value, 0)); { + nir_def *result = nir_load_ssbo(b, 2, 32, QUERY_RESULT_BUF, result_offset); + nir_store_var(b, vars->accum, nir_channels(b, result, 0x3), 0x3); + } + nir_pop_if(b, if_fence_set); +} + +static void r600_query_result_accumulate_pairs(nir_builder *b, nir_function_impl *impl, + nir_def *config_bitfield, + nir_def *result_count, + nir_def *result_stride, + nir_def *fence_offset, + nir_def *pair_stride, + nir_def *pair_count, + nir_def *end_offset, + nir_def *result_offset, + const struct r600_query_result_accum_vars *vars) +{ + nir_def *_0i = nir_imm_int(b, 0); + + nir_store_var(b, vars->accum, nir_vec2(b, _0i, _0i), 0x3); + + nir_def *cfg_prev_acc_result = nir_test_mask(b, config_bitfield, R600_QUERY_READ_PREV); + nir_if *if_prev_acc_result = nir_push_if(b, cfg_prev_acc_result); { + nir_def *result = nir_load_ssbo(b, 3, 32, QUERY_PREV_SUMMARY_BUF, _0i); + nir_store_var(b, vars->accum, nir_channels(b, result, 0x3), 0x3); + nir_store_var(b, vars->result_available, nir_channel(b, result, 2), 0x1); + } + nir_pop_if(b, if_prev_acc_result); + + nir_def *availability_mask_value = nir_load_var(b, vars->result_available); + nir_if *if_not_z = nir_push_if(b, nir_ieq(b, availability_mask_value, _0i)); { + + /* Accumulate query result pairs across result buffers: + * for (result_index = 0; result_index < result_count; 
++result_index) { + * fence = load_fence(result_index) + * if (!fence.valid) break // Stop when we hit an invalid fence + * for (pair_index = 0; pair_index < pair_count; ++pair_index) { + * pair_start = load64(result_offset + pair_index * pair_stride) + * pair_end = load64(result_offset + pair_index * pair_stride + end_offset) + * accumulator += (pair_end - pair_start) + * if (SO_OVERFLOW_mode) { + * fence_start = load64(...) + * fence_end = load64(...) + * accumulator -= (fence_end - fence_start) + * } + * } + * } + */ + + nir_variable *outer_loop_iter = + nir_local_variable_create(impl, glsl_uint_type(), "outer_loop_iter"); + nir_store_var(b, outer_loop_iter, _0i, 0x1); + nir_loop *loop_outer = nir_push_loop(b); { + nir_def *result_index = nir_load_var(b, outer_loop_iter); + + nir_break_if(b, nir_uge(b, result_index, result_count)); + + nir_def *pitch_outer_loop = nir_imul(b, result_index, result_stride); + nir_def *address = nir_iadd3(b, pitch_outer_loop, + fence_offset, + result_offset); + nir_def *value = nir_load_ssbo(b, 1, 32, QUERY_RESULT_BUF, address); + nir_def *bitmask = nir_inot(b, nir_ishr_imm(b, value, 31)); + nir_store_var(b, vars->result_available, bitmask, 0x1); + + nir_break_if(b, nir_i2b(b, bitmask)); + + nir_variable *inner_loop_iter = + nir_local_variable_create(impl, glsl_uint_type(), "inner_loop_iter"); + nir_store_var(b, inner_loop_iter, _0i, 0x1); + + nir_loop *loop_inner = nir_push_loop(b); { + nir_def *i = nir_load_var(b, inner_loop_iter); + nir_def *pitch_inner_loop = + nir_iadd(b, + nir_imul(b, i, pair_stride), + pitch_outer_loop); + + nir_def *base = + nir_iadd(b, pitch_inner_loop, result_offset); + nir_def *first = nir_load_ssbo(b, 1, 64, QUERY_RESULT_BUF, base); + nir_def *new_pitch = + nir_iadd(b, pitch_inner_loop, end_offset); + nir_def *second = nir_load_ssbo(b, 1, 64, QUERY_RESULT_BUF, + nir_iadd(b, new_pitch, result_offset)); + nir_def *start_half_pair = nir_isub(b, second, first); + + nir_def *difference; + nir_def 
*cfg_so_overflow_mode = nir_test_mask(b, config_bitfield, R600_QUERY_SO_OVERFLOW); + nir_if *if_so_overflow_mode = nir_push_if(b, cfg_so_overflow_mode); { + first = nir_load_ssbo(b, 1, 64, QUERY_RESULT_BUF, + nir_iadd_imm(b, base, 8)); + second = nir_load_ssbo(b, 1, 64, QUERY_RESULT_BUF, + nir_iadd_imm(b, + nir_iadd(b, new_pitch, result_offset), + 8)); + nir_def *end_half_pair = nir_isub(b, second, first); + difference = nir_isub(b, start_half_pair, end_half_pair); + } + nir_pop_if(b, if_so_overflow_mode); + + nir_def *sum = nir_iadd(b, nir_pack_64_2x32(b, nir_load_var(b, vars->accum)), + nir_if_phi(b, difference, start_half_pair)); + sum = nir_unpack_64_2x32(b, sum); + + i = nir_iadd_imm(b, i, 1); + nir_store_var(b, inner_loop_iter, i, 0x1); + + nir_store_var(b, vars->accum, sum, 0x3); + + nir_def *is_pair_count_exceeded = nir_uge(b, i, pair_count); + nir_break_if(b, is_pair_count_exceeded); + } + nir_pop_loop(b, loop_inner); + + nir_store_var(b, outer_loop_iter, nir_iadd_imm(b, result_index, 1), 0x1); + } + nir_pop_loop(b, loop_outer); + } + nir_pop_if(b, if_not_z); +} + +static void r600_query_result_store_output(nir_builder *b, + struct r600_common_context *rctx, + nir_def *config_bitfield, + nir_def *result_offset, + const struct r600_query_result_accum_vars *vars) +{ + nir_def *_0i = nir_imm_int(b, 0); + nir_def *accum_value = nir_load_var(b, vars->accum); + nir_def *availability_value = nir_load_var(b, vars->result_available); + + /* Store query results to output buffer based on configuration: + * if (write_chain) { + * // Store accumulated values for chaining to next grid invocation + * output = (accum_lo, accum_hi, availability_mask) + * } else if (write_available) { + * // Store query availability flag (!fence_valid) and optional 64-bit padding + * output = (fence_valid ? 
0 : 1) + * if (is_64bit) output += padding_word + * } else { + * // Store actual query result with optional conversions + * if (apply_timestamp) + * result = convert_to_timestamp(x, y) + * if (convert_to_bool) + * result = (result != 0) ? 1 : 0 + * if (is_64bit) { + * output = (result_low, result_high) + * } else { + * output = result_low (clamped for signed mode) + * } + * } + */ + nir_def *is_acc_chaining = nir_test_mask(b, config_bitfield, R600_QUERY_WRITE_CHAIN); + nir_if *if_acc_chaining = nir_push_if(b, is_acc_chaining); { + nir_store_ssbo(b, nir_vec3(b, nir_channel(b, accum_value, 0), nir_channel(b, accum_value, 1), availability_value), NEXT_SUMMARY_OR_USER_BUF, + result_offset); + } nir_push_else(b, if_acc_chaining); { + nir_def *cfg_write_result_available = nir_test_mask(b, config_bitfield, R600_QUERY_WRITE_AVAIL); + nir_if *if_write_result_available = nir_push_if(b, cfg_write_result_available); { + nir_store_ssbo(b, + nir_iand_imm(b, nir_inot(b, availability_value), 1), + NEXT_SUMMARY_OR_USER_BUF, result_offset); + + nir_def *is_result_64_bits = nir_test_mask(b, config_bitfield, R600_QUERY_RESULT64); + nir_if *if_result_64_bits = nir_push_if(b, is_result_64_bits); { + nir_store_ssbo(b, _0i, NEXT_SUMMARY_OR_USER_BUF, nir_iadd_imm(b, result_offset, 4)); + } + nir_pop_if(b, if_result_64_bits); + } nir_push_else(b, if_write_result_available); { + nir_def *result_not_written = nir_i2b(b, nir_inot(b, availability_value)); + nir_if *if_result_not_written = nir_push_if(b, result_not_written); { + nir_def *ts; + + nir_def *cfg_apply_timestamp = nir_test_mask(b, config_bitfield, R600_QUERY_TIMESTAMP); + nir_if *if_apply_timestamp = nir_push_if(b, cfg_apply_timestamp); { + nir_def *clock_crystal_frequency = + nir_imm_int64(b, rctx->screen->info.clock_crystal_freq); + nir_def *xy_million = nir_imul(b, + nir_pack_64_2x32(b, accum_value), + nir_imm_int64(b, 1000000)); + nir_def *ts_converted = nir_udiv(b, xy_million, + clock_crystal_frequency); + ts = 
nir_unpack_64_2x32(b, ts_converted); + } + nir_pop_if(b, if_apply_timestamp); + + nir_def *n = nir_if_phi(b, ts, accum_value); + nir_def *nx = nir_channel(b,n, 0); + nir_def *ny = nir_channel(b,n, 1); + + nir_def *cfg_convert_to_bool = nir_test_mask(b, config_bitfield, R600_QUERY_BOOL); + nir_def *is_nonzero = nir_b2i32(b, nir_ine(b, nir_ior(b, nx, ny), _0i)); + nx = nir_bcsel(b, cfg_convert_to_bool, is_nonzero, nx); + ny = nir_bcsel(b, cfg_convert_to_bool, _0i, ny); + + nir_def *cfg_result_64_bits = nir_test_mask(b, config_bitfield, R600_QUERY_RESULT64); + nir_if *if_result_64_bits = nir_push_if(b, cfg_result_64_bits); { + nir_store_ssbo(b, nir_vec2(b, nx, ny), NEXT_SUMMARY_OR_USER_BUF, result_offset); + } nir_push_else(b, if_result_64_bits); { + nir_def *is_y = nir_ine(b, ny, _0i); + nx = nir_bcsel(b, is_y, nir_imm_int(b, UINT32_MAX), nx); + nir_def *cfg_signed_32bit_result = nir_test_mask(b, config_bitfield, R600_QUERY_SIGNED32); + nir_def *min = nir_umin(b, nx, nir_imm_int(b, INT_MAX)); + nx = nir_bcsel(b, cfg_signed_32bit_result, min, nx); + nir_store_ssbo(b, nx, NEXT_SUMMARY_OR_USER_BUF, result_offset); + } + nir_pop_if(b, if_result_64_bits); + } + nir_pop_if(b, if_result_not_written); + } + nir_pop_if(b, if_write_result_available); + } + nir_pop_if(b, if_acc_chaining); +} + /* Create the compute shader that is used to collect the results. 
* * One compute grid with a single thread is launched for every query result @@ -1389,188 +1636,68 @@ bool r600_query_hw_get_result(struct r600_common_context *rctx, */ static void r600_create_query_result_shader(struct r600_common_context *rctx) { - /* TEMP[0].xy = accumulated result so far - * TEMP[0].z = result not available - * - * TEMP[1].x = current result index - * TEMP[1].y = current pair index - */ - static const char text_tmpl[] = - "COMP\n" - "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n" - "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n" - "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n" - "DCL BUFFER[0]\n" - "DCL BUFFER[1]\n" - "DCL BUFFER[2]\n" - "DCL CONST[0][0..2]\n" - "DCL TEMP[0..5]\n" - "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n" - "IMM[1] UINT32 {1, 2, 4, 8}\n" - "IMM[2] UINT32 {16, 32, 64, 128}\n" - "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */ - "IMM[4] UINT32 {256, 0, 0, 0}\n" - - "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n" - "UIF TEMP[5]\n" - /* Check result availability. */ - "UADD TEMP[1].x, CONST[0][1].xxxx, CONST[0][2].xxxx\n" - "LOAD TEMP[1].x, BUFFER[0], TEMP[1].xxxx\n" - "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n" - "MOV TEMP[1], TEMP[0].zzzz\n" - "NOT TEMP[0].z, TEMP[0].zzzz\n" - - /* Load result if available. */ - "UIF TEMP[1]\n" - "UADD TEMP[0].x, IMM[0].xxxx, CONST[0][2].xxxx\n" - "LOAD TEMP[0].xy, BUFFER[0], TEMP[0].xxxx\n" - "ENDIF\n" - "ELSE\n" - /* Load previously accumulated result if requested. */ - "MOV TEMP[0], IMM[0].xxxx\n" - "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n" - "UIF TEMP[4]\n" - "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n" - "ENDIF\n" - - "MOV TEMP[1].x, IMM[0].xxxx\n" - "BGNLOOP\n" - /* Break if accumulated result so far is not available. */ - "UIF TEMP[0].zzzz\n" - "BRK\n" - "ENDIF\n" - - /* Break if result_index >= result_count. 
*/ - "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n" - "UIF TEMP[5]\n" - "BRK\n" - "ENDIF\n" - - /* Load fence and check result availability */ - "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n" - "UADD TEMP[5].x, TEMP[5].xxxx, CONST[0][2].xxxx\n" - "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n" - "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n" - "NOT TEMP[0].z, TEMP[0].zzzz\n" - "UIF TEMP[0].zzzz\n" - "BRK\n" - "ENDIF\n" - - "MOV TEMP[1].y, IMM[0].xxxx\n" - "BGNLOOP\n" - /* Load start and end. */ - "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n" - "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n" - "UADD TEMP[5].x, TEMP[5].xxxx, CONST[0][2].xxxx\n" - "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" - - "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n" - "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" - - "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n" - - "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n" - "UIF TEMP[5].zzzz\n" - /* Load second start/end half-pair and - * take the difference - */ - "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n" - "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n" - "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n" - - "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n" - "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n" - "ENDIF\n" - - "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n" - - /* Increment pair index */ - "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n" - "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n" - "UIF TEMP[5]\n" - "BRK\n" - "ENDIF\n" - "ENDLOOP\n" - - /* Increment result index */ - "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n" - "ENDLOOP\n" - "ENDIF\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n" - "UIF TEMP[4]\n" - /* Store accumulated data for chaining. */ - "STORE BUFFER[2].xyz, CONST[0][1].wwww, TEMP[0]\n" - "ELSE\n" - "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n" - "UIF TEMP[4]\n" - /* Store result availability. 
*/ - "NOT TEMP[0].z, TEMP[0]\n" - "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n" - "STORE BUFFER[2].x, CONST[0][1].wwww, TEMP[0].zzzz\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" - "UIF TEMP[4]\n" - "STORE BUFFER[2].y, CONST[0][1].wwww, IMM[0].xxxx\n" - "ENDIF\n" - "ELSE\n" - /* Store result if it is available. */ - "NOT TEMP[4], TEMP[0].zzzz\n" - "UIF TEMP[4]\n" - /* Apply timestamp conversion */ - "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n" - "UIF TEMP[4]\n" - "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n" - "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n" - "ENDIF\n" - - /* Convert to boolean */ - "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n" - "UIF TEMP[4]\n" - "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n" - "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n" - "MOV TEMP[0].y, IMM[0].xxxx\n" - "ENDIF\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n" - "UIF TEMP[4]\n" - "STORE BUFFER[2].xy, CONST[0][1].wwww, TEMP[0].xyxy\n" - "ELSE\n" - /* Clamping */ - "UIF TEMP[0].yyyy\n" - "MOV TEMP[0].x, IMM[0].wwww\n" - "ENDIF\n" - - "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n" - "UIF TEMP[4]\n" - "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n" - "ENDIF\n" - - "STORE BUFFER[2].x, CONST[0][1].wwww, TEMP[0].xxxx\n" - "ENDIF\n" - "ENDIF\n" - "ENDIF\n" - "ENDIF\n" - - "END\n"; - - char text[sizeof(text_tmpl) + 32]; - struct tgsi_token tokens[1024]; + const struct nir_shader_compiler_options *options = + rctx->b.screen->nir_options[MESA_SHADER_COMPUTE]; + nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, + "r600_create_query_result_cs"); struct pipe_compute_state state = {}; - /* Hard code the frequency into the shader so that the backend can - * use the full range of optimizations for divide-by-constant. 
+ b.shader->info.workgroup_size[0] = 1; + b.shader->info.workgroup_size[1] = 1; + b.shader->info.workgroup_size[2] = 1; + b.shader->info.num_ubos = 1; + b.shader->info.num_ssbos = 3; + b.shader->num_uniforms = 3; + + /* uint32_t accum_lo, accum_hi, availability_mask = 0; */ + nir_function_impl *impl = nir_shader_get_entrypoint(b.shader); + nir_variable *accum = nir_local_variable_create(impl, glsl_ivec2_type(), "accum"); + nir_variable *availability_mask = nir_local_variable_create(impl, glsl_uint_type(), "availability_mask"); + nir_store_var(&b, availability_mask, nir_imm_int(&b, 0), 0x1); + const struct r600_query_result_accum_vars accum_vars = {accum, availability_mask}; + + /* consts[0] = {end_offset, result_stride, result_count, config} + * consts[1] = {fence_offset, pair_stride, pair_count, buffer_offset} + * consts[2] = {buffer0_offset, 0, 0, 0} */ - snprintf(text, sizeof(text), text_tmpl, - rctx->screen->info.clock_crystal_freq); + nir_def *default_ubo = nir_imm_int(&b, 0); + nir_def *config_0 = nir_load_ubo(&b, 4, 32, default_ubo, nir_imm_int(&b, 0), .range_base = 0, .range = 16); + nir_def *config_1 = nir_load_ubo(&b, 4, 32, default_ubo, nir_imm_int(&b, 16), .range_base = 16, .range = 16); + nir_def *config_2 = nir_load_ubo(&b, 4, 32, default_ubo, nir_imm_int(&b, 32), .range_base = 32, .range = 16); - if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) { - assert(false); - return; + /* Decode UBO payload into named fields to make the shader logic readable. 
*/ + nir_def *end_offset = nir_channel(&b, config_0, 0); + nir_def *result_stride = nir_channel(&b, config_0, 1); + nir_def *result_count = nir_channel(&b, config_0, 2); + nir_def *confifg_bitfield = nir_channel(&b, config_0, 3); + nir_def *fence_offset = nir_channel(&b, config_1, 0); + nir_def *pair_stride = nir_channel(&b, config_1, 1); + nir_def *pair_count = nir_channel(&b, config_1, 2); + nir_def *result_offset = nir_channel(&b, config_1, 3); + nir_def *buffer0_offset = nir_channel(&b, config_2, 0); + + /* Check result availability for timestamp queries. */ + nir_def *is_one_dword_result = + nir_test_mask(&b, confifg_bitfield, R600_QUERY_ONE_DWORD); + nir_if *if_one_dword_result = nir_push_if(&b, is_one_dword_result); { + r600_query_result_load_one_dword(&b, fence_offset, buffer0_offset, &accum_vars); + } nir_push_else(&b, if_one_dword_result); { + r600_query_result_accumulate_pairs(&b, impl, confifg_bitfield, result_count, + result_stride, fence_offset, + pair_stride, pair_count, + end_offset, buffer0_offset, + &accum_vars); } + nir_pop_if(&b, if_one_dword_result); + + r600_query_result_store_output(&b, rctx, confifg_bitfield, result_offset, &accum_vars); + + rctx->b.screen->finalize_nir(rctx->b.screen, b.shader, true); + + state.ir_type = PIPE_SHADER_IR_NIR; + state.prog = b.shader; + - state.ir_type = PIPE_SHADER_IR_TGSI; - state.prog = tokens; rctx->query_result_shader = rctx->b.create_compute_state(&rctx->b, &state); } @@ -1613,6 +1740,7 @@ static void r600_query_hw_get_result_resource(struct r600_common_context *rctx, uint32_t pair_count; uint32_t buffer_offset; uint32_t buffer0_offset; + uint32_t pad[3]; } consts; if (!rctx->query_result_shader) {