radeonsi: convert "gfx11_create_sh_query_result_cs" shader to nir

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25972>
This commit is contained in:
Ganesh Belgur Ramachandra 2023-11-21 02:55:54 -06:00 committed by Marge Bot
parent c109c3f95c
commit f119f34742
4 changed files with 252 additions and 231 deletions

View file

@ -74,7 +74,6 @@ files_libradeonsi = files(
'si_shader_internal.h',
'si_shader_nir.c',
'si_shaderlib_nir.c',
'si_shaderlib_tgsi.c',
'si_sqtt.c',
'si_state.c',
'si_state.h',

View file

@ -1674,8 +1674,6 @@ void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_t
void *si_clear_12bytes_buffer_shader(struct si_context *sctx);
void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, bool is_array);
void *si_create_query_result_cs(struct si_context *sctx);
/* si_shaderlib_tgsi.c */
void *gfx11_create_sh_query_result_cs(struct si_context *sctx);
/* gfx11_query.c */

View file

@ -8,6 +8,7 @@
#define AC_SURFACE_INCLUDE_NIR
#include "ac_surface.h"
#include "si_pipe.h"
#include "si_query.h"
#include "nir_format_convert.h"
@ -1253,3 +1254,254 @@ void *si_create_query_result_cs(struct si_context *sctx)
return create_shader_state(sctx, b.shader);
}
/* Create the compute shader that is used to collect the results of gfx10+
* shader queries.
*
* One compute grid with a single thread is launched for every query result
* buffer. The thread (optionally) reads a previous summary buffer, then
* accumulates data from the query result buffer, and writes the result either
* to a summary buffer to be consumed by the next grid invocation or to the
* user-supplied buffer.
*
* Data layout:
*
* CONST
* 0.x = config;
* [0:2] the low 3 bits indicate the mode:
* 0: sum up counts
* 1: determine result availability and write it as a boolean
* 2: SO_OVERFLOW
* 3: SO_ANY_OVERFLOW
* the remaining bits form a bitfield:
* 8: write result as a 64-bit value
* 0.y = offset in bytes to counts or stream for SO_OVERFLOW mode
* 0.z = chain bit field:
* 1: have previous summary buffer
* 2: write next summary buffer
* 0.w = result_count
*/
void *gfx11_create_sh_query_result_cs(struct si_context *sctx)
{
const nir_shader_compiler_options *options =
sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
nir_builder b =
nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "gfx11_create_sh_query_result_cs");
b.shader->info.workgroup_size[0] = 1;
b.shader->info.workgroup_size[1] = 1;
b.shader->info.workgroup_size[2] = 1;
b.shader->info.num_ubos = 1;
b.shader->info.num_ssbos = 3;
b.shader->num_uniforms = 1;
nir_def *zero = nir_imm_int(&b, 0);
nir_def *one = nir_imm_int(&b, 1);
nir_def *two = nir_imm_int(&b, 2);
nir_def *four = nir_imm_int(&b, 4);
nir_def *minus_one = nir_imm_int(&b, 0xffffffff);
/* uint32_t acc_result = 0, acc_missing = 0; */
nir_function_impl *e = nir_shader_get_entrypoint(b.shader);
nir_variable *acc_result = nir_local_variable_create(e, glsl_uint_type(), "acc_result");
nir_store_var(&b, acc_result, zero, 0x1);
nir_variable *acc_missing = nir_local_variable_create(e, glsl_uint_type(), "acc_missing");
nir_store_var(&b, acc_missing, zero, 0x1);
/* uint32_t buff_0[4] = load_ubo(0, 0); */
nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);
/* if((chain & 1) {
* uint32_t result[2] = load_ssbo(1, 0);
* acc_result = result[0];
* acc_missing = result[1];
* }
*/
nir_def *is_prev_summary_buffer = nir_i2b(&b, nir_iand(&b, nir_channel(&b, buff_0, 2), one));
nir_if *if_prev_summary_buffer = nir_push_if(&b, is_prev_summary_buffer); {
nir_def *result = nir_load_ssbo(&b, 2, 32, one, zero);
nir_store_var(&b, acc_result, nir_channel(&b, result, 0), 0x1);
nir_store_var(&b, acc_missing, nir_channel(&b, result, 1), 0x1);
}
nir_pop_if(&b, if_prev_summary_buffer);
/* uint32_t mode = config & 0b111;
* bool is_overflow = mode >= 2;
*/
nir_def *mode = nir_iand_imm(&b, nir_channel(&b, buff_0, 0), 0b111);
nir_def *is_overflow = nir_uge(&b, mode, two);
/* uint32_t result_remaining = (is_overflow && acc_result) ? 0 : result_count; */
nir_variable *result_remaining = nir_local_variable_create(e, glsl_uint_type(), "result_remaining");
nir_variable *base_offset = nir_local_variable_create(e, glsl_uint_type(), "base_offset");
nir_def *state = nir_iand(&b,
nir_isub(&b, zero, nir_b2i32(&b, is_overflow)),
nir_load_var(&b, acc_result));
nir_def *value = nir_bcsel(&b, nir_i2b(&b, state), zero, nir_channel(&b, buff_0, 3));
nir_store_var(&b, result_remaining, value, 0x1);
/* uint32_t base_offset = 0; */
nir_store_var(&b, base_offset, zero, 0x1);
/* Outer loop begin.
* while (!result_remaining) {
* ...
*/
nir_loop *loop_outer = nir_push_loop(&b); {
nir_def *condition = nir_load_var(&b, result_remaining);
nir_if *if_not_condition = nir_push_if(&b, nir_ieq(&b, condition, zero)); {
nir_jump(&b, nir_jump_break);
}
nir_pop_if(&b, if_not_condition);
/* result_remaining--; */
condition = nir_iadd(&b, condition, minus_one);
nir_store_var(&b, result_remaining, condition, 0x1);
/* uint32_t fence = load_ssbo(0, base_offset + sizeof(gfx11_sh_query_buffer_mem.stream)); */
nir_def *b_offset = nir_load_var(&b, base_offset);
uint64_t buffer_mem_stream_size = sizeof(((struct gfx11_sh_query_buffer_mem*)0)->stream);
nir_def *fence = nir_load_ssbo(&b, 1, 32, zero,
nir_iadd_imm(&b, b_offset, buffer_mem_stream_size));
/* if (!fence) {
* acc_missing = ~0u;
* break;
* }
*/
nir_def *is_zero = nir_ieq(&b, fence, zero);
nir_def *y_value = nir_isub(&b, zero, nir_b2i32(&b, is_zero));
nir_store_var(&b, acc_missing, y_value, 0x1);
nir_if *if_ssbo_zero = nir_push_if(&b, is_zero); {
nir_jump(&b, nir_jump_break);
}
nir_pop_if(&b, if_ssbo_zero);
/* stream_offset = base_offset + offset; */
nir_def *s_offset = nir_iadd(&b, b_offset, nir_channel(&b, buff_0, 1));
/* if (!(config & 7)) {
* acc_result += buffer[0]@stream_offset;
* }
*/
nir_if *if_sum_up_counts = nir_push_if(&b, nir_ieq(&b, mode, zero)); {
nir_def *x_value = nir_load_ssbo(&b, 1, 32, zero, s_offset);
x_value = nir_iadd(&b, nir_load_var(&b, acc_result), x_value);
nir_store_var(&b, acc_result, x_value, 0x1);
}
nir_pop_if(&b, if_sum_up_counts);
/* if (is_overflow) {
* uint32_t count = (config & 1) ? 4 : 1;
* ...
*/
nir_if *if_overflow = nir_push_if(&b, is_overflow); {
nir_def *is_result_available = nir_i2b(&b, nir_iand(&b, mode, one));
nir_def *initial_count = nir_bcsel(&b, is_result_available, four, one);
nir_variable *count =
nir_local_variable_create(e, glsl_uint_type(), "count");
nir_store_var(&b, count, initial_count, 0x1);
nir_variable *stream_offset =
nir_local_variable_create(e, glsl_uint_type(), "stream_offset");
nir_store_var(&b, stream_offset, s_offset, 0x1);
/* Inner loop begin.
* do {
* ...
*/
nir_loop *loop_inner = nir_push_loop(&b); {
/* uint32_t buffer[4] = load_ssbo(0, stream_offset + 2 * sizeof(uint64_t)); */
nir_def *stream_offset_value = nir_load_var(&b, stream_offset);
nir_def *buffer =
nir_load_ssbo(&b, 4, 32, zero,
nir_iadd_imm(&b, stream_offset_value, 2 * sizeof(uint64_t)));
/* if (generated != emitted) {
* acc_result = 1;
* base_offset = 0;
* break;
* }
*/
nir_def *generated = nir_channel(&b, buffer, 0);
nir_def *emitted = nir_channel(&b, buffer, 2);
nir_if *if_not_equal = nir_push_if(&b, nir_ine(&b, generated, emitted)); {
nir_store_var(&b, acc_result, one, 0x1);
nir_store_var(&b, base_offset, zero, 0x1);
nir_jump(&b, nir_jump_break);
}
nir_pop_if(&b, if_not_equal);
/* stream_offset += sizeof(gfx11_sh_query_buffer_mem.stream[0]); */
uint64_t buffer_mem_stream0_size =
sizeof(((struct gfx11_sh_query_buffer_mem*)0)->stream[0]);
stream_offset_value = nir_iadd_imm(&b, stream_offset_value, buffer_mem_stream0_size);
nir_store_var(&b, stream_offset, stream_offset_value, 0x1);
/* } while(count--); */
nir_def *loop_count = nir_load_var(&b, count);
loop_count = nir_iadd(&b, loop_count, minus_one);
nir_store_var(&b, count, loop_count, 0x1);
nir_if *if_zero = nir_push_if(&b, nir_ieq(&b, loop_count, zero)); {
nir_jump(&b, nir_jump_break);
}
nir_pop_if(&b, if_zero);
}
nir_pop_loop(&b, loop_inner); /* Inner loop end */
}
nir_pop_if(&b, if_overflow);
/* base_offset += sizeof(gfx11_sh_query_buffer_mem); */
nir_def *buffer_mem_size = nir_imm_int(&b, sizeof(struct gfx11_sh_query_buffer_mem));
nir_store_var(&b, base_offset, nir_iadd(&b, nir_load_var(&b, base_offset), buffer_mem_size), 0x1);
}
nir_pop_loop(&b, loop_outer); /* Outer loop end */
nir_def *acc_result_value = nir_load_var(&b, acc_result);
nir_def *y_value = nir_load_var(&b, acc_missing);
/* if ((chain & 2)) {
* store_ssbo(<acc_result, acc_missing>, 2, 0);
* ...
*/
nir_def *is_write_summary_buffer = nir_i2b(&b, nir_iand(&b, nir_channel(&b, buff_0, 2), two));
nir_if *if_write_summary_buffer = nir_push_if(&b, is_write_summary_buffer); {
nir_store_ssbo(&b, nir_vec2(&b, acc_result_value, y_value), two, zero);
} nir_push_else(&b, if_write_summary_buffer); {
/* } else {
* if ((config & 7) == 1) {
* acc_result = acc_missing ? 0 : 1;
* acc_missing = 0;
* }
* ...
*/
nir_def *is_result_available = nir_ieq(&b, mode, one);
nir_def *is_zero = nir_ieq(&b, y_value, zero);
acc_result_value = nir_bcsel(&b, is_result_available, nir_b2i32(&b, is_zero), acc_result_value);
nir_def *ny = nir_bcsel(&b, is_result_available, zero, y_value);
/* if (!acc_missing) {
* store_ssbo(acc_result, 2, 0);
* if (config & 8)) {
* store_ssbo(0, 2, 4)
* }
* }
*/
nir_if *if_zero = nir_push_if(&b, nir_ieq(&b, ny, zero)); {
nir_store_ssbo(&b, acc_result_value, two, zero);
nir_def *is_so_any_overflow = nir_i2b(&b, nir_iand_imm(&b, nir_channel(&b, buff_0, 0), 8));
nir_if *if_so_any_overflow = nir_push_if(&b, is_so_any_overflow); {
nir_store_ssbo(&b, zero, two, four);
}
nir_pop_if(&b, if_so_any_overflow);
}
nir_pop_if(&b, if_zero);
}
nir_pop_if(&b, if_write_summary_buffer);
return create_shader_state(sctx, b.shader);
}

View file

@ -1,228 +0,0 @@
/*
* Copyright 2018 Advanced Micro Devices, Inc.
*
* SPDX-License-Identifier: MIT
*/
#include "si_pipe.h"
#include "tgsi/tgsi_text.h"
#include "tgsi/tgsi_ureg.h"
/* Create the compute shader that is used to collect the results of gfx10+
* shader queries.
*
* One compute grid with a single thread is launched for every query result
* buffer. The thread (optionally) reads a previous summary buffer, then
* accumulates data from the query result buffer, and writes the result either
* to a summary buffer to be consumed by the next grid invocation or to the
* user-supplied buffer.
*
* Data layout:
*
* BUFFER[0] = query result buffer (layout is defined by gfx10_sh_query_buffer_mem)
* BUFFER[1] = previous summary buffer
* BUFFER[2] = next summary buffer or user-supplied buffer
*
* CONST
* 0.x = config; the low 3 bits indicate the mode:
* 0: sum up counts
* 1: determine result availability and write it as a boolean
* 2: SO_OVERFLOW
* 3: SO_ANY_OVERFLOW
* the remaining bits form a bitfield:
* 8: write result as a 64-bit value
* 0.y = offset in bytes to counts or stream for SO_OVERFLOW mode
* 0.z = chain bit field:
* 1: have previous summary buffer
* 2: write next summary buffer
* 0.w = result_count
*/
void *gfx11_create_sh_query_result_cs(struct si_context *sctx)
{
/* TEMP[0].x = accumulated result so far
* TEMP[0].y = result missing
* TEMP[0].z = whether we're in overflow mode
*/
static const char text_tmpl[] =
"COMP\n"
"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
"DCL BUFFER[0]\n"
"DCL BUFFER[1]\n"
"DCL BUFFER[2]\n"
"DCL CONST[0][0..0]\n"
"DCL TEMP[0..5]\n"
"IMM[0] UINT32 {0, 7, 256, 4294967295}\n"
"IMM[1] UINT32 {1, 2, 4, 8}\n"
"IMM[2] UINT32 {16, 32, 64, 128}\n"
/* acc_result = 0;
* acc_missing = 0;
*/
"MOV TEMP[0].xy, IMM[0].xxxx\n"
/* if (chain & 1) {
* acc_result = buffer[1][0];
* acc_missing = buffer[1][1];
* }
*/
"AND TEMP[5], CONST[0][0].zzzz, IMM[1].xxxx\n"
"UIF TEMP[5]\n"
"LOAD TEMP[0].xy, BUFFER[1], IMM[0].xxxx\n"
"ENDIF\n"
/* is_overflow (TEMP[0].z) = (config & 7) >= 2; */
"AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
"USGE TEMP[0].z, TEMP[5].xxxx, IMM[1].yyyy\n"
/* result_remaining (TEMP[1].x) = (is_overflow && acc_result) ? 0 : result_count; */
"AND TEMP[5].x, TEMP[0].zzzz, TEMP[0].xxxx\n"
"UCMP TEMP[1].x, TEMP[5].xxxx, IMM[0].xxxx, CONST[0][0].wwww\n"
/* base_offset (TEMP[1].y) = 0; */
"MOV TEMP[1].y, IMM[0].xxxx\n"
/* for (;;) {
* if (!result_remaining) {
* break;
* }
* result_remaining--;
*/
"BGNLOOP\n"
" USEQ TEMP[5], TEMP[1].xxxx, IMM[0].xxxx\n"
" UIF TEMP[5]\n"
" BRK\n"
" ENDIF\n"
" UADD TEMP[1].x, TEMP[1].xxxx, IMM[0].wwww\n"
/* fence = buffer[0]@(base_offset + sizeof(gfx10_sh_query_buffer_mem.stream)); */
" UADD TEMP[5].x, TEMP[1].yyyy, IMM[2].wwww\n"
" LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
/* if (!fence) {
* acc_missing = ~0u;
* break;
* }
*/
" USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
" UIF TEMP[5]\n"
" MOV TEMP[0].y, TEMP[5].xxxx\n"
" BRK\n"
" ENDIF\n"
/* stream_offset (TEMP[2].x) = base_offset + offset; */
" UADD TEMP[2].x, TEMP[1].yyyy, CONST[0][0].yyyy\n"
/* if (!(config & 7)) {
* acc_result += buffer[0]@stream_offset;
* }
*/
" AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
" USEQ TEMP[5], TEMP[5].xxxx, IMM[0].xxxx\n"
" UIF TEMP[5]\n"
" LOAD TEMP[5].x, BUFFER[0], TEMP[2].xxxx\n"
" UADD TEMP[0].x, TEMP[0].xxxx, TEMP[5].xxxx\n"
" ENDIF\n"
/* if ((config & 7) >= 2) {
* count (TEMP[2].y) = (config & 1) ? 4 : 1;
*/
" AND TEMP[5].x, CONST[0][0].xxxx, IMM[0].yyyy\n"
" USGE TEMP[5], TEMP[5].xxxx, IMM[1].yyyy\n"
" UIF TEMP[5]\n"
" AND TEMP[5].x, CONST[0][0].xxxx, IMM[1].xxxx\n"
" UCMP TEMP[2].y, TEMP[5].xxxx, IMM[1].zzzz, IMM[1].xxxx\n"
/* do {
* generated = buffer[0]@(stream_offset + 2 * sizeof(uint64_t));
* emitted = buffer[0]@(stream_offset + 3 * sizeof(uint64_t));
* if (generated != emitted) {
* acc_result = 1;
* result_remaining = 0;
* break;
* }
*
* stream_offset += sizeof(gfx10_sh_query_buffer_mem.stream[0]);
* } while (--count);
* }
*/
" BGNLOOP\n"
" UADD TEMP[5].x, TEMP[2].xxxx, IMM[2].xxxx\n"
" LOAD TEMP[4].xyzw, BUFFER[0], TEMP[5].xxxx\n"
" USNE TEMP[5], TEMP[4].xyxy, TEMP[4].zwzw\n"
" UIF TEMP[5]\n"
" MOV TEMP[0].x, IMM[1].xxxx\n"
" MOV TEMP[1].y, IMM[0].xxxx\n"
" BRK\n"
" ENDIF\n"
" UADD TEMP[2].y, TEMP[2].yyyy, IMM[0].wwww\n"
" USEQ TEMP[5], TEMP[2].yyyy, IMM[0].xxxx\n"
" UIF TEMP[5]\n"
" BRK\n"
" ENDIF\n"
" UADD TEMP[2].x, TEMP[2].xxxx, IMM[2].yyyy\n"
" ENDLOOP\n"
" ENDIF\n"
/* base_offset += sizeof(gfx10_sh_query_buffer_mem);
* } // end outer loop
*/
" UADD TEMP[1].y, TEMP[1].yyyy, IMM[0].zzzz\n"
"ENDLOOP\n"
/* if (chain & 2) {
* buffer[2][0] = acc_result;
* buffer[2][1] = acc_missing;
* } else {
*/
"AND TEMP[5], CONST[0][0].zzzz, IMM[1].yyyy\n"
"UIF TEMP[5]\n"
" STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0]\n"
"ELSE\n"
/* if ((config & 7) == 1) {
* acc_result = acc_missing ? 0 : 1;
* acc_missing = 0;
* }
*/
" AND TEMP[5], CONST[0][0].xxxx, IMM[0].yyyy\n"
" USEQ TEMP[5], TEMP[5].xxxx, IMM[1].xxxx\n"
" UIF TEMP[5]\n"
" UCMP TEMP[0].x, TEMP[0].yyyy, IMM[0].xxxx, IMM[1].xxxx\n"
" MOV TEMP[0].y, IMM[0].xxxx\n"
" ENDIF\n"
/* if (!acc_missing) {
* buffer[2][0] = acc_result;
* if (config & 8) {
* buffer[2][1] = 0;
* }
* }
* }
*/
" USEQ TEMP[5], TEMP[0].yyyy, IMM[0].xxxx\n"
" UIF TEMP[5]\n"
" STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
" AND TEMP[5], CONST[0][0].xxxx, IMM[1].wwww\n"
" UIF TEMP[5]\n"
" STORE BUFFER[2].x, IMM[1].zzzz, TEMP[0].yyyy\n"
" ENDIF\n"
" ENDIF\n"
"ENDIF\n"
"END\n";
struct tgsi_token tokens[1024];
struct pipe_compute_state state = {};
if (!tgsi_text_translate(text_tmpl, tokens, ARRAY_SIZE(tokens))) {
assert(false);
return NULL;
}
state.ir_type = PIPE_SHADER_IR_TGSI;
state.prog = tokens;
return sctx->b.create_compute_state(&sctx->b, &state);
}