From d4fdeaa820a15a87cad79aa7ef7fed3bc1f1912e Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Thu, 8 Sep 2022 18:06:56 +0800 Subject: [PATCH] radeonsi: replace llvm resource code with nir lower MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port from ac_nir_to_llvm.c and si_shader_llvm_resource.c. Due to need waterfall of llvm backend, we can't get bind-texture descriptor directly in nir. So we keep load_sampler_desc abi only for bind-texture index to desc. Reviewed-by: Marek Olšák Signed-off-by: Qiang Yu Part-of: --- src/amd/llvm/ac_nir_to_llvm.c | 56 ++- src/gallium/drivers/radeonsi/meson.build | 1 - .../drivers/radeonsi/si_nir_lower_resource.c | 405 ++++++++++++++++++ .../drivers/radeonsi/si_shader_internal.h | 3 - src/gallium/drivers/radeonsi/si_shader_llvm.c | 61 ++- .../radeonsi/si_shader_llvm_resources.c | 267 ------------ src/gallium/drivers/radeonsi/si_shader_nir.c | 6 +- 7 files changed, 495 insertions(+), 304 deletions(-) delete mode 100644 src/gallium/drivers/radeonsi/si_shader_llvm_resources.c diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index c807a4bbe4e..81d2f449e21 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -2485,7 +2485,6 @@ static void get_image_coords(struct ac_nir_context *ctx, const nir_intrinsic_ins LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false), }; - LLVMValueRef sample_index = NULL; int count; ASSERTED bool add_frag_pos = @@ -2495,25 +2494,6 @@ static void get_image_coords(struct ac_nir_context *ctx, const nir_intrinsic_ins assert(!add_frag_pos && "Input attachments should be lowered by this point."); count = image_type_to_components_count(dim, is_array); - if (ctx->ac.gfx_level < GFX11 && - is_ms && (instr->intrinsic == nir_intrinsic_image_deref_load || - instr->intrinsic == nir_intrinsic_bindless_image_load || - instr->intrinsic == nir_intrinsic_image_deref_sparse_load || - instr->intrinsic == nir_intrinsic_bindless_image_sparse_load)) { - LLVMValueRef fmask_load_address[3]; - - fmask_load_address[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); - fmask_load_address[1] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[1], ""); - if (is_array) - fmask_load_address[2] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[2], ""); - else - fmask_load_address[2] = NULL; - - sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); - sample_index = adjust_sample_index_using_fmask( - &ctx->ac, fmask_load_address[0], fmask_load_address[1], fmask_load_address[2], - sample_index, get_image_descriptor(ctx, instr, dynamic_desc_index, AC_DESC_FMASK, false)); - } if (count == 1 && !gfx9_1d) { if (instr->src[1].ssa->num_components) args->coords[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); @@ -2577,9 +2557,8 @@ static void get_image_coords(struct ac_nir_context *ctx, const nir_intrinsic_ins } if (is_ms) { - if (!sample_index) - sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); - args->coords[count] = sample_index; + /* sample index */ + args->coords[count] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); count++; } } @@ -2647,7 +2626,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intri res = ac_trim_vector(&ctx->ac, res, instr->dest.ssa.num_components); res = ac_to_integer(&ctx->ac, res); - } else if (instr->intrinsic == nir_intrinsic_image_deref_samples_identical) { + } else if (instr->intrinsic == 
nir_intrinsic_bindless_image_fragment_mask_load_amd) { assert(ctx->ac.gfx_level < GFX11); args.opcode = ac_image_load; @@ -2659,8 +2638,6 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intri args.a16 = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.coords[0])) == 16; res = ac_build_image_opcode(&ctx->ac, &args); - res = LLVMBuildExtractElement(ctx->ac.builder, res, ctx->ac.i32_0, ""); - res = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, res, ctx->ac.i32_0, ""); } else { bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0; @@ -3823,6 +3800,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins break; case nir_intrinsic_bindless_image_load: case nir_intrinsic_bindless_image_sparse_load: + case nir_intrinsic_bindless_image_fragment_mask_load_amd: result = visit_image_load(ctx, instr, true); break; case nir_intrinsic_image_deref_load: @@ -4611,6 +4589,8 @@ static void tex_fetch_ptrs(struct ac_nir_context *ctx, nir_tex_instr *instr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr, bool divergent) { + bool texture_handle_divergent = false; + bool sampler_handle_divergent = false; LLVMValueRef texture_dynamic_handle = NULL; LLVMValueRef sampler_dynamic_handle = NULL; nir_deref_instr *texture_deref_instr = NULL; @@ -4637,10 +4617,14 @@ static void tex_fetch_ptrs(struct ac_nir_context *ctx, nir_tex_instr *instr, else *samp_ptr = val; } else { - if (instr->src[i].src_type == nir_tex_src_texture_handle) + bool divergent = instr->src[i].src.ssa->divergent; + if (instr->src[i].src_type == nir_tex_src_texture_handle) { texture_dynamic_handle = val; - else + texture_handle_divergent = divergent; + } else { sampler_dynamic_handle = val; + sampler_handle_divergent = divergent; + } } break; } @@ -4671,11 +4655,23 @@ static void tex_fetch_ptrs(struct ac_nir_context *ctx, nir_tex_instr *instr, } if (texture_dynamic_handle || sampler_dynamic_handle) { + /* instr->sampler_non_uniform and texture_non_uniform are always false in GLSL, + * but this can lead to unexpected behavior if texture/sampler index come from + * a vertex attribute. + * For instance, 2 consecutive draws using 2 different index values, + * could be squashed together by the hw - producing a single draw with + * non-dynamically uniform index. + * To avoid this, detect divergent indexing, and use enter_waterfall. + * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/2253. 
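For reference, the divergence check added to tex_fetch_ptrs() boils down to a small predicate. A minimal C sketch, under the assumption that the three inputs come from instr->{texture,sampler}_non_uniform, NIR divergence analysis on the handle source, and abi->use_waterfall_for_divergent_tex_samplers respectively; the helper name is hypothetical, not a Mesa API:

#include <stdbool.h>

/* Sketch: when must a dynamic texture/sampler handle enter a waterfall loop? */
static bool needs_waterfall(bool non_uniform, bool handle_divergent,
                            bool abi_waterfall_for_divergent)
{
   /* Either the instruction is explicitly marked non-uniform, or the ABI
    * requested waterfalls whenever divergence analysis flags the handle. */
   return non_uniform || (abi_waterfall_for_divergent && handle_divergent);
}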
+ */ + /* descriptor handles given through nir_tex_src_{texture,sampler}_handle */ - if (instr->texture_non_uniform) + if (instr->texture_non_uniform || + (ctx->abi->use_waterfall_for_divergent_tex_samplers && texture_handle_divergent)) texture_dynamic_handle = enter_waterfall(ctx, &wctx[0], texture_dynamic_handle, divergent); - if (instr->sampler_non_uniform) + if (instr->sampler_non_uniform || + (ctx->abi->use_waterfall_for_divergent_tex_samplers && sampler_handle_divergent)) sampler_dynamic_handle = enter_waterfall(ctx, &wctx[1], sampler_dynamic_handle, divergent); if (texture_dynamic_handle) diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index 40e9f254223..2d37d56a7c0 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -56,7 +56,6 @@ files_libradeonsi = files( 'si_shader_llvm.c', 'si_shader_llvm_gs.c', 'si_shader_llvm_ps.c', - 'si_shader_llvm_resources.c', 'si_shader_llvm_tess.c', 'si_shader_llvm_vs.c', 'si_shader_nir.c', diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_resource.c b/src/gallium/drivers/radeonsi/si_nir_lower_resource.c index b6104251121..c6636c8b7f6 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_resource.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_resource.c @@ -115,6 +115,168 @@ static nir_ssa_def *load_ssbo_desc(nir_builder *b, nir_src *index, return nir_load_smem_amd(b, 4, addr, offset); } +static nir_ssa_def *fixup_image_desc(nir_builder *b, nir_ssa_def *rsrc, bool uses_store, + struct lower_resource_state *s) +{ + struct si_shader_selector *sel = s->shader->selector; + struct si_screen *screen = sel->screen; + + /** + * Given a 256-bit resource descriptor, force the DCC enable bit to off. + * + * At least on Tonga, executing image stores on images with DCC enabled and + * non-trivial can eventually lead to lockups. This can occur when an + * application binds an image as read-only but then uses a shader that writes + * to it. The OpenGL spec allows almost arbitrarily bad behavior (including + * program termination) in this case, but it doesn't cost much to be a bit + * nicer: disabling DCC in the shader still leads to undefined results but + * avoids the lockup. + */ + if (uses_store && + screen->info.gfx_level <= GFX9 && + screen->info.gfx_level >= GFX8) { + nir_ssa_def *tmp = nir_channel(b, rsrc, 6); + tmp = nir_iand_imm(b, tmp, C_008F28_COMPRESSION_EN); + rsrc = nir_vector_insert_imm(b, rsrc, tmp, 6); + } + + if (!uses_store && + screen->info.has_image_load_dcc_bug && + screen->always_allow_dcc_stores) { + nir_ssa_def *tmp = nir_channel(b, rsrc, 6); + tmp = nir_iand_imm(b, tmp, C_00A018_WRITE_COMPRESS_ENABLE); + rsrc = nir_vector_insert_imm(b, rsrc, tmp, 6); + } + + return rsrc; +} + +/* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should + * adjust "index" to point to FMASK. 
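For reference, fixup_image_desc() above only rewrites dword 6 of the 8-dword descriptor. A CPU-side sketch of the same masking, with a hypothetical clear-mask value standing in for the real C_008F28_COMPRESSION_EN / C_00A018_WRITE_COMPRESS_ENABLE masks from sid.h:

#include <stdint.h>

/* Hypothetical clear-mask: every bit set except the compression-enable bit. */
#define DCC_CLEAR_MASK 0xDFFFFFFFu

/* Sketch: force the DCC/compression bit in dword 6 of a 256-bit image
 * descriptor to zero, mirroring what the NIR pass does on the SSA vector. */
static void force_dcc_off_cpu(uint32_t desc[8])
{
   desc[6] &= DCC_CLEAR_MASK;
}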
+ */ +static nir_ssa_def *load_image_desc(nir_builder *b, nir_ssa_def *list, nir_ssa_def *index, + enum ac_descriptor_type desc_type, bool uses_store, + struct lower_resource_state *s) +{ + /* index is in uvec8 unit, convert to offset in bytes */ + nir_ssa_def *offset = nir_ishl_imm(b, index, 5); + + unsigned num_channels; + if (desc_type == AC_DESC_BUFFER) { + offset = nir_iadd_imm(b, offset, 16); + num_channels = 4; + } else { + assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK); + num_channels = 8; + } + + nir_ssa_def *rsrc = nir_load_smem_amd(b, num_channels, list, offset); + + if (desc_type == AC_DESC_IMAGE) + rsrc = fixup_image_desc(b, rsrc, uses_store, s); + + return rsrc; +} + +static nir_ssa_def *deref_to_index(nir_builder *b, + nir_deref_instr *deref, + unsigned max_slots, + nir_ssa_def **dynamic_index_ret, + unsigned *const_index_ret) +{ + unsigned const_index = 0; + nir_ssa_def *dynamic_index = NULL; + while (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + unsigned array_size = MAX2(glsl_get_aoa_size(deref->type), 1); + + if (nir_src_is_const(deref->arr.index)) { + const_index += array_size * nir_src_as_uint(deref->arr.index); + } else { + nir_ssa_def *tmp = nir_imul_imm(b, deref->arr.index.ssa, array_size); + dynamic_index = dynamic_index ? nir_iadd(b, dynamic_index, tmp) : tmp; + } + + deref = nir_deref_instr_parent(deref); + } + + unsigned base_index = deref->var->data.binding; + const_index += base_index; + + /* Redirect invalid resource indices to the first array element. */ + if (const_index >= max_slots) + const_index = base_index; + + nir_ssa_def *index = nir_imm_int(b, const_index); + if (dynamic_index) { + index = nir_iadd(b, dynamic_index, index); + + /* From the GL_ARB_shader_image_load_store extension spec: + * + * If a shader performs an image load, store, or atomic + * operation using an image variable declared as an array, + * and if the index used to select an individual element is + * negative or greater than or equal to the size of the + * array, the results of the operation are undefined but may + * not lead to termination. + */ + index = clamp_index(b, index, max_slots); + } + + if (dynamic_index_ret) + *dynamic_index_ret = dynamic_index; + if (const_index_ret) + *const_index_ret = const_index; + + return index; +} + +static nir_ssa_def *load_deref_image_desc(nir_builder *b, nir_deref_instr *deref, + enum ac_descriptor_type desc_type, bool is_load, + struct lower_resource_state *s) +{ + unsigned const_index; + nir_ssa_def *dynamic_index; + nir_ssa_def *index = deref_to_index(b, deref, s->shader->selector->info.base.num_images, + &dynamic_index, &const_index); + + nir_ssa_def *desc; + if (!dynamic_index && desc_type != AC_DESC_FMASK && + const_index < s->shader->selector->cs_num_images_in_user_sgprs) { + /* Fast path if the image is in user SGPRs. */ + desc = ac_nir_load_arg(b, &s->args->ac, s->args->cs_image[const_index]); + + if (desc_type == AC_DESC_IMAGE) + desc = fixup_image_desc(b, desc, !is_load, s); + } else { + /* FMASKs are separate from images. 
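For reference, the offset arithmetic in load_image_desc() can be written out in plain C; the helper below is a sketch, not driver code:

#include <stdbool.h>
#include <stdint.h>

/* Sketch: the image list is an array of 8-dword (32-byte) descriptors,
 * so an index in uvec8 units becomes a byte offset via "index << 5".
 * Buffer descriptors occupy dwords [4:7] of the slot, hence +16 bytes
 * and only 4 channels loaded instead of 8. */
static uint32_t image_desc_byte_offset(uint32_t index, bool is_buffer)
{
   uint32_t offset = index << 5;  /* index * 32 bytes */
   if (is_buffer)
      offset += 16;               /* skip dwords [0:3] */
   return offset;
}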
*/ + if (desc_type == AC_DESC_FMASK) + index = nir_iadd_imm(b, index, SI_NUM_IMAGES); + + index = nir_isub(b, nir_imm_int(b, SI_NUM_IMAGE_SLOTS - 1), index); + + nir_ssa_def *list = ac_nir_load_arg(b, &s->args->ac, s->args->samplers_and_images); + desc = load_image_desc(b, list, index, desc_type, !is_load, s); + } + + return desc; +} + +static nir_ssa_def *load_bindless_image_desc(nir_builder *b, nir_ssa_def *index, + enum ac_descriptor_type desc_type, bool is_load, + struct lower_resource_state *s) +{ + /* Bindless image descriptors use 16-dword slots. */ + index = nir_ishl_imm(b, index, 1); + + /* FMASK is right after the image. */ + if (desc_type == AC_DESC_FMASK) + index = nir_iadd_imm(b, index, 1); + + nir_ssa_def *list = ac_nir_load_arg(b, &s->args->ac, s->args->bindless_samplers_and_images); + return load_image_desc(b, list, index, desc_type, !is_load, s); +} + static bool lower_resource_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, struct lower_resource_state *s) { @@ -161,6 +323,103 @@ static bool lower_resource_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin nir_instr_remove(&intrin->instr); break; } + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_sparse_load: + case nir_intrinsic_image_deref_fragment_mask_load_amd: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_fmin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_fmax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_fadd: + case nir_intrinsic_image_deref_atomic_inc_wrap: + case nir_intrinsic_image_deref_atomic_dec_wrap: + case nir_intrinsic_image_deref_descriptor_amd: { + assert(!(nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)); + + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + + enum ac_descriptor_type desc_type; + if (intrin->intrinsic == nir_intrinsic_image_deref_fragment_mask_load_amd) { + desc_type = AC_DESC_FMASK; + } else { + enum glsl_sampler_dim dim = glsl_get_sampler_dim(deref->type); + desc_type = dim == GLSL_SAMPLER_DIM_BUF ? 
AC_DESC_BUFFER : AC_DESC_IMAGE; + } + + bool is_load = + intrin->intrinsic == nir_intrinsic_image_deref_load || + intrin->intrinsic == nir_intrinsic_image_deref_sparse_load || + intrin->intrinsic == nir_intrinsic_image_deref_fragment_mask_load_amd || + intrin->intrinsic == nir_intrinsic_image_deref_descriptor_amd; + + nir_ssa_def *desc = load_deref_image_desc(b, deref, desc_type, is_load, s); + + if (intrin->intrinsic == nir_intrinsic_image_deref_descriptor_amd) { + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc); + nir_instr_remove(&intrin->instr); + } else { + nir_intrinsic_set_image_dim(intrin, glsl_get_sampler_dim(deref->type)); + nir_intrinsic_set_image_array(intrin, glsl_sampler_type_is_array(deref->type)); + nir_rewrite_image_intrinsic(intrin, desc, true); + } + break; + } + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_sparse_load: + case nir_intrinsic_bindless_image_fragment_mask_load_amd: + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_fmin: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_bindless_image_atomic_fmax: + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_bindless_image_atomic_comp_swap: + case nir_intrinsic_bindless_image_atomic_fadd: + case nir_intrinsic_bindless_image_atomic_inc_wrap: + case nir_intrinsic_bindless_image_atomic_dec_wrap: { + assert(!(nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)); + + enum ac_descriptor_type desc_type; + if (intrin->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) { + desc_type = AC_DESC_FMASK; + } else { + enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intrin); + desc_type = dim == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE; + } + + bool is_load = + intrin->intrinsic == nir_intrinsic_bindless_image_load || + intrin->intrinsic == nir_intrinsic_bindless_image_sparse_load || + intrin->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd || + intrin->intrinsic == nir_intrinsic_bindless_image_descriptor_amd; + + nir_ssa_def *index = nir_u2u32(b, intrin->src[0].ssa); + + nir_ssa_def *desc = load_bindless_image_desc(b, index, desc_type, is_load, s); + + if (intrin->intrinsic == nir_intrinsic_bindless_image_descriptor_amd) { + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc); + nir_instr_remove(&intrin->instr); + } else { + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], nir_src_for_ssa(desc)); + } + break; + } default: return false; } @@ -168,6 +427,148 @@ static bool lower_resource_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin return true; } +static nir_ssa_def *load_sampler_desc(nir_builder *b, nir_ssa_def *list, nir_ssa_def *index, + enum ac_descriptor_type desc_type) +{ + /* index is in 16 dword unit, convert to offset in bytes */ + nir_ssa_def *offset = nir_ishl_imm(b, index, 6); + + unsigned num_channels = 0; + switch (desc_type) { + case AC_DESC_IMAGE: + /* The image is at [0:7]. */ + num_channels = 8; + break; + case AC_DESC_BUFFER: + /* The buffer is in [4:7]. */ + offset = nir_iadd_imm(b, offset, 16); + num_channels = 4; + break; + case AC_DESC_FMASK: + /* The FMASK is at [8:15]. 
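For reference, the combined texture slot addressed by load_sampler_desc() is 16 dwords (64 bytes), which is where "index << 6" comes from. A sketch of the full layout, with illustrative desc_type encodings rather than the real ac_descriptor_type values:

#include <stdint.h>

struct desc_loc {
   uint32_t byte_offset;
   unsigned num_channels;
};

/* Sketch: sub-descriptor placement inside one 64-byte combined slot.
 *   image   dwords [0:7]   -> +0  bytes, 8 channels
 *   buffer  dwords [4:7]   -> +16 bytes, 4 channels
 *   fmask   dwords [8:15]  -> +32 bytes, 8 channels
 *   sampler dwords [12:15] -> +48 bytes, 4 channels */
static struct desc_loc sampler_desc_loc(uint32_t index, unsigned desc_type)
{
   uint32_t base = index << 6;  /* index * 64 bytes */
   switch (desc_type) {
   case 0:  return (struct desc_loc){base,      8}; /* image */
   case 1:  return (struct desc_loc){base + 16, 4}; /* buffer */
   case 2:  return (struct desc_loc){base + 32, 8}; /* fmask */
   default: return (struct desc_loc){base + 48, 4}; /* sampler */
   }
}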
*/ + offset = nir_iadd_imm(b, offset, 32); + num_channels = 8; + break; + case AC_DESC_SAMPLER: + /* The sampler state is at [12:15]. */ + offset = nir_iadd_imm(b, offset, 48); + num_channels = 4; + break; + default: + unreachable("invalid desc type"); + break; + } + + return nir_load_smem_amd(b, num_channels, list, offset); +} + +static nir_ssa_def *load_deref_sampler_desc(nir_builder *b, nir_deref_instr *deref, + enum ac_descriptor_type desc_type, + struct lower_resource_state *s, + bool return_descriptor) +{ + unsigned max_slots = BITSET_LAST_BIT(b->shader->info.textures_used); + nir_ssa_def *index = deref_to_index(b, deref, max_slots, NULL, NULL); + index = nir_iadd_imm(b, index, SI_NUM_IMAGE_SLOTS / 2); + + /* return actual desc when required by caller */ + if (return_descriptor) { + nir_ssa_def *list = ac_nir_load_arg(b, &s->args->ac, s->args->samplers_and_images); + return load_sampler_desc(b, list, index, desc_type); + } + + /* Just use index here and let nir-to-llvm backend to translate to actual + * descriptor. This is because we need waterfall to handle non-dynamic-uniform + * index there. + */ + return index; +} + +static nir_ssa_def *load_bindless_sampler_desc(nir_builder *b, nir_ssa_def *index, + enum ac_descriptor_type desc_type, + struct lower_resource_state *s) +{ + nir_ssa_def *list = ac_nir_load_arg(b, &s->args->ac, s->args->bindless_samplers_and_images); + + /* 64 bit to 32 bit */ + index = nir_u2u32(b, index); + + return load_sampler_desc(b, list, index, desc_type); +} + +static bool lower_resource_tex(nir_builder *b, nir_tex_instr *tex, + struct lower_resource_state *s) +{ + assert(!tex->texture_non_uniform && !tex->sampler_non_uniform); + + nir_deref_instr *texture_deref = NULL; + nir_deref_instr *sampler_deref = NULL; + nir_ssa_def *texture_handle = NULL; + nir_ssa_def *sampler_handle = NULL; + + for (unsigned i = 0; i < tex->num_srcs; i++) { + switch (tex->src[i].src_type) { + case nir_tex_src_texture_deref: + texture_deref = nir_src_as_deref(tex->src[i].src); + break; + case nir_tex_src_sampler_deref: + sampler_deref = nir_src_as_deref(tex->src[i].src); + break; + case nir_tex_src_texture_handle: + texture_handle = tex->src[i].src.ssa; + break; + case nir_tex_src_sampler_handle: + sampler_handle = tex->src[i].src.ssa; + break; + default: + break; + } + } + + enum ac_descriptor_type desc_type; + if (tex->op == nir_texop_fragment_mask_fetch_amd) + desc_type = AC_DESC_FMASK; + else + desc_type = tex->sampler_dim == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE; + + bool is_descriptor_op = tex->op == nir_texop_descriptor_amd; + nir_ssa_def *image = texture_deref ? 
+ load_deref_sampler_desc(b, texture_deref, desc_type, s, is_descriptor_op) : + load_bindless_sampler_desc(b, texture_handle, desc_type, s); + + nir_ssa_def *sampler = NULL; + if (sampler_deref) + sampler = load_deref_sampler_desc(b, sampler_deref, AC_DESC_SAMPLER, s, false); + else if (sampler_handle) + sampler = load_bindless_sampler_desc(b, sampler_handle, AC_DESC_SAMPLER, s); + + if (is_descriptor_op) { + nir_ssa_def_rewrite_uses(&tex->dest.ssa, image); + nir_instr_remove(&tex->instr); + } else { + for (unsigned i = 0; i < tex->num_srcs; i++) { + switch (tex->src[i].src_type) { + case nir_tex_src_texture_deref: + tex->src[i].src_type = nir_tex_src_texture_handle; + FALLTHROUGH; + case nir_tex_src_texture_handle: + nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[i].src, image); + break; + case nir_tex_src_sampler_deref: + tex->src[i].src_type = nir_tex_src_sampler_handle; + FALLTHROUGH; + case nir_tex_src_sampler_handle: + nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[i].src, sampler); + break; + default: + break; + } + } + } + + return true; +} + static bool lower_resource_instr(nir_builder *b, nir_instr *instr, void *state) { struct lower_resource_state *s = (struct lower_resource_state *)state; @@ -179,6 +580,10 @@ static bool lower_resource_instr(nir_builder *b, nir_instr *instr, void *state) nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); return lower_resource_intrinsic(b, intrin, s); } + case nir_instr_type_tex: { + nir_tex_instr *tex = nir_instr_as_tex(instr); + return lower_resource_tex(b, tex, s); + } default: return false; } diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index be5c1fd377e..d1265594e25 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -257,9 +257,6 @@ void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader void si_llvm_ps_build_end(struct si_shader_context *ctx); void si_llvm_init_ps_callbacks(struct si_shader_context *ctx); -/* si_shader_llvm_resources.c */ -void si_llvm_init_resource_callbacks(struct si_shader_context *ctx); - /* si_shader_llvm_vs.c */ void si_llvm_clipvertex_to_clipdist(struct si_shader_context *ctx, struct ac_export_args clipdist[2], LLVMValueRef clipvertex[4]); diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 77e1df29ca0..b33558cbb6f 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -724,6 +724,64 @@ static LLVMValueRef si_llvm_load_intrinsic(struct ac_shader_abi *abi, nir_intrin } } +static LLVMValueRef si_llvm_load_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, + unsigned base_index, unsigned constant_index, + LLVMValueRef dynamic_index, + enum ac_descriptor_type desc_type, bool image, + bool write, bool bindless) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMBuilderRef builder = ctx->ac.builder; + + /* always 0 for OpenGL */ + assert(!descriptor_set); + + /* all image and texture has been lowered to bindless one in nir */ + assert(bindless); + + if (dynamic_index && LLVMTypeOf(dynamic_index) == ctx->ac.i32) { + /* image desc has been lowered in nir, we only expect texture here */ + assert(!image); + + bool is_vec4 = false; + LLVMValueRef index = dynamic_index; + + switch (desc_type) { + case AC_DESC_IMAGE: + /* The image is at [0:7]. 
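For reference, the LLVM callback below expresses the same 64-byte slot layout in vector-element units rather than bytes. A plain-C, assert-based sketch verifying that the two addressing schemes agree:

#include <assert.h>
#include <stdint.h>

/* Sketch: the NIR pass computes byte offsets (index << 6, plus 0/16/32/48)
 * while the LLVM callback computes element indices into v8i32/v4i32 lists.
 * One v8i32 element is 32 bytes and one v4i32 element is 16 bytes. */
static void check_llvm_vs_nir_offsets(uint32_t index)
{
   assert((index * 2u)      * 32u == (index << 6) +  0u); /* image   */
   assert((index * 4u + 1u) * 16u == (index << 6) + 16u); /* buffer  */
   assert((index * 2u + 1u) * 32u == (index << 6) + 32u); /* fmask   */
   assert((index * 4u + 3u) * 16u == (index << 6) + 48u); /* sampler */
}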
*/ + index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + break; + case AC_DESC_BUFFER: + /* The buffer is in [4:7]. */ + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1); + is_vec4 = true; + break; + case AC_DESC_FMASK: + /* The FMASK is at [8:15]. */ + assert(ctx->screen->info.gfx_level < GFX11); + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1); + break; + case AC_DESC_SAMPLER: + /* The sampler state is at [12:15]. */ + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), + LLVMConstInt(ctx->ac.i32, 3, 0)); + is_vec4 = true; + break; + default: + unreachable("invalid desc"); + } + + struct ac_llvm_pointer list = { + .value = ac_get_arg(&ctx->ac, ctx->args->samplers_and_images), + .pointee_type = is_vec4 ? ctx->ac.v4i32 : ctx->ac.v8i32, + }; + + return ac_build_load_to_sgpr(&ctx->ac, list, index); + } + + return dynamic_index; +} + bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader, struct nir_shader *nir, bool free_nir) { @@ -741,8 +799,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad ctx->abi.intrinsic_load = si_llvm_load_intrinsic; ctx->abi.export_vertex = gfx10_ngg_export_vertex; + ctx->abi.load_sampler_desc = si_llvm_load_sampler_desc; - si_llvm_init_resource_callbacks(ctx); si_llvm_create_main_func(ctx); if (ctx->stage <= MESA_SHADER_GEOMETRY && @@ -967,6 +1025,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad ctx->abi.clamp_div_by_zero = ctx->screen->options.clamp_div_by_zero || info->options & SI_PROFILE_CLAMP_DIV_BY_ZERO; ctx->abi.use_waterfall_for_divergent_tex_samplers = true; + ctx->abi.disable_aniso_single_level = true; unsigned num_outputs = info->num_outputs; /* need extra output to hold primitive id added by nir ngg lower */ diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c b/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c deleted file mode 100644 index 86daf419cc8..00000000000 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c +++ /dev/null @@ -1,267 +0,0 @@ -/* - * Copyright 2020 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#include "si_pipe.h" -#include "si_shader_internal.h" -#include "sid.h" - -/** - * Return a value that is equal to the given i32 \p index if it lies in [0,num) - * or an undefined value in the same interval otherwise. - */ -static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, LLVMValueRef index, - unsigned num) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0); - LLVMValueRef cc; - - if (util_is_power_of_two_or_zero(num)) { - index = LLVMBuildAnd(builder, index, c_max, ""); - } else { - /* In theory, this MAX pattern should result in code that is - * as good as the bit-wise AND above. - * - * In practice, LLVM generates worse code (at the time of - * writing), because its value tracking is not strong enough. - */ - cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, ""); - index = LLVMBuildSelect(builder, cc, index, c_max, ""); - } - - return index; -} - -/** - * Given a 256-bit resource descriptor, force the DCC enable bit to off. - * - * At least on Tonga, executing image stores on images with DCC enabled and - * non-trivial can eventually lead to lockups. This can occur when an - * application binds an image as read-only but then uses a shader that writes - * to it. The OpenGL spec allows almost arbitrarily bad behavior (including - * program termination) in this case, but it doesn't cost much to be a bit - * nicer: disabling DCC in the shader still leads to undefined results but - * avoids the lockup. - */ -static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rsrc) -{ - if (ctx->screen->info.gfx_level <= GFX7) { - return rsrc; - } else { - LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0); - LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0); - LLVMValueRef tmp; - - tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, ""); - tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, ""); - return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, ""); - } -} - -static LLVMValueRef force_write_compress_off(struct si_shader_context *ctx, LLVMValueRef rsrc) -{ - LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0); - LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_00A018_WRITE_COMPRESS_ENABLE, 0); - LLVMValueRef tmp; - - tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, ""); - tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, ""); - return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, ""); -} - -static LLVMValueRef fixup_image_desc(struct si_shader_context *ctx, LLVMValueRef rsrc, - bool uses_store) -{ - if (uses_store && ctx->ac.gfx_level <= GFX9) - rsrc = force_dcc_off(ctx, rsrc); - - if (!uses_store && ctx->screen->info.has_image_load_dcc_bug && - ctx->screen->always_allow_dcc_stores) - rsrc = force_write_compress_off(ctx, rsrc); - - return rsrc; -} - -/* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should - * adjust "index" to point to FMASK. 
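For reference, the removed si_llvm_bound_index() chose between two clamping strategies, and the clamp_index() call in the new NIR pass serves the same purpose. A plain-C sketch (assumes num > 0):

#include <stdbool.h>
#include <stdint.h>

static bool is_pow2_or_zero(uint32_t n)
{
   return (n & (n - 1)) == 0;
}

/* Sketch: keep "index" inside [0, num).  A power-of-two size allows a
 * cheap bitwise AND; otherwise fall back to a compare+select (MAX-style)
 * clamp, which LLVM historically optimized less well. */
static uint32_t bound_index(uint32_t index, uint32_t num)
{
   uint32_t max = num - 1;
   if (is_pow2_or_zero(num))
      return index & max;            /* wraps into range */
   return index <= max ? index : max;
}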
*/ -static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, struct ac_llvm_pointer list, - LLVMValueRef index, enum ac_descriptor_type desc_type, - bool uses_store, bool bindless) -{ - LLVMValueRef rsrc; - - if (desc_type == AC_DESC_BUFFER) { - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1); - list.pointee_type = ctx->ac.v4i32; - } else { - assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK); - } - - if (bindless) - rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index); - else - rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index); - - if (desc_type == AC_DESC_IMAGE) - rsrc = fixup_image_desc(ctx, rsrc, uses_store); - - return rsrc; -} - -/** - * Load an image view, fmask view. or sampler state descriptor. - */ -static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, struct ac_llvm_pointer list, - LLVMValueRef index, enum ac_descriptor_type type) -{ - LLVMBuilderRef builder = ctx->ac.builder; - - switch (type) { - case AC_DESC_IMAGE: - /* The image is at [0:7]. */ - index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - break; - case AC_DESC_BUFFER: - /* The buffer is in [4:7]. */ - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1); - list.pointee_type = ctx->ac.v4i32; - break; - case AC_DESC_FMASK: - /* The FMASK is at [8:15]. */ - assert(ctx->screen->info.gfx_level < GFX11); - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1); - break; - case AC_DESC_SAMPLER: - /* The sampler state is at [12:15]. */ - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), - LLVMConstInt(ctx->ac.i32, 3, 0)); - list.pointee_type = ctx->ac.v4i32; - break; - case AC_DESC_PLANE_0: - case AC_DESC_PLANE_1: - case AC_DESC_PLANE_2: - /* Only used for the multiplane image support for Vulkan. Should - * never be reached in radeonsi. - */ - unreachable("Plane descriptor requested in radeonsi."); - } - - return ac_build_load_to_sgpr(&ctx->ac, list, index); -} - -static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, - unsigned base_index, unsigned constant_index, - LLVMValueRef dynamic_index, - enum ac_descriptor_type desc_type, bool image, - bool write, bool bindless) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMBuilderRef builder = ctx->ac.builder; - unsigned const_index = base_index + constant_index; - - assert(!descriptor_set); - assert(desc_type <= AC_DESC_BUFFER); - - if (bindless) { - struct ac_llvm_pointer list = ac_get_ptr_arg(&ctx->ac, &ctx->args->ac, ctx->args->bindless_samplers_and_images); - - /* dynamic_index is the bindless handle */ - if (image) { - /* Bindless image descriptors use 16-dword slots. */ - dynamic_index = - LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), ""); - /* FMASK is right after the image. */ - if (desc_type == AC_DESC_FMASK) { - dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, ctx->ac.i32_1, ""); - } - - return si_load_image_desc(ctx, list, dynamic_index, desc_type, write, true); - } - - /* Since bindless handle arithmetic can contain an unsigned integer - * wraparound and si_load_sampler_desc assumes there isn't any, - * use GEP without "inbounds" (inside ac_build_pointer_add) - * to prevent incorrect code generation and hangs. 
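For reference, the wraparound this comment warns about is ordinary 32-bit modular arithmetic; an "inbounds" GEP would let LLVM assume it cannot happen and miscompile. A standalone demonstration with an illustrative handle value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   uint32_t handle = 0xFFFFFFF0u;  /* hypothetical bindless handle */
   uint32_t slot = handle * 2u;    /* 16-dword slots: wraps to 0xFFFFFFE0 */
   printf("slot index = 0x%08x\n", slot);
   return 0;
}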
- */ - dynamic_index = - LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), ""); - list.v = ac_build_pointer_add(&ctx->ac, ctx->ac.v8i32, list.v, dynamic_index); - return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type); - } - - unsigned num_slots = image ? ctx->num_images : ctx->num_samplers; - - /* Redirect invalid resource indices to the first array element. */ - if (const_index >= num_slots) - const_index = base_index; - - struct ac_llvm_pointer list = ac_get_ptr_arg(&ctx->ac, &ctx->args->ac, ctx->args->samplers_and_images); - LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false); - - if (dynamic_index) { - index = LLVMBuildAdd(builder, index, dynamic_index, ""); - - /* From the GL_ARB_shader_image_load_store extension spec: - * - * If a shader performs an image load, store, or atomic - * operation using an image variable declared as an array, - * and if the index used to select an individual element is - * negative or greater than or equal to the size of the - * array, the results of the operation are undefined but may - * not lead to termination. - */ - index = si_llvm_bound_index(ctx, index, num_slots); - } - - if (image) { - /* Fast path if the image is in user SGPRs. */ - if (!dynamic_index && - const_index < ctx->shader->selector->cs_num_images_in_user_sgprs && - (desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER)) { - LLVMValueRef rsrc = ac_get_arg(&ctx->ac, ctx->args->cs_image[const_index]); - - if (desc_type == AC_DESC_IMAGE) - rsrc = fixup_image_desc(ctx, rsrc, write); - return rsrc; - } - - /* FMASKs are separate from images. */ - if (desc_type == AC_DESC_FMASK) { - index = - LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), ""); - } - index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0), - index, ""); - return si_load_image_desc(ctx, list, index, desc_type, write, false); - } - - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), ""); - return si_load_sampler_desc(ctx, list, index, desc_type); -} - -void si_llvm_init_resource_callbacks(struct si_shader_context *ctx) -{ - ctx->abi.load_sampler_desc = si_nir_load_sampler_desc; -} diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index c704bfd312f..44c78433d79 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -264,16 +264,18 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir) * and copy-propagated */ - static const struct nir_lower_tex_options lower_tex_options = { + const struct nir_lower_tex_options lower_tex_options = { .lower_txp = ~0u, .lower_txs_cube_array = true, .lower_invalid_implicit_lod = true, .lower_tg4_offsets = true, + .lower_to_fragment_fetch_amd = sscreen->info.gfx_level < GFX11, }; NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options); - static const struct nir_lower_image_options lower_image_options = { + const struct nir_lower_image_options lower_image_options = { .lower_cube_size = true, + .lower_to_fragment_mask_load_amd = sscreen->info.gfx_level < GFX11, }; NIR_PASS_V(nir, nir_lower_image, &lower_image_options);
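One consequence worth noting in the si_shader_nir.c hunk: lower_tex_options and lower_image_options lose their "static const" qualifier because the new FMASK-lowering flags now depend on the chip generation at runtime. A minimal sketch of that pattern, with illustrative struct and field names rather than the NIR API:

#include <stdbool.h>

struct msaa_lower_options {
   bool lower_to_fragment_fetch;
};

/* Sketch: GFX11 removed FMASK, so MSAA fetches are only lowered to the
 * fragment-mask intrinsics on older generations; the option can no longer
 * be a compile-time constant. */
static void init_msaa_lower_options(struct msaa_lower_options *opts,
                                    int gfx_level, int gfx11_level)
{
   opts->lower_to_fragment_fetch = gfx_level < gfx11_level;
}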