From d4fdeaa820a15a87cad79aa7ef7fed3bc1f1912e Mon Sep 17 00:00:00 2001 From: Qiang Yu Date: Thu, 8 Sep 2022 18:06:56 +0800 Subject: [PATCH] radeonsi: replace llvm resource code with nir lower MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Port from ac_nir_to_llvm.c and si_shader_llvm_resource.c. Due to need waterfall of llvm backend, we can't get bind-texture descriptor directly in nir. So we keep load_sampler_desc abi only for bind-texture index to desc. Reviewed-by: Marek Olšák Signed-off-by: Qiang Yu Part-of: --- src/amd/llvm/ac_nir_to_llvm.c | 56 ++- src/gallium/drivers/radeonsi/meson.build | 1 - .../drivers/radeonsi/si_nir_lower_resource.c | 405 ++++++++++++++++++ .../drivers/radeonsi/si_shader_internal.h | 3 - src/gallium/drivers/radeonsi/si_shader_llvm.c | 61 ++- .../radeonsi/si_shader_llvm_resources.c | 267 ------------ src/gallium/drivers/radeonsi/si_shader_nir.c | 6 +- 7 files changed, 495 insertions(+), 304 deletions(-) delete mode 100644 src/gallium/drivers/radeonsi/si_shader_llvm_resources.c diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index c807a4bbe4e..81d2f449e21 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -2485,7 +2485,6 @@ static void get_image_coords(struct ac_nir_context *ctx, const nir_intrinsic_ins LLVMConstInt(ctx->ac.i32, 2, false), LLVMConstInt(ctx->ac.i32, 3, false), }; - LLVMValueRef sample_index = NULL; int count; ASSERTED bool add_frag_pos = @@ -2495,25 +2494,6 @@ static void get_image_coords(struct ac_nir_context *ctx, const nir_intrinsic_ins assert(!add_frag_pos && "Input attachments should be lowered by this point."); count = image_type_to_components_count(dim, is_array); - if (ctx->ac.gfx_level < GFX11 && - is_ms && (instr->intrinsic == nir_intrinsic_image_deref_load || - instr->intrinsic == nir_intrinsic_bindless_image_load || - instr->intrinsic == nir_intrinsic_image_deref_sparse_load || - instr->intrinsic == nir_intrinsic_bindless_image_sparse_load)) { - LLVMValueRef fmask_load_address[3]; - - fmask_load_address[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); - fmask_load_address[1] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[1], ""); - if (is_array) - fmask_load_address[2] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[2], ""); - else - fmask_load_address[2] = NULL; - - sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); - sample_index = adjust_sample_index_using_fmask( - &ctx->ac, fmask_load_address[0], fmask_load_address[1], fmask_load_address[2], - sample_index, get_image_descriptor(ctx, instr, dynamic_desc_index, AC_DESC_FMASK, false)); - } if (count == 1 && !gfx9_1d) { if (instr->src[1].ssa->num_components) args->coords[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], ""); @@ -2577,9 +2557,8 @@ static void get_image_coords(struct ac_nir_context *ctx, const nir_intrinsic_ins } if (is_ms) { - if (!sample_index) - sample_index = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); - args->coords[count] = sample_index; + /* sample index */ + args->coords[count] = ac_llvm_extract_elem(&ctx->ac, get_src(ctx, instr->src[2]), 0); count++; } } @@ -2647,7 +2626,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intri res = ac_trim_vector(&ctx->ac, res, instr->dest.ssa.num_components); res = ac_to_integer(&ctx->ac, res); - } else if (instr->intrinsic == nir_intrinsic_image_deref_samples_identical) { + } else if (instr->intrinsic == 
nir_intrinsic_bindless_image_fragment_mask_load_amd) { assert(ctx->ac.gfx_level < GFX11); args.opcode = ac_image_load; @@ -2659,8 +2638,6 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intri args.a16 = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.coords[0])) == 16; res = ac_build_image_opcode(&ctx->ac, &args); - res = LLVMBuildExtractElement(ctx->ac.builder, res, ctx->ac.i32_0, ""); - res = LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, res, ctx->ac.i32_0, ""); } else { bool level_zero = nir_src_is_const(instr->src[3]) && nir_src_as_uint(instr->src[3]) == 0; @@ -3823,6 +3800,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins break; case nir_intrinsic_bindless_image_load: case nir_intrinsic_bindless_image_sparse_load: + case nir_intrinsic_bindless_image_fragment_mask_load_amd: result = visit_image_load(ctx, instr, true); break; case nir_intrinsic_image_deref_load: @@ -4611,6 +4589,8 @@ static void tex_fetch_ptrs(struct ac_nir_context *ctx, nir_tex_instr *instr, LLVMValueRef *samp_ptr, LLVMValueRef *fmask_ptr, bool divergent) { + bool texture_handle_divergent = false; + bool sampler_handle_divergent = false; LLVMValueRef texture_dynamic_handle = NULL; LLVMValueRef sampler_dynamic_handle = NULL; nir_deref_instr *texture_deref_instr = NULL; @@ -4637,10 +4617,14 @@ static void tex_fetch_ptrs(struct ac_nir_context *ctx, nir_tex_instr *instr, else *samp_ptr = val; } else { - if (instr->src[i].src_type == nir_tex_src_texture_handle) + bool divergent = instr->src[i].src.ssa->divergent; + if (instr->src[i].src_type == nir_tex_src_texture_handle) { texture_dynamic_handle = val; - else + texture_handle_divergent = divergent; + } else { sampler_dynamic_handle = val; + sampler_handle_divergent = divergent; + } } break; } @@ -4671,11 +4655,23 @@ static void tex_fetch_ptrs(struct ac_nir_context *ctx, nir_tex_instr *instr, } if (texture_dynamic_handle || sampler_dynamic_handle) { + /* instr->sampler_non_uniform and texture_non_uniform are always false in GLSL, + * but this can lead to unexpected behavior if texture/sampler index come from + * a vertex attribute. + * For instance, 2 consecutive draws using 2 different index values, + * could be squashed together by the hw - producing a single draw with + * non-dynamically uniform index. + * To avoid this, detect divergent indexing, and use enter_waterfall. + * See https://gitlab.freedesktop.org/mesa/mesa/-/issues/2253. 
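For reference, the divergence check added to tex_fetch_ptrs() boils down to a small predicate. A minimal C sketch, under the assumption that the three inputs come from instr->{texture,sampler}_non_uniform, NIR divergence analysis on the handle source, and abi->use_waterfall_for_divergent_tex_samplers respectively; the helper name is hypothetical, not a Mesa API:

#include <stdbool.h>

/* Sketch: when must a dynamic texture/sampler handle enter a waterfall loop? */
static bool needs_waterfall(bool non_uniform, bool handle_divergent,
                            bool abi_waterfall_for_divergent)
{
   /* Either the instruction is explicitly marked non-uniform, or the ABI
    * requested waterfalls whenever divergence analysis flags the handle. */
   return non_uniform || (abi_waterfall_for_divergent && handle_divergent);
}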
+ */ + /* descriptor handles given through nir_tex_src_{texture,sampler}_handle */ - if (instr->texture_non_uniform) + if (instr->texture_non_uniform || + (ctx->abi->use_waterfall_for_divergent_tex_samplers && texture_handle_divergent)) texture_dynamic_handle = enter_waterfall(ctx, &wctx[0], texture_dynamic_handle, divergent); - if (instr->sampler_non_uniform) + if (instr->sampler_non_uniform || + (ctx->abi->use_waterfall_for_divergent_tex_samplers && sampler_handle_divergent)) sampler_dynamic_handle = enter_waterfall(ctx, &wctx[1], sampler_dynamic_handle, divergent); if (texture_dynamic_handle) diff --git a/src/gallium/drivers/radeonsi/meson.build b/src/gallium/drivers/radeonsi/meson.build index 40e9f254223..2d37d56a7c0 100644 --- a/src/gallium/drivers/radeonsi/meson.build +++ b/src/gallium/drivers/radeonsi/meson.build @@ -56,7 +56,6 @@ files_libradeonsi = files( 'si_shader_llvm.c', 'si_shader_llvm_gs.c', 'si_shader_llvm_ps.c', - 'si_shader_llvm_resources.c', 'si_shader_llvm_tess.c', 'si_shader_llvm_vs.c', 'si_shader_nir.c', diff --git a/src/gallium/drivers/radeonsi/si_nir_lower_resource.c b/src/gallium/drivers/radeonsi/si_nir_lower_resource.c index b6104251121..c6636c8b7f6 100644 --- a/src/gallium/drivers/radeonsi/si_nir_lower_resource.c +++ b/src/gallium/drivers/radeonsi/si_nir_lower_resource.c @@ -115,6 +115,168 @@ static nir_ssa_def *load_ssbo_desc(nir_builder *b, nir_src *index, return nir_load_smem_amd(b, 4, addr, offset); } +static nir_ssa_def *fixup_image_desc(nir_builder *b, nir_ssa_def *rsrc, bool uses_store, + struct lower_resource_state *s) +{ + struct si_shader_selector *sel = s->shader->selector; + struct si_screen *screen = sel->screen; + + /** + * Given a 256-bit resource descriptor, force the DCC enable bit to off. + * + * At least on Tonga, executing image stores on images with DCC enabled and + * non-trivial can eventually lead to lockups. This can occur when an + * application binds an image as read-only but then uses a shader that writes + * to it. The OpenGL spec allows almost arbitrarily bad behavior (including + * program termination) in this case, but it doesn't cost much to be a bit + * nicer: disabling DCC in the shader still leads to undefined results but + * avoids the lockup. + */ + if (uses_store && + screen->info.gfx_level <= GFX9 && + screen->info.gfx_level >= GFX8) { + nir_ssa_def *tmp = nir_channel(b, rsrc, 6); + tmp = nir_iand_imm(b, tmp, C_008F28_COMPRESSION_EN); + rsrc = nir_vector_insert_imm(b, rsrc, tmp, 6); + } + + if (!uses_store && + screen->info.has_image_load_dcc_bug && + screen->always_allow_dcc_stores) { + nir_ssa_def *tmp = nir_channel(b, rsrc, 6); + tmp = nir_iand_imm(b, tmp, C_00A018_WRITE_COMPRESS_ENABLE); + rsrc = nir_vector_insert_imm(b, rsrc, tmp, 6); + } + + return rsrc; +} + +/* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should + * adjust "index" to point to FMASK. 
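For reference, fixup_image_desc() above only rewrites dword 6 of the 8-dword descriptor. A CPU-side sketch of the same masking, with a hypothetical clear-mask value standing in for the real C_008F28_COMPRESSION_EN / C_00A018_WRITE_COMPRESS_ENABLE masks from sid.h:

#include <stdint.h>

/* Hypothetical clear-mask: every bit set except the compression-enable bit. */
#define DCC_CLEAR_MASK 0xDFFFFFFFu

/* Sketch: force the DCC/compression bit in dword 6 of a 256-bit image
 * descriptor to zero, mirroring what the NIR pass does on the SSA vector. */
static void force_dcc_off_cpu(uint32_t desc[8])
{
   desc[6] &= DCC_CLEAR_MASK;
}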
+ */ +static nir_ssa_def *load_image_desc(nir_builder *b, nir_ssa_def *list, nir_ssa_def *index, + enum ac_descriptor_type desc_type, bool uses_store, + struct lower_resource_state *s) +{ + /* index is in uvec8 unit, convert to offset in bytes */ + nir_ssa_def *offset = nir_ishl_imm(b, index, 5); + + unsigned num_channels; + if (desc_type == AC_DESC_BUFFER) { + offset = nir_iadd_imm(b, offset, 16); + num_channels = 4; + } else { + assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK); + num_channels = 8; + } + + nir_ssa_def *rsrc = nir_load_smem_amd(b, num_channels, list, offset); + + if (desc_type == AC_DESC_IMAGE) + rsrc = fixup_image_desc(b, rsrc, uses_store, s); + + return rsrc; +} + +static nir_ssa_def *deref_to_index(nir_builder *b, + nir_deref_instr *deref, + unsigned max_slots, + nir_ssa_def **dynamic_index_ret, + unsigned *const_index_ret) +{ + unsigned const_index = 0; + nir_ssa_def *dynamic_index = NULL; + while (deref->deref_type != nir_deref_type_var) { + assert(deref->deref_type == nir_deref_type_array); + unsigned array_size = MAX2(glsl_get_aoa_size(deref->type), 1); + + if (nir_src_is_const(deref->arr.index)) { + const_index += array_size * nir_src_as_uint(deref->arr.index); + } else { + nir_ssa_def *tmp = nir_imul_imm(b, deref->arr.index.ssa, array_size); + dynamic_index = dynamic_index ? nir_iadd(b, dynamic_index, tmp) : tmp; + } + + deref = nir_deref_instr_parent(deref); + } + + unsigned base_index = deref->var->data.binding; + const_index += base_index; + + /* Redirect invalid resource indices to the first array element. */ + if (const_index >= max_slots) + const_index = base_index; + + nir_ssa_def *index = nir_imm_int(b, const_index); + if (dynamic_index) { + index = nir_iadd(b, dynamic_index, index); + + /* From the GL_ARB_shader_image_load_store extension spec: + * + * If a shader performs an image load, store, or atomic + * operation using an image variable declared as an array, + * and if the index used to select an individual element is + * negative or greater than or equal to the size of the + * array, the results of the operation are undefined but may + * not lead to termination. + */ + index = clamp_index(b, index, max_slots); + } + + if (dynamic_index_ret) + *dynamic_index_ret = dynamic_index; + if (const_index_ret) + *const_index_ret = const_index; + + return index; +} + +static nir_ssa_def *load_deref_image_desc(nir_builder *b, nir_deref_instr *deref, + enum ac_descriptor_type desc_type, bool is_load, + struct lower_resource_state *s) +{ + unsigned const_index; + nir_ssa_def *dynamic_index; + nir_ssa_def *index = deref_to_index(b, deref, s->shader->selector->info.base.num_images, + &dynamic_index, &const_index); + + nir_ssa_def *desc; + if (!dynamic_index && desc_type != AC_DESC_FMASK && + const_index < s->shader->selector->cs_num_images_in_user_sgprs) { + /* Fast path if the image is in user SGPRs. */ + desc = ac_nir_load_arg(b, &s->args->ac, s->args->cs_image[const_index]); + + if (desc_type == AC_DESC_IMAGE) + desc = fixup_image_desc(b, desc, !is_load, s); + } else { + /* FMASKs are separate from images. 
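For reference, the offset arithmetic in load_image_desc() can be written out in plain C; the helper below is a sketch, not driver code:

#include <stdbool.h>
#include <stdint.h>

/* Sketch: the image list is an array of 8-dword (32-byte) descriptors,
 * so an index in uvec8 units becomes a byte offset via "index << 5".
 * Buffer descriptors occupy dwords [4:7] of the slot, hence +16 bytes
 * and only 4 channels loaded instead of 8. */
static uint32_t image_desc_byte_offset(uint32_t index, bool is_buffer)
{
   uint32_t offset = index << 5;  /* index * 32 bytes */
   if (is_buffer)
      offset += 16;               /* skip dwords [0:3] */
   return offset;
}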
*/ + if (desc_type == AC_DESC_FMASK) + index = nir_iadd_imm(b, index, SI_NUM_IMAGES); + + index = nir_isub(b, nir_imm_int(b, SI_NUM_IMAGE_SLOTS - 1), index); + + nir_ssa_def *list = ac_nir_load_arg(b, &s->args->ac, s->args->samplers_and_images); + desc = load_image_desc(b, list, index, desc_type, !is_load, s); + } + + return desc; +} + +static nir_ssa_def *load_bindless_image_desc(nir_builder *b, nir_ssa_def *index, + enum ac_descriptor_type desc_type, bool is_load, + struct lower_resource_state *s) +{ + /* Bindless image descriptors use 16-dword slots. */ + index = nir_ishl_imm(b, index, 1); + + /* FMASK is right after the image. */ + if (desc_type == AC_DESC_FMASK) + index = nir_iadd_imm(b, index, 1); + + nir_ssa_def *list = ac_nir_load_arg(b, &s->args->ac, s->args->bindless_samplers_and_images); + return load_image_desc(b, list, index, desc_type, !is_load, s); +} + static bool lower_resource_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin, struct lower_resource_state *s) { @@ -161,6 +323,103 @@ static bool lower_resource_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin nir_instr_remove(&intrin->instr); break; } + case nir_intrinsic_image_deref_load: + case nir_intrinsic_image_deref_sparse_load: + case nir_intrinsic_image_deref_fragment_mask_load_amd: + case nir_intrinsic_image_deref_store: + case nir_intrinsic_image_deref_atomic_add: + case nir_intrinsic_image_deref_atomic_imin: + case nir_intrinsic_image_deref_atomic_umin: + case nir_intrinsic_image_deref_atomic_fmin: + case nir_intrinsic_image_deref_atomic_imax: + case nir_intrinsic_image_deref_atomic_umax: + case nir_intrinsic_image_deref_atomic_fmax: + case nir_intrinsic_image_deref_atomic_and: + case nir_intrinsic_image_deref_atomic_or: + case nir_intrinsic_image_deref_atomic_xor: + case nir_intrinsic_image_deref_atomic_exchange: + case nir_intrinsic_image_deref_atomic_comp_swap: + case nir_intrinsic_image_deref_atomic_fadd: + case nir_intrinsic_image_deref_atomic_inc_wrap: + case nir_intrinsic_image_deref_atomic_dec_wrap: + case nir_intrinsic_image_deref_descriptor_amd: { + assert(!(nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)); + + nir_deref_instr *deref = nir_src_as_deref(intrin->src[0]); + + enum ac_descriptor_type desc_type; + if (intrin->intrinsic == nir_intrinsic_image_deref_fragment_mask_load_amd) { + desc_type = AC_DESC_FMASK; + } else { + enum glsl_sampler_dim dim = glsl_get_sampler_dim(deref->type); + desc_type = dim == GLSL_SAMPLER_DIM_BUF ? 
AC_DESC_BUFFER : AC_DESC_IMAGE; + } + + bool is_load = + intrin->intrinsic == nir_intrinsic_image_deref_load || + intrin->intrinsic == nir_intrinsic_image_deref_sparse_load || + intrin->intrinsic == nir_intrinsic_image_deref_fragment_mask_load_amd || + intrin->intrinsic == nir_intrinsic_image_deref_descriptor_amd; + + nir_ssa_def *desc = load_deref_image_desc(b, deref, desc_type, is_load, s); + + if (intrin->intrinsic == nir_intrinsic_image_deref_descriptor_amd) { + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc); + nir_instr_remove(&intrin->instr); + } else { + nir_intrinsic_set_image_dim(intrin, glsl_get_sampler_dim(deref->type)); + nir_intrinsic_set_image_array(intrin, glsl_sampler_type_is_array(deref->type)); + nir_rewrite_image_intrinsic(intrin, desc, true); + } + break; + } + case nir_intrinsic_bindless_image_load: + case nir_intrinsic_bindless_image_sparse_load: + case nir_intrinsic_bindless_image_fragment_mask_load_amd: + case nir_intrinsic_bindless_image_store: + case nir_intrinsic_bindless_image_atomic_add: + case nir_intrinsic_bindless_image_atomic_imin: + case nir_intrinsic_bindless_image_atomic_umin: + case nir_intrinsic_bindless_image_atomic_fmin: + case nir_intrinsic_bindless_image_atomic_imax: + case nir_intrinsic_bindless_image_atomic_umax: + case nir_intrinsic_bindless_image_atomic_fmax: + case nir_intrinsic_bindless_image_atomic_and: + case nir_intrinsic_bindless_image_atomic_or: + case nir_intrinsic_bindless_image_atomic_xor: + case nir_intrinsic_bindless_image_atomic_exchange: + case nir_intrinsic_bindless_image_atomic_comp_swap: + case nir_intrinsic_bindless_image_atomic_fadd: + case nir_intrinsic_bindless_image_atomic_inc_wrap: + case nir_intrinsic_bindless_image_atomic_dec_wrap: { + assert(!(nir_intrinsic_access(intrin) & ACCESS_NON_UNIFORM)); + + enum ac_descriptor_type desc_type; + if (intrin->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd) { + desc_type = AC_DESC_FMASK; + } else { + enum glsl_sampler_dim dim = nir_intrinsic_image_dim(intrin); + desc_type = dim == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE; + } + + bool is_load = + intrin->intrinsic == nir_intrinsic_bindless_image_load || + intrin->intrinsic == nir_intrinsic_bindless_image_sparse_load || + intrin->intrinsic == nir_intrinsic_bindless_image_fragment_mask_load_amd || + intrin->intrinsic == nir_intrinsic_bindless_image_descriptor_amd; + + nir_ssa_def *index = nir_u2u32(b, intrin->src[0].ssa); + + nir_ssa_def *desc = load_bindless_image_desc(b, index, desc_type, is_load, s); + + if (intrin->intrinsic == nir_intrinsic_bindless_image_descriptor_amd) { + nir_ssa_def_rewrite_uses(&intrin->dest.ssa, desc); + nir_instr_remove(&intrin->instr); + } else { + nir_instr_rewrite_src(&intrin->instr, &intrin->src[0], nir_src_for_ssa(desc)); + } + break; + } default: return false; } @@ -168,6 +427,148 @@ static bool lower_resource_intrinsic(nir_builder *b, nir_intrinsic_instr *intrin return true; } +static nir_ssa_def *load_sampler_desc(nir_builder *b, nir_ssa_def *list, nir_ssa_def *index, + enum ac_descriptor_type desc_type) +{ + /* index is in 16 dword unit, convert to offset in bytes */ + nir_ssa_def *offset = nir_ishl_imm(b, index, 6); + + unsigned num_channels = 0; + switch (desc_type) { + case AC_DESC_IMAGE: + /* The image is at [0:7]. */ + num_channels = 8; + break; + case AC_DESC_BUFFER: + /* The buffer is in [4:7]. */ + offset = nir_iadd_imm(b, offset, 16); + num_channels = 4; + break; + case AC_DESC_FMASK: + /* The FMASK is at [8:15]. 
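For reference, the combined texture slot addressed by load_sampler_desc() is 16 dwords (64 bytes), which is where "index << 6" comes from. A sketch of the full layout, with illustrative desc_type encodings rather than the real ac_descriptor_type values:

#include <stdint.h>

struct desc_loc {
   uint32_t byte_offset;
   unsigned num_channels;
};

/* Sketch: sub-descriptor placement inside one 64-byte combined slot.
 *   image   dwords [0:7]   -> +0  bytes, 8 channels
 *   buffer  dwords [4:7]   -> +16 bytes, 4 channels
 *   fmask   dwords [8:15]  -> +32 bytes, 8 channels
 *   sampler dwords [12:15] -> +48 bytes, 4 channels */
static struct desc_loc sampler_desc_loc(uint32_t index, unsigned desc_type)
{
   uint32_t base = index << 6;  /* index * 64 bytes */
   switch (desc_type) {
   case 0:  return (struct desc_loc){base,      8}; /* image */
   case 1:  return (struct desc_loc){base + 16, 4}; /* buffer */
   case 2:  return (struct desc_loc){base + 32, 8}; /* fmask */
   default: return (struct desc_loc){base + 48, 4}; /* sampler */
   }
}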
*/ + offset = nir_iadd_imm(b, offset, 32); + num_channels = 8; + break; + case AC_DESC_SAMPLER: + /* The sampler state is at [12:15]. */ + offset = nir_iadd_imm(b, offset, 48); + num_channels = 4; + break; + default: + unreachable("invalid desc type"); + break; + } + + return nir_load_smem_amd(b, num_channels, list, offset); +} + +static nir_ssa_def *load_deref_sampler_desc(nir_builder *b, nir_deref_instr *deref, + enum ac_descriptor_type desc_type, + struct lower_resource_state *s, + bool return_descriptor) +{ + unsigned max_slots = BITSET_LAST_BIT(b->shader->info.textures_used); + nir_ssa_def *index = deref_to_index(b, deref, max_slots, NULL, NULL); + index = nir_iadd_imm(b, index, SI_NUM_IMAGE_SLOTS / 2); + + /* return actual desc when required by caller */ + if (return_descriptor) { + nir_ssa_def *list = ac_nir_load_arg(b, &s->args->ac, s->args->samplers_and_images); + return load_sampler_desc(b, list, index, desc_type); + } + + /* Just use index here and let nir-to-llvm backend to translate to actual + * descriptor. This is because we need waterfall to handle non-dynamic-uniform + * index there. + */ + return index; +} + +static nir_ssa_def *load_bindless_sampler_desc(nir_builder *b, nir_ssa_def *index, + enum ac_descriptor_type desc_type, + struct lower_resource_state *s) +{ + nir_ssa_def *list = ac_nir_load_arg(b, &s->args->ac, s->args->bindless_samplers_and_images); + + /* 64 bit to 32 bit */ + index = nir_u2u32(b, index); + + return load_sampler_desc(b, list, index, desc_type); +} + +static bool lower_resource_tex(nir_builder *b, nir_tex_instr *tex, + struct lower_resource_state *s) +{ + assert(!tex->texture_non_uniform && !tex->sampler_non_uniform); + + nir_deref_instr *texture_deref = NULL; + nir_deref_instr *sampler_deref = NULL; + nir_ssa_def *texture_handle = NULL; + nir_ssa_def *sampler_handle = NULL; + + for (unsigned i = 0; i < tex->num_srcs; i++) { + switch (tex->src[i].src_type) { + case nir_tex_src_texture_deref: + texture_deref = nir_src_as_deref(tex->src[i].src); + break; + case nir_tex_src_sampler_deref: + sampler_deref = nir_src_as_deref(tex->src[i].src); + break; + case nir_tex_src_texture_handle: + texture_handle = tex->src[i].src.ssa; + break; + case nir_tex_src_sampler_handle: + sampler_handle = tex->src[i].src.ssa; + break; + default: + break; + } + } + + enum ac_descriptor_type desc_type; + if (tex->op == nir_texop_fragment_mask_fetch_amd) + desc_type = AC_DESC_FMASK; + else + desc_type = tex->sampler_dim == GLSL_SAMPLER_DIM_BUF ? AC_DESC_BUFFER : AC_DESC_IMAGE; + + bool is_descriptor_op = tex->op == nir_texop_descriptor_amd; + nir_ssa_def *image = texture_deref ? 
+ load_deref_sampler_desc(b, texture_deref, desc_type, s, is_descriptor_op) : + load_bindless_sampler_desc(b, texture_handle, desc_type, s); + + nir_ssa_def *sampler = NULL; + if (sampler_deref) + sampler = load_deref_sampler_desc(b, sampler_deref, AC_DESC_SAMPLER, s, false); + else if (sampler_handle) + sampler = load_bindless_sampler_desc(b, sampler_handle, AC_DESC_SAMPLER, s); + + if (is_descriptor_op) { + nir_ssa_def_rewrite_uses(&tex->dest.ssa, image); + nir_instr_remove(&tex->instr); + } else { + for (unsigned i = 0; i < tex->num_srcs; i++) { + switch (tex->src[i].src_type) { + case nir_tex_src_texture_deref: + tex->src[i].src_type = nir_tex_src_texture_handle; + FALLTHROUGH; + case nir_tex_src_texture_handle: + nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[i].src, image); + break; + case nir_tex_src_sampler_deref: + tex->src[i].src_type = nir_tex_src_sampler_handle; + FALLTHROUGH; + case nir_tex_src_sampler_handle: + nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[i].src, sampler); + break; + default: + break; + } + } + } + + return true; +} + static bool lower_resource_instr(nir_builder *b, nir_instr *instr, void *state) { struct lower_resource_state *s = (struct lower_resource_state *)state; @@ -179,6 +580,10 @@ static bool lower_resource_instr(nir_builder *b, nir_instr *instr, void *state) nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr); return lower_resource_intrinsic(b, intrin, s); } + case nir_instr_type_tex: { + nir_tex_instr *tex = nir_instr_as_tex(instr); + return lower_resource_tex(b, tex, s); + } default: return false; } diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h index be5c1fd377e..d1265594e25 100644 --- a/src/gallium/drivers/radeonsi/si_shader_internal.h +++ b/src/gallium/drivers/radeonsi/si_shader_internal.h @@ -257,9 +257,6 @@ void si_llvm_build_monolithic_ps(struct si_shader_context *ctx, struct si_shader void si_llvm_ps_build_end(struct si_shader_context *ctx); void si_llvm_init_ps_callbacks(struct si_shader_context *ctx); -/* si_shader_llvm_resources.c */ -void si_llvm_init_resource_callbacks(struct si_shader_context *ctx); - /* si_shader_llvm_vs.c */ void si_llvm_clipvertex_to_clipdist(struct si_shader_context *ctx, struct ac_export_args clipdist[2], LLVMValueRef clipvertex[4]); diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c index 77e1df29ca0..b33558cbb6f 100644 --- a/src/gallium/drivers/radeonsi/si_shader_llvm.c +++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c @@ -724,6 +724,64 @@ static LLVMValueRef si_llvm_load_intrinsic(struct ac_shader_abi *abi, nir_intrin } } +static LLVMValueRef si_llvm_load_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, + unsigned base_index, unsigned constant_index, + LLVMValueRef dynamic_index, + enum ac_descriptor_type desc_type, bool image, + bool write, bool bindless) +{ + struct si_shader_context *ctx = si_shader_context_from_abi(abi); + LLVMBuilderRef builder = ctx->ac.builder; + + /* always 0 for OpenGL */ + assert(!descriptor_set); + + /* all image and texture has been lowered to bindless one in nir */ + assert(bindless); + + if (dynamic_index && LLVMTypeOf(dynamic_index) == ctx->ac.i32) { + /* image desc has been lowered in nir, we only expect texture here */ + assert(!image); + + bool is_vec4 = false; + LLVMValueRef index = dynamic_index; + + switch (desc_type) { + case AC_DESC_IMAGE: + /* The image is at [0:7]. 
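For reference, the LLVM callback below expresses the same 64-byte slot layout in vector-element units rather than bytes. A plain-C, assert-based sketch verifying that the two addressing schemes agree:

#include <assert.h>
#include <stdint.h>

/* Sketch: the NIR pass computes byte offsets (index << 6, plus 0/16/32/48)
 * while the LLVM callback computes element indices into v8i32/v4i32 lists.
 * One v8i32 element is 32 bytes and one v4i32 element is 16 bytes. */
static void check_llvm_vs_nir_offsets(uint32_t index)
{
   assert((index * 2u)      * 32u == (index << 6) +  0u); /* image   */
   assert((index * 4u + 1u) * 16u == (index << 6) + 16u); /* buffer  */
   assert((index * 2u + 1u) * 32u == (index << 6) + 32u); /* fmask   */
   assert((index * 4u + 3u) * 16u == (index << 6) + 48u); /* sampler */
}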
*/ + index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), ""); + break; + case AC_DESC_BUFFER: + /* The buffer is in [4:7]. */ + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1); + is_vec4 = true; + break; + case AC_DESC_FMASK: + /* The FMASK is at [8:15]. */ + assert(ctx->screen->info.gfx_level < GFX11); + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1); + break; + case AC_DESC_SAMPLER: + /* The sampler state is at [12:15]. */ + index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), + LLVMConstInt(ctx->ac.i32, 3, 0)); + is_vec4 = true; + break; + default: + unreachable("invalid desc"); + } + + struct ac_llvm_pointer list = { + .value = ac_get_arg(&ctx->ac, ctx->args->samplers_and_images), + .pointee_type = is_vec4 ? ctx->ac.v4i32 : ctx->ac.v8i32, + }; + + return ac_build_load_to_sgpr(&ctx->ac, list, index); + } + + return dynamic_index; +} + bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shader, struct nir_shader *nir, bool free_nir) { @@ -741,8 +799,8 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad ctx->abi.intrinsic_load = si_llvm_load_intrinsic; ctx->abi.export_vertex = gfx10_ngg_export_vertex; + ctx->abi.load_sampler_desc = si_llvm_load_sampler_desc; - si_llvm_init_resource_callbacks(ctx); si_llvm_create_main_func(ctx); if (ctx->stage <= MESA_SHADER_GEOMETRY && @@ -967,6 +1025,7 @@ bool si_llvm_translate_nir(struct si_shader_context *ctx, struct si_shader *shad ctx->abi.clamp_div_by_zero = ctx->screen->options.clamp_div_by_zero || info->options & SI_PROFILE_CLAMP_DIV_BY_ZERO; ctx->abi.use_waterfall_for_divergent_tex_samplers = true; + ctx->abi.disable_aniso_single_level = true; unsigned num_outputs = info->num_outputs; /* need extra output to hold primitive id added by nir ngg lower */ diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c b/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c deleted file mode 100644 index 86daf419cc8..00000000000 --- a/src/gallium/drivers/radeonsi/si_shader_llvm_resources.c +++ /dev/null @@ -1,267 +0,0 @@ -/* - * Copyright 2020 Advanced Micro Devices, Inc. - * All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * on the rights to use, copy, modify, merge, publish, distribute, sub - * license, and/or sell copies of the Software, and to permit persons to whom - * the Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice (including the next - * paragraph) shall be included in all copies or substantial portions of the - * Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL - * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, - * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR - * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE - * USE OR OTHER DEALINGS IN THE SOFTWARE. 
- */ - -#include "si_pipe.h" -#include "si_shader_internal.h" -#include "sid.h" - -/** - * Return a value that is equal to the given i32 \p index if it lies in [0,num) - * or an undefined value in the same interval otherwise. - */ -static LLVMValueRef si_llvm_bound_index(struct si_shader_context *ctx, LLVMValueRef index, - unsigned num) -{ - LLVMBuilderRef builder = ctx->ac.builder; - LLVMValueRef c_max = LLVMConstInt(ctx->ac.i32, num - 1, 0); - LLVMValueRef cc; - - if (util_is_power_of_two_or_zero(num)) { - index = LLVMBuildAnd(builder, index, c_max, ""); - } else { - /* In theory, this MAX pattern should result in code that is - * as good as the bit-wise AND above. - * - * In practice, LLVM generates worse code (at the time of - * writing), because its value tracking is not strong enough. - */ - cc = LLVMBuildICmp(builder, LLVMIntULE, index, c_max, ""); - index = LLVMBuildSelect(builder, cc, index, c_max, ""); - } - - return index; -} - -/** - * Given a 256-bit resource descriptor, force the DCC enable bit to off. - * - * At least on Tonga, executing image stores on images with DCC enabled and - * non-trivial can eventually lead to lockups. This can occur when an - * application binds an image as read-only but then uses a shader that writes - * to it. The OpenGL spec allows almost arbitrarily bad behavior (including - * program termination) in this case, but it doesn't cost much to be a bit - * nicer: disabling DCC in the shader still leads to undefined results but - * avoids the lockup. - */ -static LLVMValueRef force_dcc_off(struct si_shader_context *ctx, LLVMValueRef rsrc) -{ - if (ctx->screen->info.gfx_level <= GFX7) { - return rsrc; - } else { - LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0); - LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_008F28_COMPRESSION_EN, 0); - LLVMValueRef tmp; - - tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, ""); - tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, ""); - return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, ""); - } -} - -static LLVMValueRef force_write_compress_off(struct si_shader_context *ctx, LLVMValueRef rsrc) -{ - LLVMValueRef i32_6 = LLVMConstInt(ctx->ac.i32, 6, 0); - LLVMValueRef i32_C = LLVMConstInt(ctx->ac.i32, C_00A018_WRITE_COMPRESS_ENABLE, 0); - LLVMValueRef tmp; - - tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, ""); - tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, ""); - return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, ""); -} - -static LLVMValueRef fixup_image_desc(struct si_shader_context *ctx, LLVMValueRef rsrc, - bool uses_store) -{ - if (uses_store && ctx->ac.gfx_level <= GFX9) - rsrc = force_dcc_off(ctx, rsrc); - - if (!uses_store && ctx->screen->info.has_image_load_dcc_bug && - ctx->screen->always_allow_dcc_stores) - rsrc = force_write_compress_off(ctx, rsrc); - - return rsrc; -} - -/* AC_DESC_FMASK is handled exactly like AC_DESC_IMAGE. The caller should - * adjust "index" to point to FMASK. 
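For reference, the removed si_llvm_bound_index() chose between two clamping strategies, and the clamp_index() call in the new NIR pass serves the same purpose. A plain-C sketch (assumes num > 0):

#include <stdbool.h>
#include <stdint.h>

static bool is_pow2_or_zero(uint32_t n)
{
   return (n & (n - 1)) == 0;
}

/* Sketch: keep "index" inside [0, num).  A power-of-two size allows a
 * cheap bitwise AND; otherwise fall back to a compare+select (MAX-style)
 * clamp, which LLVM historically optimized less well. */
static uint32_t bound_index(uint32_t index, uint32_t num)
{
   uint32_t max = num - 1;
   if (is_pow2_or_zero(num))
      return index & max;            /* wraps into range */
   return index <= max ? index : max;
}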
*/ -static LLVMValueRef si_load_image_desc(struct si_shader_context *ctx, struct ac_llvm_pointer list, - LLVMValueRef index, enum ac_descriptor_type desc_type, - bool uses_store, bool bindless) -{ - LLVMValueRef rsrc; - - if (desc_type == AC_DESC_BUFFER) { - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1); - list.pointee_type = ctx->ac.v4i32; - } else { - assert(desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_FMASK); - } - - if (bindless) - rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index); - else - rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index); - - if (desc_type == AC_DESC_IMAGE) - rsrc = fixup_image_desc(ctx, rsrc, uses_store); - - return rsrc; -} - -/** - * Load an image view, fmask view. or sampler state descriptor. - */ -static LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx, struct ac_llvm_pointer list, - LLVMValueRef index, enum ac_descriptor_type type) -{ - LLVMBuilderRef builder = ctx->ac.builder; - - switch (type) { - case AC_DESC_IMAGE: - /* The image is at [0:7]. */ - index = LLVMBuildMul(builder, index, LLVMConstInt(ctx->ac.i32, 2, 0), ""); - break; - case AC_DESC_BUFFER: - /* The buffer is in [4:7]. */ - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), ctx->ac.i32_1); - list.pointee_type = ctx->ac.v4i32; - break; - case AC_DESC_FMASK: - /* The FMASK is at [8:15]. */ - assert(ctx->screen->info.gfx_level < GFX11); - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 2, 0), ctx->ac.i32_1); - break; - case AC_DESC_SAMPLER: - /* The sampler state is at [12:15]. */ - index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->ac.i32, 4, 0), - LLVMConstInt(ctx->ac.i32, 3, 0)); - list.pointee_type = ctx->ac.v4i32; - break; - case AC_DESC_PLANE_0: - case AC_DESC_PLANE_1: - case AC_DESC_PLANE_2: - /* Only used for the multiplane image support for Vulkan. Should - * never be reached in radeonsi. - */ - unreachable("Plane descriptor requested in radeonsi."); - } - - return ac_build_load_to_sgpr(&ctx->ac, list, index); -} - -static LLVMValueRef si_nir_load_sampler_desc(struct ac_shader_abi *abi, unsigned descriptor_set, - unsigned base_index, unsigned constant_index, - LLVMValueRef dynamic_index, - enum ac_descriptor_type desc_type, bool image, - bool write, bool bindless) -{ - struct si_shader_context *ctx = si_shader_context_from_abi(abi); - LLVMBuilderRef builder = ctx->ac.builder; - unsigned const_index = base_index + constant_index; - - assert(!descriptor_set); - assert(desc_type <= AC_DESC_BUFFER); - - if (bindless) { - struct ac_llvm_pointer list = ac_get_ptr_arg(&ctx->ac, &ctx->args->ac, ctx->args->bindless_samplers_and_images); - - /* dynamic_index is the bindless handle */ - if (image) { - /* Bindless image descriptors use 16-dword slots. */ - dynamic_index = - LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), ""); - /* FMASK is right after the image. */ - if (desc_type == AC_DESC_FMASK) { - dynamic_index = LLVMBuildAdd(ctx->ac.builder, dynamic_index, ctx->ac.i32_1, ""); - } - - return si_load_image_desc(ctx, list, dynamic_index, desc_type, write, true); - } - - /* Since bindless handle arithmetic can contain an unsigned integer - * wraparound and si_load_sampler_desc assumes there isn't any, - * use GEP without "inbounds" (inside ac_build_pointer_add) - * to prevent incorrect code generation and hangs. 
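For reference, the wraparound this comment warns about is ordinary 32-bit modular arithmetic; an "inbounds" GEP would let LLVM assume it cannot happen and miscompile. A standalone demonstration with an illustrative handle value:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
   uint32_t handle = 0xFFFFFFF0u;  /* hypothetical bindless handle */
   uint32_t slot = handle * 2u;    /* 16-dword slots: wraps to 0xFFFFFFE0 */
   printf("slot index = 0x%08x\n", slot);
   return 0;
}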
- */ - dynamic_index = - LLVMBuildMul(ctx->ac.builder, dynamic_index, LLVMConstInt(ctx->ac.i64, 2, 0), ""); - list.v = ac_build_pointer_add(&ctx->ac, ctx->ac.v8i32, list.v, dynamic_index); - return si_load_sampler_desc(ctx, list, ctx->ac.i32_0, desc_type); - } - - unsigned num_slots = image ? ctx->num_images : ctx->num_samplers; - - /* Redirect invalid resource indices to the first array element. */ - if (const_index >= num_slots) - const_index = base_index; - - struct ac_llvm_pointer list = ac_get_ptr_arg(&ctx->ac, &ctx->args->ac, ctx->args->samplers_and_images); - LLVMValueRef index = LLVMConstInt(ctx->ac.i32, const_index, false); - - if (dynamic_index) { - index = LLVMBuildAdd(builder, index, dynamic_index, ""); - - /* From the GL_ARB_shader_image_load_store extension spec: - * - * If a shader performs an image load, store, or atomic - * operation using an image variable declared as an array, - * and if the index used to select an individual element is - * negative or greater than or equal to the size of the - * array, the results of the operation are undefined but may - * not lead to termination. - */ - index = si_llvm_bound_index(ctx, index, num_slots); - } - - if (image) { - /* Fast path if the image is in user SGPRs. */ - if (!dynamic_index && - const_index < ctx->shader->selector->cs_num_images_in_user_sgprs && - (desc_type == AC_DESC_IMAGE || desc_type == AC_DESC_BUFFER)) { - LLVMValueRef rsrc = ac_get_arg(&ctx->ac, ctx->args->cs_image[const_index]); - - if (desc_type == AC_DESC_IMAGE) - rsrc = fixup_image_desc(ctx, rsrc, write); - return rsrc; - } - - /* FMASKs are separate from images. */ - if (desc_type == AC_DESC_FMASK) { - index = - LLVMBuildAdd(ctx->ac.builder, index, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGES, 0), ""); - } - index = LLVMBuildSub(ctx->ac.builder, LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS - 1, 0), - index, ""); - return si_load_image_desc(ctx, list, index, desc_type, write, false); - } - - index = LLVMBuildAdd(ctx->ac.builder, index, - LLVMConstInt(ctx->ac.i32, SI_NUM_IMAGE_SLOTS / 2, 0), ""); - return si_load_sampler_desc(ctx, list, index, desc_type); -} - -void si_llvm_init_resource_callbacks(struct si_shader_context *ctx) -{ - ctx->abi.load_sampler_desc = si_nir_load_sampler_desc; -} diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c index c704bfd312f..44c78433d79 100644 --- a/src/gallium/drivers/radeonsi/si_shader_nir.c +++ b/src/gallium/drivers/radeonsi/si_shader_nir.c @@ -264,16 +264,18 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir) * and copy-propagated */ - static const struct nir_lower_tex_options lower_tex_options = { + const struct nir_lower_tex_options lower_tex_options = { .lower_txp = ~0u, .lower_txs_cube_array = true, .lower_invalid_implicit_lod = true, .lower_tg4_offsets = true, + .lower_to_fragment_fetch_amd = sscreen->info.gfx_level < GFX11, }; NIR_PASS_V(nir, nir_lower_tex, &lower_tex_options); - static const struct nir_lower_image_options lower_image_options = { + const struct nir_lower_image_options lower_image_options = { .lower_cube_size = true, + .lower_to_fragment_mask_load_amd = sscreen->info.gfx_level < GFX11, }; NIR_PASS_V(nir, nir_lower_image, &lower_image_options);
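One consequence worth noting in the si_shader_nir.c hunk: lower_tex_options and lower_image_options lose their "static const" qualifier because the new FMASK-lowering flags now depend on the chip generation at runtime. A minimal sketch of that pattern, with illustrative struct and field names rather than the NIR API:

#include <stdbool.h>

struct msaa_lower_options {
   bool lower_to_fragment_fetch;
};

/* Sketch: GFX11 removed FMASK, so MSAA fetches are only lowered to the
 * fragment-mask intrinsics on older generations; the option can no longer
 * be a compile-time constant. */
static void init_msaa_lower_options(struct msaa_lower_options *opts,
                                    int gfx_level, int gfx11_level)
{
   opts->lower_to_fragment_fetch = gfx_level < gfx11_level;
}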