ac/llvm: rewrite and unify how GLC, DLC, SLC are set

Use ACCESS_* flags at call sites instead of GLC/DLC/SLC.

The ACCESS_* flags are extended to describe other aspects of memory instructions,
such as whether an operation is a load, store, atomic, or SMEM access.

A new function then converts these access flags into the hardware GLC, DLC, and SLC bits.

The new functions are also usable by ACO.
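
As an illustrative aside (not part of the original commit message), the call-site pattern changes like this; the example mirrors the si_write_tess_factors hunk at the end of the diff, with arguments simplified:

    /* before: callers passed hw-specific cache bits directly */
    ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, NULL, byteoffset, tf_base, ac_glc);
    /* after: callers pass chip-agnostic access flags */
    ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, NULL, byteoffset, tf_base, ACCESS_COHERENT);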

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/22770>
Marek Olšák, 2023-04-27 03:49:10 -04:00, committed by Marge Bot
parent 968db0208d
commit f98871608c
6 changed files with 270 additions and 123 deletions

@@ -1014,3 +1014,143 @@ void ac_get_scratch_tmpring_size(const struct radeon_info *info,
*tmpring_size = S_0286E8_WAVES(max_scratch_waves) |
S_0286E8_WAVESIZE(*max_seen_bytes_per_wave >> size_shift);
}
/* Get chip-agnostic memory instruction access flags (as opposed to chip-specific GLC/DLC/SLC)
* from a NIR memory intrinsic.
*/
enum gl_access_qualifier ac_get_mem_access_flags(const nir_intrinsic_instr *instr)
{
enum gl_access_qualifier access =
nir_intrinsic_has_access(instr) ? nir_intrinsic_access(instr) : 0;
/* Determine ACCESS_MAY_STORE_SUBDWORD. (for the GFX6 TC L1 bug workaround) */
if (!nir_intrinsic_infos[instr->intrinsic].has_dest) {
switch (instr->intrinsic) {
case nir_intrinsic_bindless_image_store:
access |= ACCESS_MAY_STORE_SUBDWORD;
break;
case nir_intrinsic_store_ssbo:
case nir_intrinsic_store_buffer_amd:
case nir_intrinsic_store_global:
case nir_intrinsic_store_global_amd:
if (access & ACCESS_USES_FORMAT_AMD ||
(nir_intrinsic_has_align_offset(instr) && nir_intrinsic_align(instr) % 4 != 0) ||
((instr->src[0].ssa->bit_size / 8) * instr->src[0].ssa->num_components) % 4 != 0)
access |= ACCESS_MAY_STORE_SUBDWORD;
break;
default:
unreachable("unexpected store instruction");
}
}
return access;
}
/* Convert chip-agnostic memory access flags into hw-specific cache flags.
*
* "access" must be a result of ac_get_mem_access_flags() with the appropriate ACCESS_TYPE_*
* flags set.
*/
union ac_hw_cache_flags ac_get_hw_cache_flags(enum amd_gfx_level gfx_level,
enum gl_access_qualifier access)
{
union ac_hw_cache_flags result;
result.value = 0;
assert(util_bitcount(access & (ACCESS_TYPE_LOAD | ACCESS_TYPE_STORE |
ACCESS_TYPE_ATOMIC)) == 1);
assert(!(access & ACCESS_TYPE_SMEM) || access & ACCESS_TYPE_LOAD);
assert(!(access & ACCESS_IS_SWIZZLED_AMD) || !(access & ACCESS_TYPE_SMEM));
assert(!(access & ACCESS_MAY_STORE_SUBDWORD) || access & ACCESS_TYPE_STORE);
bool scope_is_device = access & (ACCESS_COHERENT | ACCESS_VOLATILE);
if (gfx_level >= GFX11) {
/* GFX11 simplified it and exposes what is actually useful.
*
* GLC means device scope for loads only. (stores and atomics are always device scope)
* SLC means non-temporal for GL1 and GL2 caches. (GL1 = hit-evict, GL2 = stream, unavailable in SMEM)
* DLC means non-temporal for MALL. (noalloc, i.e. coherent bypass)
*
* GL0 doesn't have a non-temporal flag, so you always get LRU caching in CU scope.
*/
if (access & ACCESS_TYPE_LOAD && scope_is_device)
result.value |= ac_glc;
if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM))
result.value |= ac_slc;
} else if (gfx_level >= GFX10) {
/* GFX10-10.3:
*
* VMEM and SMEM loads (SMEM only supports the first four):
* !GLC && !DLC && !SLC means CU scope <== use for normal loads with CU scope
* GLC && !DLC && !SLC means SA scope
* !GLC && DLC && !SLC means CU scope, GL1 bypass
* GLC && DLC && !SLC means device scope <== use for normal loads with device scope
* !GLC && !DLC && SLC means CU scope, non-temporal (GL0 = GL1 = hit-evict, GL2 = stream) <== use for non-temporal loads with CU scope
* GLC && !DLC && SLC means SA scope, non-temporal (GL1 = hit-evict, GL2 = stream)
* !GLC && DLC && SLC means CU scope, GL0 non-temporal, GL1-GL2 coherent bypass (GL0 = hit-evict, GL1 = bypass, GL2 = noalloc)
* GLC && DLC && SLC means device scope, GL2 coherent bypass (noalloc) <== use for non-temporal loads with device scope
*
* VMEM stores/atomics (stores are CU scope only if they overwrite the whole cache line,
* atomics are always device scope, GL1 is always bypassed):
* !GLC && !DLC && !SLC means CU scope <== use for normal stores with CU scope
* GLC && !DLC && !SLC means device scope <== use for normal stores with device scope
* !GLC && DLC && !SLC means CU scope, GL2 non-coherent bypass
* GLC && DLC && !SLC means device scope, GL2 non-coherent bypass
* !GLC && !DLC && SLC means CU scope, GL2 non-temporal (stream) <== use for non-temporal stores with CU scope
* GLC && !DLC && SLC means device scope, GL2 non-temporal (stream) <== use for non-temporal stores with device scope
* !GLC && DLC && SLC means CU scope, GL2 coherent bypass (noalloc)
* GLC && DLC && SLC means device scope, GL2 coherent bypass (noalloc)
*
* "stream" allows write combining in GL2. "coherent bypass" doesn't.
* "non-coherent bypass" doesn't guarantee ordering with any coherent stores.
*/
if (scope_is_device && !(access & ACCESS_TYPE_ATOMIC))
result.value |= ac_glc | (access & ACCESS_TYPE_LOAD ? ac_dlc : 0);
if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM))
result.value |= ac_slc;
} else {
/* GFX6-GFX9:
*
* VMEM loads:
* !GLC && !SLC means CU scope
* GLC && !SLC means (GFX6: device scope, GFX7-9: device scope [*])
* !GLC && SLC means (GFX6: CU scope, GFX7: device scope, GFX8-9: CU scope), GL2 non-temporal (stream)
* GLC && SLC means device scope, GL2 non-temporal (stream)
*
* VMEM stores (atomics don't have [*]):
* !GLC && !SLC means (GFX6: CU scope, GFX7-9: device scope [*])
* GLC && !SLC means (GFX6-7: device scope, GFX8-9: device scope [*])
* !GLC && SLC means (GFX6: CU scope, GFX7-9: device scope [*]), GL2 non-temporal (stream)
* GLC && SLC means device scope, GL2 non-temporal (stream)
*
* [*] data can be cached in GL1 for future CU scope
*
* SMEM loads:
* GLC means device scope (available on GFX8+)
*/
if (scope_is_device && !(access & ACCESS_TYPE_ATOMIC)) {
/* SMEM doesn't support the device scope on GFX6-7. */
assert(gfx_level >= GFX8 || !(access & ACCESS_TYPE_SMEM));
result.value |= ac_glc;
}
if (access & ACCESS_NON_TEMPORAL && !(access & ACCESS_TYPE_SMEM))
result.value |= ac_slc;
/* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All store opcodes not
* aligned to a dword are affected.
*/
if (gfx_level == GFX6 && access & ACCESS_MAY_STORE_SUBDWORD)
result.value |= ac_glc;
}
if (access & ACCESS_IS_SWIZZLED_AMD)
result.value |= ac_swizzled;
return result;
}
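
Aside: a minimal sketch (not from the commit) of how the two new helpers compose when lowering a NIR store; gfx_level and instr are assumed to come from the surrounding lowering context:

    enum gl_access_qualifier access = ac_get_mem_access_flags(instr);
    union ac_hw_cache_flags cache =
       ac_get_hw_cache_flags(gfx_level, access | ACCESS_TYPE_STORE);
    /* e.g. a coherent store yields cache.value == ac_glc on GFX10-10.3,
     * but cache.value == 0 on GFX11, where stores are always device scope. */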

@@ -46,6 +46,41 @@ extern "C" {
#define AC_SENDMSG_GS_OP_EMIT (2 << 4)
#define AC_SENDMSG_GS_OP_EMIT_CUT (3 << 4)
/* An extension of gl_access_qualifier describing other aspects of memory operations
* for code generation.
*/
enum {
/* Only one of LOAD/STORE/ATOMIC can be set. */
ACCESS_TYPE_LOAD = BITFIELD_BIT(27),
ACCESS_TYPE_STORE = BITFIELD_BIT(28),
ACCESS_TYPE_ATOMIC = BITFIELD_BIT(29),
/* This access is expected to use an SMEM instruction if source operands are non-divergent.
* Only loads can set this.
*/
ACCESS_TYPE_SMEM = BITFIELD_BIT(30),
/* Whether a store offset or size alignment is less than 4. */
ACCESS_MAY_STORE_SUBDWORD = BITFIELD_BIT(31),
};
/* The meaning of these enums is different between chips. They match LLVM definitions,
* but they can also be used by ACO. Use ac_get_hw_cache_flags to get these.
*/
enum ac_cache_flags
{
ac_glc = BITFIELD_BIT(0),
ac_slc = BITFIELD_BIT(1),
ac_dlc = BITFIELD_BIT(2),
ac_swizzled = BITFIELD_BIT(3),
};
union ac_hw_cache_flags
{
/* NOTE: This will contain more fields in the future. */
enum ac_cache_flags value;
};
enum ac_image_dim
{
ac_image_1d,
@@ -199,6 +234,11 @@ ac_ngg_get_scratch_lds_size(gl_shader_stage stage,
bool streamout_enabled,
bool can_cull);
enum gl_access_qualifier ac_get_mem_access_flags(const nir_intrinsic_instr *instr);
union ac_hw_cache_flags ac_get_hw_cache_flags(enum amd_gfx_level gfx_level,
enum gl_access_qualifier access);
#ifdef __cplusplus
}
#endif
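
Aside: an illustration (not part of the header) of composing the extended bits with the base qualifiers; the expected result follows from the GFX10 branch of ac_get_hw_cache_flags above:

    /* A device-scope SMEM load on GFX10-10.3 maps to GLC && DLC. */
    enum gl_access_qualifier acc = ACCESS_COHERENT | ACCESS_TYPE_LOAD | ACCESS_TYPE_SMEM;
    union ac_hw_cache_flags hw = ac_get_hw_cache_flags(GFX10, acc);
    assert(hw.value == (ac_glc | ac_dlc));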

@@ -1221,23 +1221,15 @@ LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
return ac_build_load_custom(ctx, ptr.t, ptr.v, index, true, true, false);
}
static unsigned get_load_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
static unsigned get_cache_flags(struct ac_llvm_context *ctx, enum gl_access_qualifier access)
{
return cache_policy |
(ctx->gfx_level >= GFX10 && ctx->gfx_level < GFX11 && cache_policy & ac_glc ? ac_dlc : 0);
}
static unsigned get_store_cache_policy(struct ac_llvm_context *ctx, unsigned cache_policy)
{
if (ctx->gfx_level >= GFX11)
cache_policy &= ~ac_glc; /* GLC has no effect on stores */
return cache_policy;
return ac_get_hw_cache_flags(ctx->gfx_level, access).value;
}
static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef data, LLVMValueRef vindex,
LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy, bool use_format)
enum gl_access_qualifier access, bool use_format)
{
LLVMValueRef args[6];
int idx = 0;
@@ -1247,7 +1239,7 @@ static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueR
args[idx++] = vindex ? vindex : ctx->i32_0;
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
args[idx++] = LLVMConstInt(ctx->i32, get_store_cache_policy(ctx, cache_policy), 0);
args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_STORE), 0);
const char *indexing_kind = vindex ? "struct" : "raw";
char name[256], type_name[8];
@@ -1264,15 +1256,15 @@ static void ac_build_buffer_store_common(struct ac_llvm_context *ctx, LLVMValueR
}
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy)
LLVMValueRef vindex, LLVMValueRef voffset, enum gl_access_qualifier access)
{
ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, cache_policy, true);
ac_build_buffer_store_common(ctx, rsrc, data, vindex, voffset, NULL, access, true);
}
/* buffer_store_dword(,x2,x3,x4) <- the suffix is selected by the type of vdata. */
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy)
enum gl_access_qualifier access)
{
unsigned num_channels = ac_get_llvm_num_components(vdata);
@@ -1288,19 +1280,19 @@ void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
voffset2 = LLVMBuildAdd(ctx->builder, voffset ? voffset : ctx->i32_0,
LLVMConstInt(ctx->i32, 8, 0), "");
ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, cache_policy);
ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, cache_policy);
ac_build_buffer_store_dword(ctx, rsrc, v01, vindex, voffset, soffset, access);
ac_build_buffer_store_dword(ctx, rsrc, v[2], vindex, voffset2, soffset, access);
return;
}
ac_build_buffer_store_common(ctx, rsrc, ac_to_float(ctx, vdata), vindex, voffset, soffset,
cache_policy, false);
access, false);
}
static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vindex, LLVMValueRef voffset,
LLVMValueRef soffset, unsigned num_channels,
LLVMTypeRef channel_type, unsigned cache_policy,
LLVMTypeRef channel_type, enum gl_access_qualifier access,
bool can_speculate, bool use_format)
{
LLVMValueRef args[5];
@@ -1310,7 +1302,7 @@ static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLV
args[idx++] = vindex;
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);
unsigned func =
!ac_has_vec3_support(ctx->gfx_level, use_format) && num_channels == 3 ? 4 : num_channels;
const char *indexing_kind = vindex ? "struct" : "raw";
@@ -1339,11 +1331,10 @@ static LLVMValueRef ac_build_buffer_load_common(struct ac_llvm_context *ctx, LLV
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
LLVMTypeRef channel_type, unsigned cache_policy,
LLVMTypeRef channel_type, enum gl_access_qualifier access,
bool can_speculate, bool allow_smem)
{
if (allow_smem && !(cache_policy & ac_slc) &&
(!(cache_policy & ac_glc) || ctx->gfx_level >= GFX8)) {
if (allow_smem && (!(access & ACCESS_COHERENT) || ctx->gfx_level >= GFX8)) {
assert(vindex == NULL);
LLVMValueRef result[32];
@@ -1365,7 +1356,8 @@ LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc
LLVMValueRef args[3] = {
rsrc,
offset,
LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0),
LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD |
ACCESS_TYPE_SMEM), 0),
};
result[i] = ac_build_intrinsic(ctx, name, channel_type, args, 3, AC_ATTR_INVARIANT_LOAD);
}
@@ -1386,7 +1378,7 @@ LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc
LLVMConstInt(ctx->i32, i * ac_get_type_size(channel_type), 0), "");
LLVMValueRef item =
ac_build_buffer_load_common(ctx, rsrc, vindex, fetch_voffset, soffset, fetch_num_channels,
channel_type, cache_policy, can_speculate, false);
channel_type, access, can_speculate, false);
result = ac_build_concat(ctx, result, item);
}
@@ -1395,13 +1387,13 @@ LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vindex, LLVMValueRef voffset,
unsigned num_channels, unsigned cache_policy,
unsigned num_channels, enum gl_access_qualifier access,
bool can_speculate, bool d16, bool tfe)
{
if (tfe) {
assert(!d16);
cache_policy = get_load_cache_policy(ctx, cache_policy);
unsigned cache_flags = get_cache_flags(ctx, access | ACCESS_TYPE_LOAD);
char code[256];
/* The definition in the assembly and the one in the constraint string
@@ -1415,9 +1407,9 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueR
"v_mov_b32 v4, 0\n"
"buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
"s_waitcnt vmcnt(0)",
cache_policy & ac_glc ? "glc" : "",
cache_policy & ac_slc ? "slc" : "",
cache_policy & ac_dlc ? "dlc" : "");
cache_flags & ac_glc ? "glc" : "",
cache_flags & ac_slc ? "slc" : "",
cache_flags & ac_dlc ? "dlc" : "");
LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
@@ -1435,7 +1427,7 @@ LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueR
}
return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0,
num_channels, d16 ? ctx->f16 : ctx->f32, cache_policy,
num_channels, d16 ? ctx->f16 : ctx->f32, access,
can_speculate, true);
}
@@ -1443,7 +1435,7 @@ static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValue
LLVMValueRef vindex, LLVMValueRef voffset,
LLVMValueRef soffset, unsigned num_channels,
unsigned tbuffer_format, LLVMTypeRef channel_type,
unsigned cache_policy, bool can_speculate)
enum gl_access_qualifier access, bool can_speculate)
{
LLVMValueRef args[6];
int idx = 0;
@@ -1453,7 +1445,7 @@ static LLVMValueRef ac_build_tbuffer_load(struct ac_llvm_context *ctx, LLVMValue
args[idx++] = voffset ? voffset : ctx->i32_0;
args[idx++] = soffset ? soffset : ctx->i32_0;
args[idx++] = LLVMConstInt(ctx->i32, tbuffer_format, 0);
args[idx++] = LLVMConstInt(ctx->i32, get_load_cache_policy(ctx, cache_policy), 0);
args[idx++] = LLVMConstInt(ctx->i32, get_cache_flags(ctx, access | ACCESS_TYPE_LOAD), 0);
const char *indexing_kind = vindex ? "struct" : "raw";
char name[256], type_name[8];
@@ -1474,7 +1466,7 @@ LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRe
unsigned align_offset,
unsigned align_mul,
unsigned num_channels,
unsigned cache_policy,
enum gl_access_qualifier access,
bool can_speculate)
{
const unsigned max_channels = vtx_info->num_channels;
@@ -1503,7 +1495,7 @@ LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRe
LLVMValueRef item =
ac_build_tbuffer_load(ctx, rsrc, vidx, fetch_voffset, soffset,
fetch_num_channels, fetch_format, channel_type,
cache_policy, can_speculate);
access, can_speculate);
result = ac_build_concat(ctx, result, item);
}
@@ -1513,35 +1505,35 @@ LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRe
LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy)
enum gl_access_qualifier access)
{
return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i16,
cache_policy, false, false);
access, false, false);
}
LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy)
enum gl_access_qualifier access)
{
return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, cache_policy,
return ac_build_buffer_load_common(ctx, rsrc, NULL, voffset, soffset, 1, ctx->i8, access,
false, false);
}
void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy)
enum gl_access_qualifier access)
{
vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i16, "");
ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false);
ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, access, false);
}
void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy)
LLVMValueRef voffset, LLVMValueRef soffset, enum gl_access_qualifier access)
{
vdata = LLVMBuildBitCast(ctx->builder, vdata, ctx->i8, "");
ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, cache_policy, false);
ac_build_buffer_store_common(ctx, rsrc, vdata, NULL, voffset, soffset, access, false);
}
/**
@@ -2025,7 +2017,11 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_
args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
args[num_args++] = LLVMConstInt(
ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);
ctx->i32, get_cache_flags(ctx,
a->access |
(atomic ? ACCESS_TYPE_ATOMIC :
load ? ACCESS_TYPE_LOAD : ACCESS_TYPE_STORE)),
false);
const char *name;
const char *atomic_subop = "";

@@ -281,28 +281,28 @@ LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
void ac_build_buffer_store_dword(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy);
enum gl_access_qualifier access);
void ac_build_buffer_store_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef data,
LLVMValueRef vindex, LLVMValueRef voffset, unsigned cache_policy);
LLVMValueRef vindex, LLVMValueRef voffset, enum gl_access_qualifier access);
LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc, int num_channels,
LLVMValueRef vindex, LLVMValueRef voffset, LLVMValueRef soffset,
LLVMTypeRef channel_type, unsigned cache_policy,
LLVMTypeRef channel_type, enum gl_access_qualifier access,
bool can_speculate, bool allow_smem);
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vindex, LLVMValueRef voffset,
unsigned num_channels, unsigned cache_policy,
unsigned num_channels, enum gl_access_qualifier access,
bool can_speculate, bool d16, bool tfe);
LLVMValueRef ac_build_buffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy);
enum gl_access_qualifier access);
LLVMValueRef ac_build_buffer_load_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy);
enum gl_access_qualifier access);
LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vindex, LLVMValueRef voffset,
@@ -312,15 +312,15 @@ LLVMValueRef ac_build_safe_tbuffer_load(struct ac_llvm_context *ctx, LLVMValueRe
unsigned align_offset,
unsigned align_mul,
unsigned num_channels,
unsigned cache_policy,
enum gl_access_qualifier access,
bool can_speculate);
void ac_build_buffer_store_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vdata, LLVMValueRef voffset, LLVMValueRef soffset,
unsigned cache_policy);
enum gl_access_qualifier access);
void ac_build_buffer_store_byte(struct ac_llvm_context *ctx, LLVMValueRef rsrc, LLVMValueRef vdata,
LLVMValueRef voffset, LLVMValueRef soffset, unsigned cache_policy);
LLVMValueRef voffset, LLVMValueRef soffset, enum gl_access_qualifier access);
void ac_set_range_metadata(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned lo,
unsigned hi);
@@ -391,21 +391,12 @@ enum ac_atomic_op
ac_atomic_fmax,
};
/* These cache policy bits match the definitions used by the LLVM intrinsics. */
enum ac_image_cache_policy
{
ac_glc = 1 << 0, /* per-CU cache control */
ac_slc = 1 << 1, /* global L2 cache control */
ac_dlc = 1 << 2, /* per-shader-array cache control */
ac_swizzled = 1 << 3, /* the access is swizzled, disabling load/store merging */
};
struct ac_image_args {
enum ac_image_opcode opcode;
enum ac_atomic_op atomic; /* for the ac_image_atomic opcode */
enum ac_image_dim dim;
enum gl_access_qualifier access;
unsigned dmask : 4;
unsigned cache_policy : 3;
bool unorm : 1;
bool level_zero : 1;
bool d16 : 1; /* GFX8+: data and return values are 16-bit */
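
Aside: with enum ac_image_cache_policy gone, image call sites keep the NIR access flags in args.access, and the type bit is ORed in at emit time, as ac_build_image_opcode does above. A hedged sketch (atomic and load are assumed booleans from the surrounding code):

    enum gl_access_qualifier type_bit =
       atomic ? ACCESS_TYPE_ATOMIC : load ? ACCESS_TYPE_LOAD : ACCESS_TYPE_STORE;
    unsigned cache_flags = ac_get_hw_cache_flags(gfx_level, args.access | type_bit).value;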

@@ -1804,26 +1804,6 @@ static LLVMValueRef extract_vector_range(struct ac_llvm_context *ctx, LLVMValueR
}
}
static unsigned get_cache_policy(struct ac_nir_context *ctx, enum gl_access_qualifier access,
bool may_store_unaligned)
{
unsigned cache_policy = 0;
/* GFX6 has a TC L1 bug causing corruption of 8bit/16bit stores. All
* store opcodes not aligned to a dword are affected. The only way to
* get unaligned stores is through shader images.
*/
if (((may_store_unaligned && ctx->ac.gfx_level == GFX6) ||
access & (ACCESS_COHERENT | ACCESS_VOLATILE))) {
cache_policy |= ac_glc;
}
if (access & ACCESS_NON_TEMPORAL)
cache_policy |= ac_slc | ac_glc;
return cache_policy;
}
static LLVMValueRef enter_waterfall_ssbo(struct ac_nir_context *ctx, struct waterfall_context *wctx,
const nir_intrinsic_instr *instr, nir_src src)
{
@@ -1841,8 +1821,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *in
LLVMValueRef src_data = get_src(ctx, instr->src[0]);
int elem_size_bytes = ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src_data)) / 8;
unsigned writemask = nir_intrinsic_write_mask(instr);
enum gl_access_qualifier access = nir_intrinsic_access(instr);
unsigned cache_policy = get_cache_policy(ctx, access, false);
enum gl_access_qualifier access = ac_get_mem_access_flags(instr);
struct waterfall_context wctx;
LLVMValueRef rsrc_base = enter_waterfall_ssbo(ctx, &wctx, instr, instr->src[1]);
@@ -1897,9 +1876,9 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *in
LLVMConstInt(ctx->ac.i32, start * elem_size_bytes, false), "");
if (num_bytes == 1) {
ac_build_buffer_store_byte(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, cache_policy);
ac_build_buffer_store_byte(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, access);
} else if (num_bytes == 2) {
ac_build_buffer_store_short(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, cache_policy);
ac_build_buffer_store_short(&ctx->ac, rsrc, data, offset, ctx->ac.i32_0, access);
} else {
switch (num_bytes) {
case 16: /* v4f32 */
@@ -1920,7 +1899,7 @@ static void visit_store_ssbo(struct ac_nir_context *ctx, nir_intrinsic_instr *in
data = LLVMBuildBitCast(ctx->ac.builder, data, data_type, "");
ac_build_buffer_store_dword(&ctx->ac, rsrc, data, NULL, offset,
ctx->ac.i32_0, cache_policy);
ctx->ac.i32_0, access);
}
}
@@ -2066,11 +2045,16 @@ static LLVMValueRef visit_atomic_ssbo(struct ac_nir_context *ctx, nir_intrinsic_
data = ac_to_float(&ctx->ac, data);
return_type = LLVMTypeOf(data);
}
unsigned cache_flags =
ac_get_hw_cache_flags(ctx->ac.gfx_level,
ac_get_mem_access_flags(instr) | ACCESS_TYPE_ATOMIC).value;
params[arg_count++] = data;
params[arg_count++] = descriptor;
params[arg_count++] = get_src(ctx, instr->src[1]); /* voffset */
params[arg_count++] = ctx->ac.i32_0; /* soffset */
params[arg_count++] = ctx->ac.i32_0; /* slc */
params[arg_count++] = LLVMConstInt(ctx->ac.i32, cache_flags, 0);
ac_build_type_name_for_intr(return_type, type, sizeof(type));
snprintf(name, sizeof(name), "llvm.amdgcn.raw.buffer.atomic.%s.%s", op, type);
@@ -2095,8 +2079,7 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, nir_intrinsic_
int elem_size_bytes = instr->dest.ssa.bit_size / 8;
int num_components = instr->num_components;
enum gl_access_qualifier access = nir_intrinsic_access(instr);
unsigned cache_policy = get_cache_policy(ctx, access, false);
enum gl_access_qualifier access = ac_get_mem_access_flags(instr);
LLVMValueRef offset = get_src(ctx, instr->src[1]);
LLVMValueRef rsrc = ctx->abi->load_ssbo ?
@@ -2122,16 +2105,16 @@ static LLVMValueRef visit_load_buffer(struct ac_nir_context *ctx, nir_intrinsic_
if (load_bytes == 1) {
ret = ac_build_buffer_load_byte(&ctx->ac, rsrc, voffset, ctx->ac.i32_0,
cache_policy);
access);
} else if (load_bytes == 2) {
ret = ac_build_buffer_load_short(&ctx->ac, rsrc, voffset, ctx->ac.i32_0,
cache_policy);
access);
} else {
int num_channels = util_next_power_of_two(load_bytes) / 4;
bool can_speculate = access & ACCESS_CAN_REORDER;
ret = ac_build_buffer_load(&ctx->ac, rsrc, num_channels, vindex, voffset, ctx->ac.i32_0,
ctx->ac.f32, cache_policy, can_speculate, false);
ctx->ac.f32, access, can_speculate, false);
}
LLVMTypeRef byte_vec = LLVMVectorType(ctx->ac.i8, ac_get_type_size(LLVMTypeOf(ret)));
@@ -2507,7 +2490,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intri
struct ac_image_args args = {0};
args.cache_policy = get_cache_policy(ctx, access, false);
args.access = ac_get_mem_access_flags(instr);
args.tfe = instr->intrinsic == nir_intrinsic_bindless_image_sparse_load;
if (dim == GLSL_SAMPLER_DIM_BUF) {
@@ -2523,7 +2506,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intri
assert(instr->dest.is_ssa);
bool can_speculate = access & ACCESS_CAN_REORDER;
res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex, ctx->ac.i32_0, num_channels,
args.cache_policy, can_speculate,
args.access, can_speculate,
instr->dest.ssa.bit_size == 16,
args.tfe);
res = ac_build_expand(&ctx->ac, res, num_channels, args.tfe ? 5 : 4);
@@ -2588,14 +2571,13 @@ static void visit_image_store(struct ac_nir_context *ctx, const nir_intrinsic_in
}
enum glsl_sampler_dim dim = nir_intrinsic_image_dim(instr);
enum gl_access_qualifier access = nir_intrinsic_access(instr);
bool is_array = nir_intrinsic_image_array(instr);
struct waterfall_context wctx;
LLVMValueRef dynamic_index = enter_waterfall_image(ctx, &wctx, instr);
struct ac_image_args args = {0};
args.cache_policy = get_cache_policy(ctx, access, true);
args.access = ac_get_mem_access_flags(instr);
LLVMValueRef src = get_src(ctx, instr->src[3]);
if (instr->src[3].ssa->bit_size == 64) {
@@ -2617,7 +2599,7 @@ static void visit_image_store(struct ac_nir_context *ctx, const nir_intrinsic_in
vindex =
LLVMBuildExtractElement(ctx->ac.builder, get_src(ctx, instr->src[1]), ctx->ac.i32_0, "");
ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, ctx->ac.i32_0, args.cache_policy);
ac_build_buffer_store_format(&ctx->ac, rsrc, src, vindex, ctx->ac.i32_0, args.access);
} else {
bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
@@ -2730,9 +2712,12 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, const nir_int
} else {
LLVMTypeRef data_type = LLVMTypeOf(params[0]);
char type[8];
unsigned cache_flags =
ac_get_hw_cache_flags(ctx->ac.gfx_level,
ac_get_mem_access_flags(instr) | ACCESS_TYPE_ATOMIC).value;
params[param_count++] = ctx->ac.i32_0; /* soffset */
params[param_count++] = ctx->ac.i32_0; /* slc */
params[param_count++] = LLVMConstInt(ctx->ac.i32, cache_flags, 0);
ac_build_type_name_for_intr(data_type, type, sizeof(type));
length = snprintf(intrinsic_name, sizeof(intrinsic_name),
@@ -2752,6 +2737,7 @@ static LLVMValueRef visit_image_atomic(struct ac_nir_context *ctx, const nir_int
args.resource = ctx->abi->load_sampler_desc(ctx->abi, dynamic_index, AC_DESC_IMAGE);
get_image_coords(ctx, instr, dynamic_index, &args, dim, is_array);
args.dim = ac_get_image_dim(ctx->ac.gfx_level, dim, is_array);
args.access = ac_get_mem_access_flags(instr);
result = ac_build_image_opcode(&ctx->ac, &args);
}
@@ -3805,19 +3791,9 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
LLVMValueRef vidx = idxen ? get_src(ctx, instr->src[src_base + 3]) : NULL;
unsigned num_components = instr->dest.ssa.num_components;
unsigned const_offset = nir_intrinsic_base(instr);
bool swizzled = nir_intrinsic_access(instr) & ACCESS_IS_SWIZZLED_AMD;
bool reorder = nir_intrinsic_can_reorder(instr);
bool coherent = nir_intrinsic_access(instr) & ACCESS_COHERENT;
bool slc = nir_intrinsic_access(instr) & ACCESS_NON_TEMPORAL;
bool uses_format = nir_intrinsic_access(instr) & ACCESS_USES_FORMAT_AMD;
enum ac_image_cache_policy cache_policy = 0;
if (swizzled)
cache_policy |= ac_swizzled;
if (slc)
cache_policy |= ac_slc;
if (coherent)
cache_policy |= ac_glc;
enum gl_access_qualifier access = ac_get_mem_access_flags(instr);
bool uses_format = access & ACCESS_USES_FORMAT_AMD;
LLVMValueRef voffset = LLVMBuildAdd(ctx->ac.builder, addr_voffset,
LLVMConstInt(ctx->ac.i32, const_offset, 0), "");
@@ -3825,12 +3801,12 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
if (instr->intrinsic == nir_intrinsic_load_buffer_amd && uses_format) {
assert(instr->dest.ssa.bit_size == 16 || instr->dest.ssa.bit_size == 32);
result = ac_build_buffer_load_format(&ctx->ac, descriptor, vidx, voffset, num_components,
cache_policy, reorder,
access, reorder,
instr->dest.ssa.bit_size == 16, false);
result = ac_to_integer(&ctx->ac, result);
} else if (instr->intrinsic == nir_intrinsic_store_buffer_amd && uses_format) {
assert(instr->src[0].ssa->bit_size == 16 || instr->src[0].ssa->bit_size == 32);
ac_build_buffer_store_format(&ctx->ac, descriptor, store_data, vidx, voffset, cache_policy);
ac_build_buffer_store_format(&ctx->ac, descriptor, store_data, vidx, voffset, access);
} else if (instr->intrinsic == nir_intrinsic_load_buffer_amd ||
instr->intrinsic == nir_intrinsic_load_typed_buffer_amd) {
/* LLVM is unable to select instructions for larger than 32-bit channel types.
@@ -3843,7 +3819,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
if (instr->intrinsic == nir_intrinsic_load_buffer_amd) {
result = ac_build_buffer_load(&ctx->ac, descriptor, fetch_num_components, vidx, voffset,
addr_soffset, channel_type, cache_policy, reorder, false);
addr_soffset, channel_type, access, reorder, false);
} else {
const unsigned align_offset = nir_intrinsic_align_offset(instr);
const unsigned align_mul = nir_intrinsic_align_mul(instr);
@@ -3854,7 +3830,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
result =
ac_build_safe_tbuffer_load(&ctx->ac, descriptor, vidx, addr_voffset, addr_soffset,
channel_type, vtx_info, const_offset, align_offset,
align_mul, fetch_num_components, cache_policy, reorder);
align_mul, fetch_num_components, access, reorder);
}
/* Trim to needed vector components. */
@@ -3884,7 +3860,7 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
LLVMValueRef data = extract_vector_range(&ctx->ac, store_data, start, count);
ac_build_buffer_store_dword(&ctx->ac, descriptor, data, vidx, voffset, addr_soffset,
cache_policy);
access);
}
}
break;
@@ -3933,12 +3909,15 @@ static bool visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
LLVMValueRef data = get_src(ctx, instr->src[1]);
unsigned base = nir_intrinsic_base(instr);
LLVMTypeRef return_type = LLVMTypeOf(data);
unsigned cache_flags =
ac_get_hw_cache_flags(ctx->ac.gfx_level,
ac_get_mem_access_flags(instr) | ACCESS_TYPE_ATOMIC).value;
LLVMValueRef args[] = {
data, desc,
LLVMConstInt(ctx->ac.i32, base, false),
ctx->ac.i32_0, /* soffset */
ctx->ac.i32_0, /* cachepolicy */
LLVMConstInt(ctx->ac.i32, cache_flags, 0),
};
char name[64], type[8];

@@ -372,7 +372,8 @@ static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader
ac_build_ifcc(&ctx->ac,
LLVMBuildICmp(ctx->ac.builder, LLVMIntEQ, rel_patch_id, ctx->ac.i32_0, ""), 6504);
ac_build_buffer_store_dword(&ctx->ac, buffer, LLVMConstInt(ctx->ac.i32, 0x80000000, 0),
NULL, LLVMConstInt(ctx->ac.i32, offset, 0), tf_base, ac_glc);
NULL, LLVMConstInt(ctx->ac.i32, offset, 0), tf_base,
ACCESS_COHERENT);
ac_build_endif(&ctx->ac, 6504);
offset += 4;
}
@@ -381,13 +382,13 @@ static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader
ac_build_buffer_store_dword(&ctx->ac, buffer, vec0, NULL,
LLVMBuildAdd(ctx->ac.builder, byteoffset,
LLVMConstInt(ctx->ac.i32, offset, 0), ""),
tf_base, ac_glc);
tf_base, ACCESS_COHERENT);
offset += 16;
if (vec1)
ac_build_buffer_store_dword(&ctx->ac, buffer, vec1, NULL,
LLVMBuildAdd(ctx->ac.builder, byteoffset,
LLVMConstInt(ctx->ac.i32, offset, 0), ""),
tf_base, ac_glc);
tf_base, ACCESS_COHERENT);
/* Store the tess factors into the offchip buffer if TES reads them. */
if (shader->key.ge.part.tcs.epilog.tes_reads_tess_factors) {
@@ -405,7 +406,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader
outer_vec = ac_build_gather_values(&ctx->ac, outer, outer_comps);
ac_build_buffer_store_dword(&ctx->ac, buf, outer_vec, NULL, tf_outer_offset,
base, ac_glc);
base, ACCESS_COHERENT);
if (inner_comps) {
param_inner = si_shader_io_get_unique_index_patch(VARYING_SLOT_TESS_LEVEL_INNER);
tf_inner_offset = get_tcs_tes_buffer_address(ctx, rel_patch_id, NULL,
@@ -413,7 +414,7 @@ static void si_write_tess_factors(struct si_shader_context *ctx, union si_shader
inner_vec = ac_build_gather_values(&ctx->ac, inner, inner_comps);
ac_build_buffer_store_dword(&ctx->ac, buf, inner_vec, NULL,
tf_inner_offset, base, ac_glc);
tf_inner_offset, base, ACCESS_COHERENT);
}
}
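
Closing aside: a worked example (an assumption for illustration, not from the commit) of the GFX6 TC L1 workaround path in ac_get_hw_cache_flags:

    /* A sub-dword store on GFX6 gets GLC even without device scope. */
    enum gl_access_qualifier acc = ACCESS_TYPE_STORE | ACCESS_MAY_STORE_SUBDWORD;
    assert(ac_get_hw_cache_flags(GFX6, acc).value == ac_glc);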