diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c index 4d8c5da4d5e..e7b82fa7f8a 100644 --- a/src/amd/llvm/ac_llvm_build.c +++ b/src/amd/llvm/ac_llvm_build.c @@ -2164,6 +2164,29 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_ assert(!a->d16 || (ctx->chip_class >= GFX8 && a->opcode != ac_image_atomic && a->opcode != ac_image_atomic_cmpswap && a->opcode != ac_image_get_lod && a->opcode != ac_image_get_resinfo)); + assert(!a->a16 || ctx->chip_class >= GFX9); + assert(a->g16 == a->a16 || ctx->chip_class >= GFX10); + + assert(!a->offset || + ac_get_elem_bits(ctx, LLVMTypeOf(a->offset)) == 32); + assert(!a->bias || + ac_get_elem_bits(ctx, LLVMTypeOf(a->bias)) == 32); + assert(!a->compare || + ac_get_elem_bits(ctx, LLVMTypeOf(a->compare)) == 32); + assert(!a->derivs[0] || + ((!a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 16) && + (a->g16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->derivs[0])) == 32))); + assert(!a->coords[0] || + ((!a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 16) && + (a->a16 || ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0])) == 32))); + assert(!a->lod || + ((a->opcode != ac_image_get_resinfo || ac_get_elem_bits(ctx, LLVMTypeOf(a->lod))) && + (a->opcode == ac_image_get_resinfo || + ac_get_elem_bits(ctx, LLVMTypeOf(a->lod)) == + ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0]))))); + assert(!a->min_lod || + ac_get_elem_bits(ctx, LLVMTypeOf(a->min_lod)) == + ac_get_elem_bits(ctx, LLVMTypeOf(a->coords[0]))); if (a->opcode == ac_image_get_lod) { switch (dim) { @@ -2184,7 +2207,7 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_ bool atomic = a->opcode == ac_image_atomic || a->opcode == ac_image_atomic_cmpswap; bool load = a->opcode == ac_image_sample || a->opcode == ac_image_gather4 || a->opcode == ac_image_load || a->opcode == ac_image_load_mip; - LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32; + LLVMTypeRef coord_type = sample ? (a->a16 ? ctx->f16 : ctx->f32) : (a->a16 ? ctx->i16 : ctx->i32); uint8_t dmask = a->dmask; LLVMTypeRef data_type; char data_type_str[32]; @@ -2225,7 +2248,7 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_ unsigned count = ac_num_derivs(dim); for (unsigned i = 0; i < count; ++i) args[num_args++] = ac_to_float(ctx, a->derivs[i]); - overload[num_overloads++] = ".f32"; + overload[num_overloads++] = a->g16 ? ".f16" : ".f32"; } unsigned num_coords = a->opcode != ac_image_get_resinfo ? ac_num_coords(dim) : 0; for (unsigned i = 0; i < num_coords; ++i) @@ -2235,7 +2258,7 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_ if (a->min_lod) args[num_args++] = LLVMBuildBitCast(ctx->builder, a->min_lod, coord_type, ""); - overload[num_overloads++] = sample ? ".f32" : ".i32"; + overload[num_overloads++] = sample ? (a->a16 ? ".f16" : ".f32") : (a->a16 ? ".i16" : ".i32"); args[num_args++] = a->resource; if (sample) { @@ -3373,6 +3396,7 @@ void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LL fmask_load.coords[1] = addr[1]; if (is_array_tex) fmask_load.coords[2] = addr[2]; + fmask_load.a16 = ac_get_elem_bits(ac, LLVMTypeOf(addr[0])) == 16; LLVMValueRef fmask_value = ac_build_image_opcode(ac, &fmask_load); fmask_value = LLVMBuildExtractElement(ac->builder, fmask_value, ac->i32_0, ""); @@ -3380,11 +3404,15 @@ void ac_apply_fmask_to_sample(struct ac_llvm_context *ac, LLVMValueRef fmask, LL /* Apply the formula. */ unsigned sample_chan = is_array_tex ? 3 : 2; LLVMValueRef final_sample; - final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], LLVMConstInt(ac->i32, 4, 0), ""); - final_sample = LLVMBuildLShr(ac->builder, fmask_value, final_sample, ""); + final_sample = LLVMBuildMul(ac->builder, addr[sample_chan], + LLVMConstInt(LLVMTypeOf(addr[0]), 4, 0), ""); + final_sample = LLVMBuildLShr(ac->builder, fmask_value, + LLVMBuildZExt(ac->builder, final_sample, ac->i32, ""), ""); /* Mask the sample index by 0x7, because 0x8 means an unknown value * with EQAA, so those will map to 0. */ final_sample = LLVMBuildAnd(ac->builder, final_sample, LLVMConstInt(ac->i32, 0x7, 0), ""); + if (fmask_load.a16) + final_sample = LLVMBuildTrunc(ac->builder, final_sample, ac->i16, ""); /* Don't rewrite the sample index if WORD1.DATA_FORMAT of the FMASK * resource descriptor is 0 (invalid). diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h index 5a4a61a0e71..5763276ce2e 100644 --- a/src/amd/llvm/ac_llvm_build.h +++ b/src/amd/llvm/ac_llvm_build.h @@ -403,7 +403,9 @@ struct ac_image_args { unsigned cache_policy : 3; bool unorm : 1; bool level_zero : 1; - bool d16 : 1; /* data and return values are 16-bit, requires GFX8+ */ + bool d16 : 1; /* GFX8+: data and return values are 16-bit */ + bool a16 : 1; /* GFX9+: address components except compare, offset and bias are 16-bit */ + bool g16 : 1; /* GFX10+: derivatives are 16-bit; GFX<=9: must be equal to a16 */ bool tfe : 1; unsigned attributes; /* additional call-site specific AC_FUNC_ATTRs */ diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c index 3dac927a6cc..a70eb0a64b0 100644 --- a/src/amd/llvm/ac_nir_to_llvm.c +++ b/src/amd/llvm/ac_nir_to_llvm.c @@ -1430,6 +1430,11 @@ static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, const nir_te unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa); assert(instr->dest.is_ssa); + + /* Buffers don't support A16. */ + if (args->a16) + args->coords[0] = LLVMBuildZExt(ctx->ac.builder, args->coords[0], ctx->ac.i32, ""); + return ac_build_buffer_load_format(&ctx->ac, args->resource, args->coords[0], ctx->ac.i32_0, util_last_bit(mask), 0, true, instr->dest.ssa.bit_size == 16, @@ -4179,6 +4184,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) switch (instr->src[i].src_type) { case nir_tex_src_coord: { LLVMValueRef coord = get_src(ctx, instr->src[i].src); + args.a16 = instr->src[i].src.ssa->bit_size == 16; for (unsigned chan = 0; chan < instr->coord_components; ++chan) args.coords[chan] = ac_llvm_extract_elem(&ctx->ac, coord, chan); break; @@ -4189,22 +4195,25 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) if (instr->is_shadow) { args.compare = get_src(ctx, instr->src[i].src); args.compare = ac_to_float(&ctx->ac, args.compare); + assert(instr->src[i].src.ssa->bit_size == 32); } break; case nir_tex_src_offset: args.offset = get_src(ctx, instr->src[i].src); offset_src = i; + /* We pack it with bit shifts, so we need it to be 32-bit. */ + assert(ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.offset)) == 32); break; case nir_tex_src_bias: args.bias = get_src(ctx, instr->src[i].src); + assert(ac_get_elem_bits(&ctx->ac, LLVMTypeOf(args.bias)) == 32); break; - case nir_tex_src_lod: { + case nir_tex_src_lod: if (nir_src_is_const(instr->src[i].src) && nir_src_as_uint(instr->src[i].src) == 0) args.level_zero = true; else args.lod = get_src(ctx, instr->src[i].src); break; - } case nir_tex_src_ms_index: sample_index = get_src(ctx, instr->src[i].src); break; @@ -4212,9 +4221,11 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) break; case nir_tex_src_ddx: ddx = get_src(ctx, instr->src[i].src); + args.g16 = instr->src[i].src.ssa->bit_size == 16; break; case nir_tex_src_ddy: ddy = get_src(ctx, instr->src[i].src); + assert(LLVMTypeOf(ddy) == LLVMTypeOf(ddx)); break; case nir_tex_src_min_lod: args.min_lod = get_src(ctx, instr->src[i].src); @@ -4342,8 +4353,9 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) ac_to_float(&ctx->ac, ac_llvm_extract_elem(&ctx->ac, ddy, i)); } for (unsigned i = num_src_deriv_channels; i < num_dest_deriv_channels; i++) { - args.derivs[i] = ctx->ac.f32_0; - args.derivs[num_dest_deriv_channels + i] = ctx->ac.f32_0; + LLVMValueRef zero = args.g16 ? ctx->ac.f16_0 : ctx->ac.f32_0; + args.derivs[i] = zero; + args.derivs[num_dest_deriv_channels + i] = zero; } } @@ -4351,7 +4363,7 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) for (unsigned chan = 0; chan < instr->coord_components; chan++) args.coords[chan] = ac_to_float(&ctx->ac, args.coords[chan]); if (instr->coord_components == 3) - args.coords[3] = LLVMGetUndef(ctx->ac.f32); + args.coords[3] = LLVMGetUndef(args.a16 ? ctx->ac.f16 : ctx->ac.f32); ac_prepare_cube_coords(&ctx->ac, instr->op == nir_texop_txd, instr->is_array, instr->op == nir_texop_lod, args.coords, args.derivs); } @@ -4375,9 +4387,9 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) instr->op != nir_texop_lod) { LLVMValueRef filler; if (instr->op == nir_texop_txf) - filler = ctx->ac.i32_0; + filler = args.a16 ? ctx->ac.i16_0 : ctx->ac.i32_0; else - filler = LLVMConstReal(ctx->ac.f32, 0.5); + filler = LLVMConstReal(args.a16 ? ctx->ac.f16 : ctx->ac.f32, 0.5); if (instr->is_array) args.coords[2] = args.coords[1]; @@ -4417,6 +4429,8 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr) num_offsets = MIN2(num_offsets, instr->coord_components); for (unsigned i = 0; i < num_offsets; ++i) { LLVMValueRef off = ac_llvm_extract_elem(&ctx->ac, args.offset, i); + if (args.a16) + off = LLVMBuildTrunc(ctx->ac.builder, off, ctx->ac.i16, ""); args.coords[i] = LLVMBuildAdd(ctx->ac.builder, args.coords[i], off, ""); } args.offset = NULL;