diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c index 9aff2f8435d..3d7eb7b0421 100644 --- a/src/amd/common/ac_llvm_build.c +++ b/src/amd/common/ac_llvm_build.c @@ -905,6 +905,37 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); } +LLVMValueRef +ac_build_fs_interp_f16(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j) +{ + LLVMValueRef args[6]; + LLVMValueRef p1; + + args[0] = i; + args[1] = llvm_chan; + args[2] = attr_number; + args[3] = ctx->i1false; + args[4] = params; + + p1 = ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p1.f16", + ctx->f32, args, 5, AC_FUNC_ATTR_READNONE); + + args[0] = p1; + args[1] = j; + args[2] = llvm_chan; + args[3] = attr_number; + args[4] = ctx->i1false; + args[5] = params; + + return ac_build_intrinsic(ctx, "llvm.amdgcn.interp.p2.f16", + ctx->f16, args, 6, AC_FUNC_ATTR_READNONE); +} + LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter, diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h index f218eaf2832..370e7e9741c 100644 --- a/src/amd/common/ac_llvm_build.h +++ b/src/amd/common/ac_llvm_build.h @@ -216,6 +216,14 @@ ac_build_fs_interp(struct ac_llvm_context *ctx, LLVMValueRef i, LLVMValueRef j); +LLVMValueRef +ac_build_fs_interp_f16(struct ac_llvm_context *ctx, + LLVMValueRef llvm_chan, + LLVMValueRef attr_number, + LLVMValueRef params, + LLVMValueRef i, + LLVMValueRef j); + LLVMValueRef ac_build_fs_interp_mov(struct ac_llvm_context *ctx, LLVMValueRef parameter, diff --git a/src/amd/vulkan/radv_device.c b/src/amd/vulkan/radv_device.c index 46dd3b4a8f0..4bc4e6bcc03 100644 --- a/src/amd/vulkan/radv_device.c +++ b/src/amd/vulkan/radv_device.c @@ -806,7 +806,7 @@ void radv_GetPhysicalDeviceFeatures2( features->storageBuffer16BitAccess = enabled; features->uniformAndStorageBuffer16BitAccess = enabled; features->storagePushConstant16 = enabled; - features->storageInputOutput16 = enabled; + features->storageInputOutput16 = enabled && HAVE_LLVM >= 0x900; break; } case VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SAMPLER_YCBCR_CONVERSION_FEATURES: { diff --git a/src/amd/vulkan/radv_nir_to_llvm.c b/src/amd/vulkan/radv_nir_to_llvm.c index cfaeaabf539..415eb38dacc 100644 --- a/src/amd/vulkan/radv_nir_to_llvm.c +++ b/src/amd/vulkan/radv_nir_to_llvm.c @@ -92,6 +92,7 @@ struct radv_shader_context { gl_shader_stage stage; LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS * 4]; + uint64_t float16_shaded_mask; uint64_t input_mask; uint64_t output_mask; @@ -2051,6 +2052,7 @@ static void interp_fs_input(struct radv_shader_context *ctx, unsigned attr, LLVMValueRef interp_param, LLVMValueRef prim_mask, + bool float16, LLVMValueRef result[4]) { LLVMValueRef attr_number; @@ -2083,7 +2085,12 @@ static void interp_fs_input(struct radv_shader_context *ctx, for (chan = 0; chan < 4; chan++) { LLVMValueRef llvm_chan = LLVMConstInt(ctx->ac.i32, chan, false); - if (interp) { + if (interp && float16) { + result[chan] = ac_build_fs_interp_f16(&ctx->ac, + llvm_chan, + attr_number, + prim_mask, i, j); + } else if (interp) { result[chan] = ac_build_fs_interp(&ctx->ac, llvm_chan, attr_number, @@ -2095,7 +2102,30 @@ static void interp_fs_input(struct radv_shader_context *ctx, attr_number, prim_mask); result[chan] = LLVMBuildBitCast(ctx->ac.builder, result[chan], ctx->ac.i32, ""); - result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], LLVMTypeOf(interp_param), ""); + result[chan] = LLVMBuildTruncOrBitCast(ctx->ac.builder, result[chan], float16 ? ctx->ac.i16 : ctx->ac.i32, ""); + } + } +} + +static void mark_16bit_fs_input(struct radv_shader_context *ctx, + const struct glsl_type *type, + int location) +{ + if (glsl_type_is_scalar(type) || glsl_type_is_vector(type) || glsl_type_is_matrix(type)) { + unsigned attrib_count = glsl_count_attribute_slots(type, false); + if (glsl_type_is_16bit(type)) { + ctx->float16_shaded_mask |= ((1ull << attrib_count) - 1) << location; + } + } else if (glsl_type_is_array(type)) { + unsigned stride = glsl_count_attribute_slots(glsl_get_array_element(type), false); + for (unsigned i = 0; i < glsl_get_length(type); ++i) { + mark_16bit_fs_input(ctx, glsl_get_array_element(type), location + i * stride); + } + } else { + assert(glsl_type_is_struct(type)); + for (unsigned i = 0; i < glsl_get_length(type); i++) { + mark_16bit_fs_input(ctx, glsl_get_struct_field(type, i), location); + location += glsl_count_attribute_slots(glsl_get_struct_field(type, i), false); } } } @@ -2123,10 +2153,8 @@ handle_fs_input_decl(struct radv_shader_context *ctx, interp = lookup_interp_param(&ctx->abi, variable->data.interpolation, interp_type); } - bool is_16bit = glsl_type_is_16bit(glsl_without_array(variable->type)); - LLVMTypeRef type = is_16bit ? ctx->ac.i16 : ctx->ac.i32; if (interp == NULL) - interp = LLVMGetUndef(type); + interp = LLVMGetUndef(ctx->ac.i32); for (unsigned i = 0; i < attrib_count; ++i) ctx->inputs[ac_llvm_reg_index_soa(idx + i, 0)] = interp; @@ -2200,11 +2228,14 @@ handle_fs_inputs(struct radv_shader_context *ctx, if (i >= VARYING_SLOT_VAR0 || i == VARYING_SLOT_PNTC || i == VARYING_SLOT_PRIMITIVE_ID || i == VARYING_SLOT_LAYER) { interp_param = *inputs; - interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, + bool float16 = (ctx->float16_shaded_mask >> i) & 1; + interp_fs_input(ctx, index, interp_param, ctx->abi.prim_mask, float16, inputs); if (LLVMIsUndef(interp_param)) ctx->shader_info->fs.flat_shaded_mask |= 1u << index; + if (float16) + ctx->shader_info->fs.float16_shaded_mask |= 1u << index; if (i >= VARYING_SLOT_VAR0) ctx->abi.fs_input_attr_indices[i - VARYING_SLOT_VAR0] = index; ++index; @@ -2216,7 +2247,7 @@ handle_fs_inputs(struct radv_shader_context *ctx, interp_param = *inputs; interp_fs_input(ctx, index, interp_param, - ctx->abi.prim_mask, inputs); + ctx->abi.prim_mask, false, inputs); ++index; } } else if (i == VARYING_SLOT_POS) { diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c index 03e9369e094..61bdfc5cd2d 100644 --- a/src/amd/vulkan/radv_pipeline.c +++ b/src/amd/vulkan/radv_pipeline.c @@ -3088,13 +3088,17 @@ radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *ctx_cs, radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, pipeline->gs_copy_shader); } -static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade) +static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade, bool float16) { uint32_t ps_input_cntl; if (offset <= AC_EXP_PARAM_OFFSET_31) { ps_input_cntl = S_028644_OFFSET(offset); if (flat_shade) ps_input_cntl |= S_028644_FLAT_SHADE(1); + if (float16) { + ps_input_cntl |= S_028644_FP16_INTERP_MODE(1) | + S_028644_ATTR0_VALID(1); + } } else { /* The input is a DEFAULT_VAL constant. */ assert(offset >= AC_EXP_PARAM_DEFAULT_VAL_0000 && @@ -3119,7 +3123,7 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, if (ps->info.info.ps.prim_id_input) { unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_PRIMITIVE_ID]; if (vs_offset != AC_EXP_PARAM_UNDEFINED) { - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false); ++ps_offset; } } @@ -3129,9 +3133,9 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, ps->info.info.needs_multiview_view_index) { unsigned vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_LAYER]; if (vs_offset != AC_EXP_PARAM_UNDEFINED) - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, true, false); else - ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true); + ps_input_cntl[ps_offset] = offset_to_ps_input(AC_EXP_PARAM_DEFAULT_VAL_0000, true, false); ++ps_offset; } @@ -3147,14 +3151,14 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST0]; if (vs_offset != AC_EXP_PARAM_UNDEFINED) { - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false); ++ps_offset; } vs_offset = outinfo->vs_output_param_offset[VARYING_SLOT_CLIP_DIST1]; if (vs_offset != AC_EXP_PARAM_UNDEFINED && ps->info.info.ps.num_input_clips_culls > 4) { - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, false, false); ++ps_offset; } } @@ -3162,6 +3166,7 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, for (unsigned i = 0; i < 32 && (1u << i) <= ps->info.fs.input_mask; ++i) { unsigned vs_offset; bool flat_shade; + bool float16; if (!(ps->info.fs.input_mask & (1u << i))) continue; @@ -3173,8 +3178,9 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs, } flat_shade = !!(ps->info.fs.flat_shaded_mask & (1u << ps_offset)); + float16 = !!(ps->info.fs.float16_shaded_mask & (1u << ps_offset)); - ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade); + ps_input_cntl[ps_offset] = offset_to_ps_input(vs_offset, flat_shade, float16); ++ps_offset; } diff --git a/src/amd/vulkan/radv_shader.h b/src/amd/vulkan/radv_shader.h index 3652a811e80..1556b08e889 100644 --- a/src/amd/vulkan/radv_shader.h +++ b/src/amd/vulkan/radv_shader.h @@ -257,6 +257,7 @@ struct radv_shader_variant_info { unsigned num_interp; uint32_t input_mask; uint32_t flat_shaded_mask; + uint32_t float16_shaded_mask; bool can_discard; bool early_fragment_test; } fs;