mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 15:20:10 +01:00
radeonsi: reload PS inputs with direct indexing at each use (v2)
The LLVM compiler can CSE interp intrinsics thanks to LLVMReadNoneAttribute.

26011 shaders in 14651 tests
Totals:
SGPRS: 1146340 -> 1132676 (-1.19 %)
VGPRS: 727371 -> 711730 (-2.15 %)
Spilled SGPRs: 2218 -> 2078 (-6.31 %)
Spilled VGPRs: 369 -> 369 (0.00 %)
Scratch VGPRs: 1344 -> 1344 (0.00 %) dwords per thread
Code Size: 35841268 -> 36009732 (0.47 %) bytes
LDS: 767 -> 767 (0.00 %) blocks
Max Waves: 222559 -> 224779 (1.00 %)
Wait states: 0 -> 0 (0.00 %)

v2: don't call load_input for fragment shaders in emit_declaration

Reviewed-by: Nicolai Hähnle <nicolai.haehnle@amd.com>
This commit is contained in:
parent
007b512f9d
commit
ab29788250
3 changed files with 41 additions and 22 deletions
|
|
@ -30,7 +30,9 @@
|
|||
#include <llvm-c/Core.h>
|
||||
#include "gallivm/lp_bld_init.h"
|
||||
#include "gallivm/lp_bld_tgsi.h"
|
||||
#include "tgsi/tgsi_parse.h"
|
||||
|
||||
#define RADEON_LLVM_MAX_INPUT_SLOTS 32
|
||||
/* Capacity of the flat per-channel inputs[] array, which is indexed as
 * (input index * 4 + channel). The expansion is parenthesized so the macro
 * stays a single operand inside larger expressions such as
 * `x / RADEON_LLVM_MAX_INPUTS` or `x % RADEON_LLVM_MAX_INPUTS`. */
#define RADEON_LLVM_MAX_INPUTS (32 * 4)
|
||||
/* Upper bound on the first dimension of the outputs[][] array. The
 * expansion is parenthesized so the macro stays a single operand inside
 * larger expressions such as `x / RADEON_LLVM_MAX_OUTPUTS`. */
#define RADEON_LLVM_MAX_OUTPUTS (32 * 4)
|
||||
|
||||
|
|
@ -62,7 +64,8 @@ struct radeon_llvm_context {
|
|||
*/
|
||||
void (*load_input)(struct radeon_llvm_context *,
|
||||
unsigned input_index,
|
||||
const struct tgsi_full_declaration *decl);
|
||||
const struct tgsi_full_declaration *decl,
|
||||
LLVMValueRef out[4]);
|
||||
|
||||
void (*load_system_value)(struct radeon_llvm_context *,
|
||||
unsigned index,
|
||||
|
|
@ -75,6 +78,7 @@ struct radeon_llvm_context {
|
|||
* values will be in the form of a target intrinsic that will inform the
|
||||
* backend how to load the actual inputs to the shader.
|
||||
*/
|
||||
struct tgsi_full_declaration input_decls[RADEON_LLVM_MAX_INPUT_SLOTS];
|
||||
LLVMValueRef inputs[RADEON_LLVM_MAX_INPUTS];
|
||||
LLVMValueRef outputs[RADEON_LLVM_MAX_OUTPUTS][TGSI_NUM_CHANNELS];
|
||||
|
||||
|
|
|
|||
|
|
@ -446,14 +446,29 @@ LLVMValueRef radeon_llvm_emit_fetch(struct lp_build_tgsi_context *bld_base,
|
|||
}
|
||||
}
|
||||
|
||||
case TGSI_FILE_INPUT:
|
||||
result = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle)];
|
||||
case TGSI_FILE_INPUT: {
|
||||
unsigned index = reg->Register.Index;
|
||||
LLVMValueRef input[4];
|
||||
|
||||
/* I don't think doing this for vertex shaders is beneficial.
|
||||
* For those, we want to make sure the VMEM loads are executed
|
||||
* only once. Fragment shaders don't care much, because
|
||||
* v_interp instructions are much cheaper than VMEM loads.
|
||||
*/
|
||||
if (ctx->soa.bld_base.info->processor == PIPE_SHADER_FRAGMENT)
|
||||
ctx->load_input(ctx, index, &ctx->input_decls[index], input);
|
||||
else
|
||||
memcpy(input, &ctx->inputs[index * 4], sizeof(input));
|
||||
|
||||
result = input[swizzle];
|
||||
|
||||
if (tgsi_type_is_64bit(type)) {
|
||||
ptr = result;
|
||||
ptr2 = ctx->inputs[radeon_llvm_reg_index_soa(reg->Register.Index, swizzle + 1)];
|
||||
ptr2 = input[swizzle + 1];
|
||||
return radeon_llvm_emit_fetch_64bit(bld_base, type, ptr, ptr2);
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
case TGSI_FILE_TEMPORARY:
|
||||
if (reg->Register.Index >= ctx->temps_count)
|
||||
|
|
@ -626,8 +641,13 @@ static void emit_declaration(struct lp_build_tgsi_context *bld_base,
|
|||
{
|
||||
unsigned idx;
|
||||
for (idx = decl->Range.First; idx <= decl->Range.Last; idx++) {
|
||||
if (ctx->load_input)
|
||||
ctx->load_input(ctx, idx, decl);
|
||||
if (ctx->load_input) {
|
||||
ctx->input_decls[idx] = *decl;
|
||||
|
||||
if (bld_base->info->processor != PIPE_SHADER_FRAGMENT)
|
||||
ctx->load_input(ctx, idx, decl,
|
||||
&ctx->inputs[idx * 4]);
|
||||
}
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
|
|
|||
|
|
@ -440,7 +440,8 @@ static LLVMValueRef get_instance_index_for_fetch(
|
|||
static void declare_input_vs(
|
||||
struct radeon_llvm_context *radeon_bld,
|
||||
unsigned input_index,
|
||||
const struct tgsi_full_declaration *decl)
|
||||
const struct tgsi_full_declaration *decl,
|
||||
LLVMValueRef out[4])
|
||||
{
|
||||
struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
|
||||
struct gallivm_state *gallivm = base->gallivm;
|
||||
|
|
@ -498,11 +499,8 @@ static void declare_input_vs(
|
|||
/* Break up the vec4 into individual components */
|
||||
for (chan = 0; chan < 4; chan++) {
|
||||
LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
|
||||
/* XXX: Use a helper function for this. There is one in
|
||||
* tgsi_llvm.c. */
|
||||
ctx->radeon_bld.inputs[radeon_llvm_reg_index_soa(input_index, chan)] =
|
||||
LLVMBuildExtractElement(gallivm->builder,
|
||||
input, llvm_chan, "");
|
||||
out[chan] = LLVMBuildExtractElement(gallivm->builder,
|
||||
input, llvm_chan, "");
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -1463,7 +1461,8 @@ static LLVMValueRef get_interp_param(struct si_shader_context *ctx,
|
|||
static void declare_input_fs(
|
||||
struct radeon_llvm_context *radeon_bld,
|
||||
unsigned input_index,
|
||||
const struct tgsi_full_declaration *decl)
|
||||
const struct tgsi_full_declaration *decl,
|
||||
LLVMValueRef out[4])
|
||||
{
|
||||
struct lp_build_context *base = &radeon_bld->soa.bld_base.base;
|
||||
struct si_shader_context *ctx =
|
||||
|
|
@ -1482,14 +1481,10 @@ static void declare_input_fs(
|
|||
unsigned offset = SI_PARAM_POS_FIXED_PT + 1 +
|
||||
(i ? util_bitcount(colors_read & 0xf) : 0);
|
||||
|
||||
radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)] =
|
||||
mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
|
||||
radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 1)] =
|
||||
mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
|
||||
radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 2)] =
|
||||
mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
|
||||
radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 3)] =
|
||||
mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
|
||||
out[0] = mask & 0x1 ? LLVMGetParam(main_fn, offset++) : base->undef;
|
||||
out[1] = mask & 0x2 ? LLVMGetParam(main_fn, offset++) : base->undef;
|
||||
out[2] = mask & 0x4 ? LLVMGetParam(main_fn, offset++) : base->undef;
|
||||
out[3] = mask & 0x8 ? LLVMGetParam(main_fn, offset++) : base->undef;
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
@ -1513,7 +1508,7 @@ static void declare_input_fs(
|
|||
shader->selector->info.colors_read, interp_param,
|
||||
LLVMGetParam(main_fn, SI_PARAM_PRIM_MASK),
|
||||
LLVMGetParam(main_fn, SI_PARAM_FRONT_FACE),
|
||||
&radeon_bld->inputs[radeon_llvm_reg_index_soa(input_index, 0)]);
|
||||
&out[0]);
|
||||
}
|
||||
|
||||
static LLVMValueRef get_sample_id(struct radeon_llvm_context *radeon_bld)
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue