nv50: use actual loads/stores if TEMPs are accessed indirectly

This commit is contained in:
Christoph Bumiller 2010-09-09 19:12:54 +02:00
parent d8dcff7970
commit f30810cb68
10 changed files with 122 additions and 23 deletions

View file

@ -414,6 +414,8 @@ nv50_generate_code(struct nv50_translation_info *ti)
nv_print_program(pc);
#endif
pc->opt_reload_elim = ti->store_to_memory ? FALSE : TRUE;
/* optimization */
ret = nv_pc_exec_pass0(pc);
if (ret)

View file

@ -345,6 +345,9 @@ struct nv_pc {
struct nv_fixup *fixups;
int num_fixups;
/* optimization enables */
boolean opt_reload_elim;
};
void nvbb_insert_tail(struct nv_basic_block *, struct nv_instruction *);

View file

@ -412,25 +412,25 @@ emit_form_IMM(struct nv_pc *pc, struct nv_instruction *i, ubyte mod_mask)
}
static void
set_ld_st_size(struct nv_pc *pc, ubyte type)
set_ld_st_size(struct nv_pc *pc, int s, ubyte type)
{
switch (type) {
case NV_TYPE_F64:
pc->emit[1] |= 0x8000;
pc->emit[1] |= 0x8000 << s;
break;
case NV_TYPE_F32:
case NV_TYPE_S32:
case NV_TYPE_U32:
pc->emit[1] |= 0xc000;
pc->emit[1] |= 0xc000 << s;
break;
case NV_TYPE_S16:
pc->emit[1] |= 0x6000;
pc->emit[1] |= 0x6000 << s;
break;
case NV_TYPE_U16:
pc->emit[1] |= 0x4000;
pc->emit[1] |= 0x4000 << s;
break;
case NV_TYPE_S8:
pc->emit[1] |= 0x2000;
pc->emit[1] |= 0x2000 << s;
break;
default:
break;
@ -473,12 +473,14 @@ emit_ld(struct nv_pc *pc, struct nv_instruction *i)
if (sf == NV_FILE_MEM_L) {
pc->emit[0] = 0xd0000001;
pc->emit[1] = 0x40000000;
set_addr(pc, i);
} else {
NOUVEAU_ERR("invalid ld source file\n");
abort();
}
set_ld_st_size(pc, STYPE(i, 0));
set_ld_st_size(pc, (sf == NV_FILE_MEM_L) ? 8 : 0, STYPE(i, 0));
set_dst(pc, i->def[0]);
set_pred_wr(pc, i);
@ -495,7 +497,19 @@ emit_ld(struct nv_pc *pc, struct nv_instruction *i)
/*
 * Encode a store instruction whose destination is l[] (NV_FILE_MEM_L) memory
 * and whose data operand lives in a general purpose register.
 *
 * pc: program context; the two encoding words are written to pc->emit[0..1]
 * i:  the store instruction; src[0] must be in NV_FILE_MEM_L and src[1] in
 *     NV_FILE_GPR (both enforced by assert)
 */
static void
emit_st(struct nv_pc *pc, struct nv_instruction *i)
{
/* only stores from a GPR (source 1) to l[] (source 0) are handled here */
assert(SFILE(i, 1) == NV_FILE_GPR);
assert(SFILE(i, 0) == NV_FILE_MEM_L);
/* first word is the same as for the l[] load in emit_ld; the second word
 * is 0x60000000 here where the load uses 0x40000000 */
pc->emit[0] = 0xd0000001;
pc->emit[1] = 0x60000000;
/* NOTE(review): SID presumably ORs the source's register/location id into
 * the encoding at the given bit position (2 for the data register, 9 for
 * the l[] location) - confirm against the macro definition */
SID(pc, i->src[1], 2);
SID(pc, i->src[0], 9);
/* access size bits are taken from the type of the stored value (source 1);
 * the shift of 8 matches what emit_ld passes for NV_FILE_MEM_L */
set_ld_st_size(pc, 8, STYPE(i, 1));
/* NOTE(review): set_addr/set_pred are defined elsewhere; presumably they
 * encode the indirect address register and the predicate - confirm */
set_addr(pc, i);
set_pred(pc, i);
}
static int

View file

@ -82,6 +82,8 @@ inst_commutation_legal(struct nv_instruction *a,
static INLINE boolean
inst_cullable(struct nv_instruction *nvi)
{
if (nvi->opcode == NV_OP_STA)
return FALSE;
return (!(nvi->is_terminator || nvi->is_join ||
nvi->target ||
nvi->fixed ||
@ -739,6 +741,7 @@ struct nv_pass_reld_elim {
int alloc;
};
/* TODO: properly handle loads from l[] memory in the presence of stores */
static int
nv_pass_reload_elim(struct nv_pass_reld_elim *ctx, struct nv_basic_block *b)
{
@ -1074,13 +1077,15 @@ nv_pc_pass0(struct nv_pc *pc, struct nv_basic_block *root)
if (ret)
return ret;
reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
reldelim->pc = pc;
pc->pass_seq++;
ret = nv_pass_reload_elim(reldelim, root);
FREE(reldelim);
if (ret)
return ret;
if (pc->opt_reload_elim) {
reldelim = CALLOC_STRUCT(nv_pass_reld_elim);
reldelim->pc = pc;
pc->pass_seq++;
ret = nv_pass_reload_elim(reldelim, root);
FREE(reldelim);
if (ret)
return ret;
}
pc->pass_seq++;
ret = nv_pass_cse(&pass, root);

View file

@ -217,6 +217,9 @@ nv_print_value(struct nv_value *value, struct nv_value *ind, ubyte type)
case NV_FILE_FLAGS:
PRINT(" %s%cc%i", mgta, reg_pfx, nv_value_id(value));
break;
case NV_FILE_MEM_L:
nv_print_address('l', -1, ind, 4 * nv_value_id(value));
break;
case NV_FILE_MEM_S:
nv_print_address('s', -1, ind, 4 * nv_value_id(value));
break;

View file

@ -168,10 +168,17 @@ prog_inst(struct nv50_translation_info *ti,
inst->Src[0].Register.File == TGSI_FILE_INPUT &&
dst->Index == ti->edgeflag_out)
ti->p->vp.edgeflag = inst->Src[0].Register.Index;
} else
if (inst->Dst[0].Register.File == TGSI_FILE_TEMPORARY) {
if (inst->Dst[0].Register.Indirect)
ti->store_to_memory = TRUE;
}
for (s = 0; s < inst->Instruction.NumSrcRegs; ++s) {
src = &inst->Src[s].Register;
if (src->File == TGSI_FILE_TEMPORARY)
if (inst->Src[s].Register.Indirect)
ti->store_to_memory = TRUE;
if (src->File != TGSI_FILE_INPUT)
continue;
mask = nv50_tgsi_src_mask(inst, s);

View file

@ -116,6 +116,7 @@ struct nv50_translation_info {
int output_access[PIPE_MAX_SHADER_OUTPUTS][4];
boolean indirect_inputs;
boolean indirect_outputs;
boolean store_to_memory;
struct tgsi_shader_info scan;
uint32_t *immd32;
unsigned immd32_nr;

View file

@ -274,7 +274,7 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
uint64_t value;
unsigned chipset = dev->chipset;
unsigned tesla_class = 0;
unsigned stack_size;
unsigned stack_size, local_size, max_warps;
int ret, i;
const unsigned rl = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
@ -495,9 +495,10 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
/* shader stack */
nouveau_device_get_param(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
stack_size = util_bitcount(value & 0xffff);
stack_size *= util_bitcount((value >> 24) & 0xf);
stack_size *= 32 * 64 * 8;
max_warps = util_bitcount(value & 0xffff);
max_warps *= util_bitcount((value >> 24) & 0xf) * 32;
stack_size = max_warps * 64 * 8;
ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
stack_size, &screen->stack_bo);
@ -510,6 +511,22 @@ nv50_screen_create(struct pipe_winsys *ws, struct nouveau_device *dev)
OUT_RELOCl(chan, screen->stack_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
OUT_RING (chan, 4);
local_size = (NV50_CAP_MAX_PROGRAM_TEMPS * 16) * max_warps * 32;
ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 16,
local_size, &screen->local_bo);
if (ret) {
nv50_screen_destroy(pscreen);
return NULL;
}
local_size = NV50_CAP_MAX_PROGRAM_TEMPS * 16;
BEGIN_RING(chan, screen->tesla, NV50TCL_LOCAL_ADDRESS_HIGH, 3);
OUT_RELOCh(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
OUT_RELOCl(chan, screen->local_bo, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
OUT_RING (chan, util_unsigned_logbase2(local_size / 8));
/* Vertex array limits - max them out */
for (i = 0; i < 16; i++) {
BEGIN_RING(chan, screen->tesla,

View file

@ -25,7 +25,8 @@ struct nv50_screen {
struct nouveau_bo *tic;
struct nouveau_bo *tsc;
struct nouveau_bo *stack_bo;
struct nouveau_bo *stack_bo; /* control flow stack */
struct nouveau_bo *local_bo; /* l[] memory */
boolean force_push;
};

View file

@ -558,6 +558,38 @@ bld_insn_3(struct bld_context *bld, uint opcode,
return bld_def(insn, 0, new_value(bld->pc, NV_FILE_GPR, src0->reg.type));
}
/*
 * Build an NV_OP_STA instruction that stores a value to l[] memory.
 *
 * bld:  build context; the instruction and the l[] location value are both
 *       allocated through bld->pc
 * ptr:  value attached as source 4 of the store; the visible caller passes
 *       the fetched address value for indirectly addressed destinations and
 *       NULL otherwise
 * ofst: location index (the visible caller passes register index * 4 +
 *       channel); the id of the l[] location is set to ofst * 4
 * val:  the value to store, attached as source 1
 */
static void
bld_lmem_store(struct bld_context *bld, struct nv_value *ptr, int ofst,
               struct nv_value *val)
{
   struct nv_pc *pc = bld->pc;
   struct nv_instruction *st = new_instruction(pc, NV_OP_STA);
   struct nv_value *dst = new_value(pc, NV_FILE_MEM_L, NV_TYPE_U32);

   dst->reg.id = ofst * 4;

   /* source 0: l[] location, source 1: data, source 4: address value */
   nv_reference(pc, &st->src[0], dst);
   nv_reference(pc, &st->src[1], val);
   nv_reference(pc, &st->src[4], ptr);
}
/*
 * Build an NV_OP_LDA instruction that loads a value from l[] memory.
 *
 * bld:  build context; the l[] location value is allocated through bld->pc
 * ptr:  value attached as source 4 of the load instruction (the address
 *       value for indirect accesses)
 * ofst: location index; the id of the l[] location is set to ofst * 4
 *
 * Returns the value produced by bld_insn_1 for the load.
 */
static struct nv_value *
bld_lmem_load(struct bld_context *bld, struct nv_value *ptr, int ofst)
{
   struct nv_pc *pc = bld->pc;
   struct nv_value *src = new_value(pc, NV_FILE_MEM_L, NV_TYPE_U32);
   struct nv_value *res;

   src->reg.id = ofst * 4;

   res = bld_insn_1(bld, NV_OP_LDA, src);

   /* hook the address value up to the instruction that defines the result */
   nv_reference(pc, &res->insn->src[4], ptr);

   return res;
}
#define BLD_INSN_1_EX(d, op, dt, s0, s0t) \
do { \
(d) = bld_insn_1(bld, (NV_OP_##op), (s0)); \
@ -854,10 +886,18 @@ infer_dst_type(unsigned opcode)
static void
emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
unsigned chan, struct nv_value *value)
unsigned chan, struct nv_value *value)
{
struct nv_value *ptr;
const struct tgsi_full_dst_register *reg = &inst->Dst[0];
if (reg->Register.Indirect) {
ptr = FETCH_ADDR(reg->Indirect.Index,
tgsi_util_get_src_register_swizzle(&reg->Indirect, 0));
} else {
ptr = NULL;
}
assert(chan < 4);
if (inst->Instruction.Opcode != TGSI_OPCODE_MOV)
@ -893,7 +933,11 @@ emit_store(struct bld_context *bld, const struct tgsi_full_instruction *inst,
value->reg.file = NV_FILE_GPR;
if (value->insn->bb != bld->pc->current_block)
value = bld_insn_1(bld, NV_OP_MOV, value);
STORE_TEMP(reg->Register.Index, chan, value);
if (bld->ti->store_to_memory)
bld_lmem_store(bld, ptr, reg->Register.Index * 4 + chan, value);
else
STORE_TEMP(reg->Register.Index, chan, value);
break;
case TGSI_FILE_ADDRESS:
assert(reg->Register.Index < BLD_MAX_ADDRS);
@ -1064,8 +1108,10 @@ emit_fetch(struct bld_context *bld, const struct tgsi_full_instruction *insn,
bld->saved_inputs[bld->ti->input_map[idx][swz]] = res;
break;
case TGSI_FILE_TEMPORARY:
/* this should be load from l[], with reload elimination later on */
res = bld_fetch_global(bld, &bld->tvs[idx][swz]);
if (bld->ti->store_to_memory)
res = bld_lmem_load(bld, ptr, idx * 4 + swz);
else
res = bld_fetch_global(bld, &bld->tvs[idx][swz]);
break;
case TGSI_FILE_ADDRESS:
res = bld_fetch_global(bld, &bld->avs[idx][swz]);