mirror of
https://gitlab.freedesktop.org/mesa/mesa.git
synced 2025-12-24 00:10:10 +01:00
nv50/ir,nvc0: use constant buffers for compute when possible on Kepler+
Gives a +7.79% increase in FPS with Hitman on lowest quality settings on
my GTX 1060.
total instructions in shared programs : 5787979 -> 5748677 (-0.68%)
total gprs used in shared programs : 669901 -> 669373 (-0.08%)
total shared used in shared programs : 548832 -> 548832 (0.00%)
total local used in shared programs : 21068 -> 21064 (-0.02%)
local shared gpr inst bytes
helped 1 0 152 274 274
hurt 0 0 0 0 0
Signed-off-by: Rhys Perry <pendingchaos02@gmail.com>
Reviewed-by: Karol Herbst <kherbst@redhat.com>
This commit is contained in:
parent
d27c791891
commit
e56e600bd3
2 changed files with 36 additions and 10 deletions
|
|
@ -2464,18 +2464,16 @@ NVC0LoweringPass::handleLDST(Instruction *i)
|
|||
assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
|
||||
}
|
||||
} else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
|
||||
int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
|
||||
Value *ind = i->getIndirect(0, 1);
|
||||
|
||||
if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
|
||||
prog->getType() == Program::TYPE_COMPUTE) {
|
||||
prog->getType() == Program::TYPE_COMPUTE &&
|
||||
(fileIndex >= 6 || ind)) {
|
||||
// The launch descriptor only allows to set up 8 CBs, but OpenGL
|
||||
// requires at least 12 UBOs. To bypass this limitation, we store the
|
||||
// addrs into the driver constbuf and we directly load from the global
|
||||
// memory.
|
||||
int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
|
||||
Value *ind = i->getIndirect(0, 1);
|
||||
|
||||
if (!ind && fileIndex == -1)
|
||||
return;
|
||||
|
||||
// requires at least 12 UBOs. To bypass this limitation, for constant
|
||||
// buffers 7+, we store the addrs into the driver constbuf and we
|
||||
// directly load from the global memory.
|
||||
if (ind) {
|
||||
// Clamp the UBO index when an indirect access is used to avoid
|
||||
// loading information from the wrong place in the driver cb.
|
||||
|
|
|
|||
|
|
@ -551,6 +551,30 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
|
|||
return NVC1_3D_CACHE_SPLIT_16K_SHARED_48K_L1;
|
||||
}
|
||||
|
||||
static void
|
||||
nve4_compute_setup_buf_cb(struct nvc0_context *nvc0, bool gp100, void *desc)
|
||||
{
|
||||
// only user constant buffers 1-6 can be put in the descriptor, the rest are
|
||||
// loaded through global memory
|
||||
for (int i = 1; i <= 6; i++) {
|
||||
if (nvc0->constbuf[5][i].user || !nvc0->constbuf[5][i].u.buf)
|
||||
continue;
|
||||
|
||||
struct nv04_resource *res =
|
||||
nv04_resource(nvc0->constbuf[5][i].u.buf);
|
||||
|
||||
uint32_t base = res->offset + nvc0->constbuf[5][i].offset;
|
||||
uint32_t size = nvc0->constbuf[5][i].size;
|
||||
if (gp100)
|
||||
gp100_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
|
||||
else
|
||||
nve4_cp_launch_desc_set_cb(desc, i, res->bo, base, size);
|
||||
}
|
||||
|
||||
// there is no need to do FLUSH(NVE4_COMPUTE_FLUSH_CB) because
|
||||
// nve4_compute_upload_input() does it later
|
||||
}
|
||||
|
||||
static void
|
||||
nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
|
||||
struct nve4_cp_launch_desc *desc,
|
||||
|
|
@ -588,6 +612,8 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
|
|||
}
|
||||
nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
|
||||
NVC0_CB_AUX_INFO(5), 1 << 11);
|
||||
|
||||
nve4_compute_setup_buf_cb(nvc0, false, desc);
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
@ -626,6 +652,8 @@ gp100_compute_setup_launch_desc(struct nvc0_context *nvc0,
|
|||
}
|
||||
gp100_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
|
||||
NVC0_CB_AUX_INFO(5), 1 << 11);
|
||||
|
||||
nve4_compute_setup_buf_cb(nvc0, true, desc);
|
||||
}
|
||||
|
||||
static inline void *
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue