i965: Use unsynchronized maps for the program cache on LLC platforms.

There's no reason to stall on pwrite - the CPU always appends to the
buffer and never modifies existing contents, and the GPU never writes
to it.  Further, the CPU always appends new data before submitting a
batch that requires it.

This code predates the unsynchronized mapping feature, so we simply
didn't have the option when it was written.

Ideally, we would do this for non-LLC platforms too, but unsynchronized
mapping support only exists for LLC systems.

This saves a bunch of stall-avoidance copies when uploading shaders.

v2: Rebase on changes to previous patch.

Signed-off-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Kristian Høgsberg <krh@bitplanet.net> [v1]
This commit is contained in:
Kenneth Graunke 2014-08-21 14:41:17 -07:00
parent 4c766c7959
commit 02ca66fbc3

View file

@ -172,14 +172,23 @@ brw_cache_new_bo(struct brw_cache *cache, uint32_t new_size)
drm_intel_bo *new_bo; drm_intel_bo *new_bo;
new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64); new_bo = drm_intel_bo_alloc(brw->bufmgr, "program cache", new_size, 64);
if (brw->has_llc)
drm_intel_gem_bo_map_unsynchronized(new_bo);
/* Copy any existing data that needs to be saved. */ /* Copy any existing data that needs to be saved. */
if (cache->next_offset != 0) { if (cache->next_offset != 0) {
drm_intel_bo_map(cache->bo, false); if (brw->has_llc) {
drm_intel_bo_subdata(new_bo, 0, cache->next_offset, cache->bo->virtual); memcpy(new_bo->virtual, cache->bo->virtual, cache->next_offset);
drm_intel_bo_unmap(cache->bo); } else {
drm_intel_bo_map(cache->bo, false);
drm_intel_bo_subdata(new_bo, 0, cache->next_offset,
cache->bo->virtual);
drm_intel_bo_unmap(cache->bo);
}
} }
if (brw->has_llc)
drm_intel_bo_unmap(cache->bo);
drm_intel_bo_unreference(cache->bo); drm_intel_bo_unreference(cache->bo);
cache->bo = new_bo; cache->bo = new_bo;
cache->bo_used_by_gpu = false; cache->bo_used_by_gpu = false;
@ -200,6 +209,7 @@ brw_try_upload_using_copy(struct brw_cache *cache,
const void *data, const void *data,
const void *aux) const void *aux)
{ {
struct brw_context *brw = cache->brw;
int i; int i;
struct brw_cache_item *item; struct brw_cache_item *item;
@ -221,9 +231,11 @@ brw_try_upload_using_copy(struct brw_cache *cache,
continue; continue;
} }
drm_intel_bo_map(cache->bo, false); if (!brw->has_llc)
drm_intel_bo_map(cache->bo, false);
ret = memcmp(cache->bo->virtual + item->offset, data, item->size); ret = memcmp(cache->bo->virtual + item->offset, data, item->size);
drm_intel_bo_unmap(cache->bo); if (!brw->has_llc)
drm_intel_bo_unmap(cache->bo);
if (ret) if (ret)
continue; continue;
@ -256,7 +268,7 @@ brw_upload_item_data(struct brw_cache *cache,
/* If we would block on writing to an in-use program BO, just /* If we would block on writing to an in-use program BO, just
* recreate it. * recreate it.
*/ */
if (cache->bo_used_by_gpu) { if (!brw->has_llc && cache->bo_used_by_gpu) {
perf_debug("Copying busy program cache buffer.\n"); perf_debug("Copying busy program cache buffer.\n");
brw_cache_new_bo(cache, cache->bo->size); brw_cache_new_bo(cache, cache->bo->size);
} }
@ -279,6 +291,7 @@ brw_upload_cache(struct brw_cache *cache,
uint32_t *out_offset, uint32_t *out_offset,
void *out_aux) void *out_aux)
{ {
struct brw_context *brw = cache->brw;
struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item); struct brw_cache_item *item = CALLOC_STRUCT(brw_cache_item);
GLuint hash; GLuint hash;
void *tmp; void *tmp;
@ -319,7 +332,11 @@ brw_upload_cache(struct brw_cache *cache,
cache->n_items++; cache->n_items++;
/* Copy data to the buffer */ /* Copy data to the buffer */
drm_intel_bo_subdata(cache->bo, item->offset, data_size, data); if (brw->has_llc) {
memcpy((char *) cache->bo->virtual + item->offset, data, data_size);
} else {
drm_intel_bo_subdata(cache->bo, item->offset, data_size, data);
}
*out_offset = item->offset; *out_offset = item->offset;
*(void **)out_aux = (void *)((char *)item->key + item->key_size); *(void **)out_aux = (void *)((char *)item->key + item->key_size);
@ -341,6 +358,8 @@ brw_init_caches(struct brw_context *brw)
cache->bo = drm_intel_bo_alloc(brw->bufmgr, cache->bo = drm_intel_bo_alloc(brw->bufmgr,
"program cache", "program cache",
4096, 64); 4096, 64);
if (brw->has_llc)
drm_intel_gem_bo_map_unsynchronized(cache->bo);
cache->aux_compare[BRW_VS_PROG] = brw_vs_prog_data_compare; cache->aux_compare[BRW_VS_PROG] = brw_vs_prog_data_compare;
cache->aux_compare[BRW_GS_PROG] = brw_gs_prog_data_compare; cache->aux_compare[BRW_GS_PROG] = brw_gs_prog_data_compare;
@ -407,6 +426,8 @@ brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
DBG("%s\n", __FUNCTION__); DBG("%s\n", __FUNCTION__);
if (brw->has_llc)
drm_intel_bo_unmap(cache->bo);
drm_intel_bo_unreference(cache->bo); drm_intel_bo_unreference(cache->bo);
cache->bo = NULL; cache->bo = NULL;
brw_clear_cache(brw, cache); brw_clear_cache(brw, cache);