diff --git a/src/gallium/frontends/nine/buffer9.c b/src/gallium/frontends/nine/buffer9.c
index 3fde6a784ba..81f239f3d93 100644
--- a/src/gallium/frontends/nine/buffer9.c
+++ b/src/gallium/frontends/nine/buffer9.c
@@ -87,7 +87,9 @@ NineBuffer9_ctor( struct NineBuffer9 *This,
      * some small behavior differences between vendors). Implementing exactly as MANAGED should
      * be fine. */
-    if (Pool != D3DPOOL_DEFAULT)
+    if (Pool == D3DPOOL_SYSTEMMEM && Usage & D3DUSAGE_DYNAMIC)
+        info->usage = PIPE_USAGE_STREAM;
+    else if (Pool != D3DPOOL_DEFAULT)
         info->usage = PIPE_USAGE_DEFAULT;
     else if (Usage & D3DUSAGE_DYNAMIC && Usage & D3DUSAGE_WRITEONLY)
         info->usage = PIPE_USAGE_STREAM;
@@ -140,6 +142,10 @@ NineBuffer9_ctor( struct NineBuffer9 *This,
         memset(This->managed.data, 0, Size);
         This->managed.dirty = TRUE;
         u_box_1d(0, Size, &This->managed.dirty_box);
+        u_box_1d(0, 0, &This->managed.valid_region);
+        u_box_1d(0, 0, &This->managed.required_valid_region);
+        u_box_1d(0, 0, &This->managed.filled_region);
+        This->managed.can_unsynchronized = true;
         list_inithead(&This->managed.list);
         list_inithead(&This->managed.list2);
         list_add(&This->managed.list2, &pParams->device->managed_buffers);
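For context, a minimal app-side sketch (hypothetical, not part of the patch) of the buffer kind this series optimizes: D3DPOOL_SYSTEMMEM combined with D3DUSAGE_DYNAMIC, which the first hunk now maps to PIPE_USAGE_STREAM instead of PIPE_USAGE_DEFAULT. It assumes a valid IDirect3DDevice9 *dev and the C COM macros from d3d9.h.

#define COBJMACROS
#include <d3d9.h>

static IDirect3DVertexBuffer9 *
create_systemmem_dynamic_vb(IDirect3DDevice9 *dev, UINT size)
{
    IDirect3DVertexBuffer9 *vb = NULL;

    /* SYSTEMMEM + DYNAMIC: the combination handled by the new first branch */
    if (FAILED(IDirect3DDevice9_CreateVertexBuffer(dev, size,
                   D3DUSAGE_DYNAMIC | D3DUSAGE_WRITEONLY, 0 /* no FVF */,
                   D3DPOOL_SYSTEMMEM, &vb, NULL)))
        return NULL;
    return vb;
}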
@@ -245,28 +251,50 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
     u_box_1d(OffsetToLock, SizeToLock, &box);
 
     if (This->base.pool != D3DPOOL_DEFAULT) {
-        /* Systemmem takes into account writes outside the locked region on AMD/NVidia */
-        if (This->base.pool == D3DPOOL_SYSTEMMEM)
-            u_box_1d(0, This->size, &box);
-        /* READONLY doesn't dirty the buffer */
-        /* Tests on Win: READONLY doesn't wait for the upload */
-        if (!(Flags & D3DLOCK_READONLY)) {
-            if (!This->managed.dirty) {
-                assert(list_is_empty(&This->managed.list));
-                This->managed.dirty = TRUE;
-                This->managed.dirty_box = box;
-                /* Flush if regions pending to be uploaded would be dirtied */
-                if (p_atomic_read(&This->managed.pending_upload)) {
-                    u_box_intersect_1d(&box, &box, &This->managed.upload_pending_regions);
-                    if (box.width != 0)
-                        nine_csmt_process(This->base.base.device);
-                }
-            } else
-                u_box_union_1d(&This->managed.dirty_box, &This->managed.dirty_box, &box);
-            /* Tests trying to draw while the buffer is locked show that
-             * MANAGED buffers are made dirty at Lock time */
+        /* MANAGED: READONLY doesn't dirty the buffer, nor
+         * wait for the upload in the worker thread.
+         * SYSTEMMEM: on AMD/NVidia all locks dirty the full buffer; not on Intel.
+         * For NVidia, SYSTEMMEM behaves as if there is no worker thread.
+         * On AMD, READONLY and NOOVERWRITE do dirty the buffer, but do not sync the previous uploads
+         * in the worker thread. On Intel only NOOVERWRITE has that effect.
+         * We implement the AMD behaviour. */
+        if (This->base.pool == D3DPOOL_MANAGED) {
+            if (!(Flags & D3DLOCK_READONLY)) {
+                if (!This->managed.dirty) {
+                    assert(list_is_empty(&This->managed.list));
+                    This->managed.dirty = TRUE;
+                    This->managed.dirty_box = box;
+                    /* Flush if regions pending to be uploaded would be dirtied */
+                    if (p_atomic_read(&This->managed.pending_upload)) {
+                        u_box_intersect_1d(&box, &box, &This->managed.upload_pending_regions);
+                        if (box.width != 0)
+                            nine_csmt_process(This->base.base.device);
+                    }
+                } else
+                    u_box_union_1d(&This->managed.dirty_box, &This->managed.dirty_box, &box);
+                /* Tests trying to draw while the buffer is locked show that
+                 * SYSTEMMEM/MANAGED buffers are made dirty at Lock time */
+                BASEBUF_REGISTER_UPDATE(This);
+            }
+        } else {
+            if (!(Flags & (D3DLOCK_READONLY|D3DLOCK_NOOVERWRITE)) &&
+                p_atomic_read(&This->managed.pending_upload)) {
+                nine_csmt_process(This->base.base.device);
+                /* Note: as DISCARD is not relevant for SYSTEMMEM,
+                 * NOOVERWRITE might have a similar meaning to what is
+                 * in the D3D7 doc: basically that data from previous draws
+                 * OF THIS FRAME are unaffected. As we flush csmt in Present(),
+                 * we should be correct. In some parts of the doc, the notion
+                 * of frame is implied to be related to Begin/EndScene(),
+                 * but tests show NOOVERWRITE after EndScene() doesn't flush
+                 * the csmt thread. */
+            }
+            This->managed.dirty = true;
+            u_box_1d(0, This->size, &This->managed.dirty_box); /* systemmem non-dynamic */
+            u_box_1d(0, 0, &This->managed.valid_region); /* systemmem dynamic */
             BASEBUF_REGISTER_UPDATE(This);
         }
+
     *ppbData = (char *)This->managed.data + OffsetToLock;
     DBG("returning pointer %p\n", *ppbData);
     This->nlocks++;
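The new SYSTEMMEM branch above models how applications typically drive such buffers. A hypothetical sketch of that pattern (assumes d3d9.h with COBJMACROS and string.h): Lock() with D3DLOCK_NOOVERWRITE promises not to touch data used by draws of the current frame, so the code above skips syncing the worker thread, while a plain Lock() first flushes pending uploads via nine_csmt_process().

static HRESULT
append_vertices(IDirect3DVertexBuffer9 *vb, UINT offset,
                const void *data, UINT size)
{
    void *map = NULL;
    /* NOOVERWRITE: no sync with the worker thread in the branch above */
    HRESULT hr = IDirect3DVertexBuffer9_Lock(vb, offset, size, &map,
                                             D3DLOCK_NOOVERWRITE);
    if (FAILED(hr))
        return hr;
    memcpy(map, data, size);
    return IDirect3DVertexBuffer9_Unlock(vb);
}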
diff --git a/src/gallium/frontends/nine/buffer9.h b/src/gallium/frontends/nine/buffer9.h
index 8b2bcc88f6a..039ad1a2cda 100644
--- a/src/gallium/frontends/nine/buffer9.h
+++ b/src/gallium/frontends/nine/buffer9.h
@@ -29,6 +29,7 @@
 #include "nine_state.h"
 #include "resource9.h"
 #include "pipe/p_context.h"
+#include "pipe/p_defines.h"
 #include "pipe/p_state.h"
 #include "util/list.h"
 #include "util/u_box.h"
@@ -69,6 +70,12 @@ struct NineBuffer9
         struct list_head list; /* for update_buffers */
         struct list_head list2; /* for managed_buffers */
         unsigned pending_upload; /* for uploads */
+        /* SYSTEMMEM DYNAMIC */
+        bool can_unsynchronized; /* Whether the upload can use nooverwrite */
+        struct pipe_box valid_region; /* Region in the GPU buffer with valid content */
+        struct pipe_box required_valid_region; /* Region that needs to be valid right now */
+        struct pipe_box filled_region; /* Region in the GPU buffer filled since last discard */
+        unsigned frame_count_last_discard;
     } managed;
 };
 
 static inline struct NineBuffer9 *
@@ -101,25 +108,146 @@ NineBuffer9_Lock( struct NineBuffer9 *This,
 HRESULT NINE_WINAPI
 NineBuffer9_Unlock( struct NineBuffer9 *This );
 
+/* Try to remove b from a; a is supposed to include b */
+static void u_box_try_remove_region_1d(struct pipe_box *dst,
+                                       const struct pipe_box *a,
+                                       const struct pipe_box *b)
+{
+    int x, width;
+
+    if (a->x == b->x) {
+        x = a->x + b->width;
+        width = a->width - b->width;
+    } else if ((a->x + a->width) == (b->x + b->width)) {
+        x = a->x;
+        width = a->width - b->width;
+    } else {
+        x = a->x;
+        width = a->width;
+    }
+    dst->x = x;
+    dst->width = width;
+}
+
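Worked calls (hypothetical values, assuming the helper above is in scope) showing its three cases: b at the start of a, b at the end of a, and b strictly inside a, where nothing can be removed without splitting a.

#include "util/u_box.h"

static void
u_box_try_remove_region_1d_examples(void)
{
    struct pipe_box a, b, dst;

    u_box_1d(0, 100, &a);

    u_box_1d(0, 30, &b);                      /* b at the start of a */
    u_box_try_remove_region_1d(&dst, &a, &b); /* -> x = 30, width = 70 */

    u_box_1d(80, 20, &b);                     /* b at the end of a */
    u_box_try_remove_region_1d(&dst, &a, &b); /* -> x = 0, width = 80 */

    u_box_1d(40, 20, &b);                     /* b strictly inside a */
    u_box_try_remove_region_1d(&dst, &a, &b); /* -> a unchanged: (0, 100) */
}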
 static inline void
 NineBuffer9_Upload( struct NineBuffer9 *This )
 {
     struct NineDevice9 *device = This->base.base.device;
+    unsigned upload_flags = 0;
+    struct pipe_box box_upload;
 
     assert(This->base.pool != D3DPOOL_DEFAULT && This->managed.dirty);
+
+    if (This->base.pool == D3DPOOL_SYSTEMMEM && This->base.usage & D3DUSAGE_DYNAMIC) {
+        struct pipe_box region_already_valid;
+        struct pipe_box conflicting_region;
+        struct pipe_box *valid_region = &This->managed.valid_region;
+        struct pipe_box *required_valid_region = &This->managed.required_valid_region;
+        struct pipe_box *filled_region = &This->managed.filled_region;
+
+        /* Try to upload SYSTEMMEM DYNAMIC in an efficient fashion.
+         * Unlike non-dynamic, for which we upload the whole dirty region, try to
+         * upload only the data needed for the draw. The draw call preparation
+         * fills This->managed.required_valid_region for that. */
+        u_box_intersect_1d(&region_already_valid,
+                           valid_region,
+                           required_valid_region);
+        /* If the required valid region is already valid, nothing to do */
+        if (region_already_valid.x == required_valid_region->x &&
+            region_already_valid.width == required_valid_region->width) {
+            u_box_1d(0, 0, required_valid_region);
+            return;
+        }
+        /* (Try to) remove valid areas from the region to upload */
+        u_box_try_remove_region_1d(&box_upload,
+                                   required_valid_region,
+                                   &region_already_valid);
+        assert(box_upload.width > 0);
+        /* To maintain the valid region correctly, as we will union it with
+         * box_upload later, we must ensure box_upload is consecutive with valid_region */
+        if (box_upload.x > valid_region->x + valid_region->width && valid_region->width > 0) {
+            box_upload.width = box_upload.x + box_upload.width - (valid_region->x + valid_region->width);
+            box_upload.x = valid_region->x + valid_region->width;
+        } else if (box_upload.x + box_upload.width < valid_region->x && valid_region->width > 0) {
+            box_upload.width = valid_region->x - box_upload.x;
+        }
+        /* There is a conflict if some areas that are not valid, but were filled for previous
+         * draw calls, intersect with the region we plan to upload. Note that by construction
+         * valid_region IS included in filled_region, and thus so is region_already_valid. */
+        u_box_intersect_1d(&conflicting_region, &box_upload, filled_region);
+        /* As box_upload could still contain region_already_valid, check the intersection
+         * doesn't happen to be exactly region_already_valid (it cannot be smaller, see above) */
+        if (This->managed.can_unsynchronized && (conflicting_region.width == 0 ||
+            (conflicting_region.x == region_already_valid.x &&
+             conflicting_region.width == region_already_valid.width))) {
+            /* No conflicts. */
+            upload_flags |= PIPE_MAP_UNSYNCHRONIZED;
+        } else {
+            /* We cannot use PIPE_MAP_UNSYNCHRONIZED. We must choose between no flag and DISCARD.
+             * Criteria to discard:
+             * . Most of the resource was filled (but some apps do allocate a big buffer
+             *   and only use a small part in a round fashion)
+             * . The region to upload is very small compared to the filled region and
+             *   at the start of the buffer (hints at round usage starting again)
+             * . The region to upload is very big compared to the required region
+             * . We have not discarded yet this frame */
+            if (filled_region->width > (This->size / 2) ||
+                (10 * box_upload.width < filled_region->width &&
+                 box_upload.x < (filled_region->x + filled_region->width)/2) ||
+                box_upload.width > 2 * required_valid_region->width ||
+                This->managed.frame_count_last_discard != device->frame_count) {
+                /* Avoid discarding too much by discarding only if most of the buffer
+                 * has been used */
+                DBG_FLAG(DBG_INDEXBUFFER|DBG_VERTEXBUFFER,
+                         "Uploading %p DISCARD: valid %d %d, filled %d %d, required %d %d, box_upload %d %d, already_valid %d %d, conflicting %d %d\n",
+                         This, valid_region->x, valid_region->width, filled_region->x, filled_region->width,
+                         required_valid_region->x, required_valid_region->width, box_upload.x, box_upload.width,
+                         region_already_valid.x, region_already_valid.width, conflicting_region.x, conflicting_region.width);
+                upload_flags |= PIPE_MAP_DISCARD_WHOLE_RESOURCE;
+                u_box_1d(0, 0, filled_region);
+                u_box_1d(0, 0, valid_region);
+                box_upload = This->managed.required_valid_region;
+                This->managed.can_unsynchronized = true;
+                This->managed.frame_count_last_discard = device->frame_count;
+            } else {
+                /* Once we upload without UNSYNCHRONIZED, we cannot use it anymore.
+                 * TODO: for SYSTEMMEM resources which hit this,
+                 * it would probably be better to use stream_uploader */
+                This->managed.can_unsynchronized = false;
+            }
+        }
+
+        u_box_union_1d(filled_region,
+                       filled_region,
+                       &box_upload);
+        u_box_union_1d(valid_region,
+                       valid_region,
+                       &box_upload);
+        u_box_1d(0, 0, required_valid_region);
+    } else
+        box_upload = This->managed.dirty_box;
+
+    if (box_upload.x == 0 && box_upload.width == This->size) {
+        upload_flags |= PIPE_MAP_DISCARD_WHOLE_RESOURCE;
+    }
+
     if (This->managed.pending_upload) {
         u_box_union_1d(&This->managed.upload_pending_regions,
                        &This->managed.upload_pending_regions,
-                       &This->managed.dirty_box);
+                       &box_upload);
     } else {
-        This->managed.upload_pending_regions = This->managed.dirty_box;
+        This->managed.upload_pending_regions = box_upload;
     }
+
+    DBG_FLAG(DBG_INDEXBUFFER|DBG_VERTEXBUFFER,
+             "Uploading %p, offset=%d, size=%d, Flags=0x%x\n",
+             This, box_upload.x, box_upload.width, upload_flags);
     nine_context_range_upload(device, &This->managed.pending_upload,
                               (struct NineUnknown *)This,
                               This->base.resource,
-                              This->managed.dirty_box.x,
-                              This->managed.dirty_box.width,
-                              (char *)This->managed.data + This->managed.dirty_box.x);
+                              box_upload.x,
+                              box_upload.width,
+                              upload_flags,
+                              (char *)This->managed.data + box_upload.x);
     This->managed.dirty = FALSE;
 }
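The discard heuristic in NineBuffer9_Upload above is easier to reason about in isolation. A standalone sketch (hypothetical helper mirroring the four criteria; `range` stands in for the 1D part of pipe_box): with buf_size = 1000 and filled = {0, 600}, the first test fires; with box_upload = {10, 20} and filled = {0, 800}, the second one does, hinting that round-robin usage is starting again.

#include <stdbool.h>

struct range { int x, width; };

static bool
should_discard(struct range filled, struct range box_upload,
               struct range required, int buf_size,
               unsigned frame_count, unsigned frame_count_last_discard)
{
    return filled.width > buf_size / 2 ||                    /* mostly filled */
           (10 * box_upload.width < filled.width &&
            box_upload.x < (filled.x + filled.width) / 2) || /* small write near the start */
           box_upload.width > 2 * required.width ||          /* upload much larger than needed */
           frame_count_last_discard != frame_count;          /* no discard yet this frame */
}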
diff --git a/src/gallium/frontends/nine/device9.c b/src/gallium/frontends/nine/device9.c
index 0a28440b788..26f09944d54 100644
--- a/src/gallium/frontends/nine/device9.c
+++ b/src/gallium/frontends/nine/device9.c
@@ -249,6 +249,8 @@ NineDevice9_ctor( struct NineDevice9 *This,
      * still succeeds when texture allocation fails. */
     This->available_texture_limit = This->available_texture_mem * 5LL / 100LL;
 
+    This->frame_count = 0; /* Used to check if events occur in the same frame */
+
     /* create implicit swapchains */
     This->nswapchains = ID3DPresentGroup_GetMultiheadCount(This->present);
     This->swapchains = CALLOC(This->nswapchains,
@@ -2912,15 +2914,50 @@ NineAfterDraw( struct NineDevice9 *This )
     }
 }
 
+#define IS_SYSTEMMEM_DYNAMIC(t) ((t) && (t)->base.pool == D3DPOOL_SYSTEMMEM && (t)->base.usage & D3DUSAGE_DYNAMIC)
+
+/* Indicates the region needed right now for these buffers and adds them to the list
+ * of buffers to process in NineBeforeDraw.
+ * The reason we don't do the upload right now is to generate smaller code (no
+ * duplication of the NineBuffer9_Upload inline) and to have one upload (of the correct size)
+ * if a vertex buffer is used twice as input to the draw call. */
+static void
+NineTrackSystemmemDynamic( struct NineBuffer9 *This, unsigned start, unsigned width )
+{
+    struct pipe_box box;
+
+    u_box_1d(start, width, &box);
+    u_box_union_1d(&This->managed.required_valid_region,
+                   &This->managed.required_valid_region,
+                   &box);
+    This->managed.dirty = TRUE;
+    BASEBUF_REGISTER_UPDATE(This);
+}
+
 HRESULT NINE_WINAPI
 NineDevice9_DrawPrimitive( struct NineDevice9 *This,
                            D3DPRIMITIVETYPE PrimitiveType,
                            UINT StartVertex,
                            UINT PrimitiveCount )
 {
+    unsigned i;
     DBG("iface %p, PrimitiveType %u, StartVertex %u, PrimitiveCount %u\n",
         This, PrimitiveType, StartVertex, PrimitiveCount);
 
+    /* Tracking for dynamic SYSTEMMEM */
+    for (i = 0; i < This->caps.MaxStreams; i++) {
+        unsigned stride = This->state.vtxbuf[i].stride;
+        if (IS_SYSTEMMEM_DYNAMIC((struct NineBuffer9*)This->state.stream[i])) {
+            unsigned start = This->state.vtxbuf[i].buffer_offset + StartVertex * stride;
+            unsigned full_size = This->state.stream[i]->base.size;
+            unsigned num_vertices = prim_count_to_vertex_count(PrimitiveType, PrimitiveCount);
+            unsigned size = MIN2(full_size-start, num_vertices * stride);
+            if (!stride) /* Instancing. Not sure what to do. Require the full buffer. */
+                size = full_size;
+            NineTrackSystemmemDynamic(&This->state.stream[i]->base, start, size);
+        }
+    }
+
     NineBeforeDraw(This);
     nine_context_draw_primitive(This, PrimitiveType, StartVertex, PrimitiveCount);
     NineAfterDraw(This);
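A worked example (hypothetical numbers) of the range the loop above registers: drawing 50 triangles (D3DPT_TRIANGLELIST, so 150 vertices) from StartVertex = 100 with a 32-byte stride and no buffer_offset tracks bytes [3200, 8000).

#include "util/u_math.h" /* MIN2 */

static void
draw_primitive_tracking_example(void)
{
    unsigned stride = 32, buffer_offset = 0, StartVertex = 100;
    unsigned full_size = 64 * 1024;
    /* prim_count_to_vertex_count(D3DPT_TRIANGLELIST, 50) == 150 */
    unsigned num_vertices = 150;
    unsigned start = buffer_offset + StartVertex * stride;          /* 3200 */
    unsigned size = MIN2(full_size - start, num_vertices * stride); /* 4800 */
    /* -> NineTrackSystemmemDynamic(buf, 3200, 4800) grows
     *    required_valid_region to cover bytes [3200, 8000) */
    (void)start; (void)size;
}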
@@ -2937,6 +2974,7 @@ NineDevice9_DrawIndexedPrimitive( struct NineDevice9 *This,
                                   UINT StartIndex,
                                   UINT PrimitiveCount )
 {
+    unsigned i, num_indices;
     DBG("iface %p, PrimitiveType %u, BaseVertexIndex %u, MinVertexIndex %u "
         "NumVertices %u, StartIndex %u, PrimitiveCount %u\n",
         This, PrimitiveType, BaseVertexIndex, MinVertexIndex, NumVertices,
@@ -2945,6 +2983,28 @@ NineDevice9_DrawIndexedPrimitive( struct NineDevice9 *This,
     user_assert(This->state.idxbuf, D3DERR_INVALIDCALL);
     user_assert(This->state.vdecl, D3DERR_INVALIDCALL);
 
+    num_indices = prim_count_to_vertex_count(PrimitiveType, PrimitiveCount);
+
+    /* Tracking for dynamic SYSTEMMEM */
+    if (IS_SYSTEMMEM_DYNAMIC(&This->state.idxbuf->base))
+        NineTrackSystemmemDynamic(&This->state.idxbuf->base,
+                                  StartIndex * This->state.idxbuf->index_size,
+                                  num_indices * This->state.idxbuf->index_size);
+
+    for (i = 0; i < This->caps.MaxStreams; i++) {
+        if (IS_SYSTEMMEM_DYNAMIC((struct NineBuffer9*)This->state.stream[i])) {
+            uint32_t stride = This->state.vtxbuf[i].stride;
+            uint32_t full_size = This->state.stream[i]->base.size;
+            uint32_t start, stop;
+
+            start = MAX2(0, This->state.vtxbuf[i].buffer_offset+(MinVertexIndex+BaseVertexIndex)*stride);
+            stop = This->state.vtxbuf[i].buffer_offset+(MinVertexIndex+NumVertices+BaseVertexIndex)*stride;
+            stop = MIN2(stop, full_size);
+            NineTrackSystemmemDynamic(&This->state.stream[i]->base,
+                                      start, stop-start);
+        }
+    }
+
     NineBeforeDraw(This);
     nine_context_draw_indexed_primitive(This, PrimitiveType, BaseVertexIndex,
                                         MinVertexIndex, NumVertices, StartIndex,
diff --git a/src/gallium/frontends/nine/device9.h b/src/gallium/frontends/nine/device9.h
index ba9295206de..ac91a0906e2 100644
--- a/src/gallium/frontends/nine/device9.h
+++ b/src/gallium/frontends/nine/device9.h
@@ -163,6 +163,8 @@ struct NineDevice9
     boolean swvp;
     /* pure device */
     boolean pure;
+
+    unsigned frame_count; /* Overflow is ok (only checked for equality) */
 };
 
 static inline struct NineDevice9 *
 NineDevice9( void *data )
diff --git a/src/gallium/frontends/nine/nine_state.c b/src/gallium/frontends/nine/nine_state.c
index 113f41e8649..136e994aafc 100644
--- a/src/gallium/frontends/nine/nine_state.c
+++ b/src/gallium/frontends/nine/nine_state.c
@@ -2522,6 +2522,7 @@ CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_range_upload,
                                ARG_BIND_RES(struct pipe_resource, res),
                                ARG_VAL(unsigned, offset),
                                ARG_VAL(unsigned, size),
+                               ARG_VAL(unsigned, usage),
                                ARG_VAL(const void *, data))
 {
     struct nine_context *context = &device->context;
@@ -2529,7 +2530,7 @@ CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_range_upload,
     /* Binding src_ref avoids release before upload */
     (void)src_ref;
 
-    context->pipe->buffer_subdata(context->pipe, res, 0, offset, size, data);
+    context->pipe->buffer_subdata(context->pipe, res, usage, offset, size, data);
 }
 
 CSMT_ITEM_NO_WAIT_WITH_COUNTER(nine_context_box_upload,
diff --git a/src/gallium/frontends/nine/nine_state.h b/src/gallium/frontends/nine/nine_state.h
index b6b63877558..d5dfa1065a4 100644
--- a/src/gallium/frontends/nine/nine_state.h
+++ b/src/gallium/frontends/nine/nine_state.h
@@ -574,6 +574,7 @@ nine_context_range_upload(struct NineDevice9 *device,
                           struct pipe_resource *res,
                           unsigned offset,
                           unsigned size,
+                          unsigned usage,
                           const void *data);
 
 void
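At the Gallium level, the new `usage` argument is simply forwarded to pipe_context::buffer_subdata, whose third parameter takes PIPE_MAP_* flags. A minimal sketch of the two flag combinations the upload path above produces (real pipe API; flag values from pipe/p_defines.h):

#include <stdbool.h>
#include "pipe/p_context.h"
#include "pipe/p_defines.h"
#include "pipe/p_state.h"

static void
upload_with_flags(struct pipe_context *pipe, struct pipe_resource *res,
                  unsigned offset, unsigned size, const void *data,
                  bool unsynchronized, bool discard)
{
    unsigned usage = 0;

    if (unsynchronized)
        usage |= PIPE_MAP_UNSYNCHRONIZED;         /* no wait on previous GPU use */
    if (discard)
        usage |= PIPE_MAP_DISCARD_WHOLE_RESOURCE; /* driver may rename the storage */
    pipe->buffer_subdata(pipe, res, usage, offset, size, data);
}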
diff --git a/src/gallium/frontends/nine/swapchain9.c b/src/gallium/frontends/nine/swapchain9.c
index 44aa75a949b..4451532e6cf 100644
--- a/src/gallium/frontends/nine/swapchain9.c
+++ b/src/gallium/frontends/nine/swapchain9.c
@@ -931,6 +931,7 @@ bypass_rendering:
     }
 
     This->base.device->end_scene_since_present = 0;
+    This->base.device->frame_count++;
 
     return D3D_OK;
 }