diff --git a/src/gallium/drivers/radeonsi/si_compute_blit.c b/src/gallium/drivers/radeonsi/si_compute_blit.c index cddca6e6fc5..fe7ffe8b6ab 100644 --- a/src/gallium/drivers/radeonsi/si_compute_blit.c +++ b/src/gallium/drivers/radeonsi/si_compute_blit.c @@ -526,6 +526,41 @@ void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct p } } +void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, + uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags) +{ + if (!size) + return; + + if (!sctx->cs_ubyte_to_ushort) + sctx->cs_ubyte_to_ushort = si_create_ubyte_to_ushort_compute_shader(sctx); + + enum si_coherency coher = SI_COHERENCY_SHADER; + + si_improve_sync_flags(sctx, dst, src, &flags); + + struct pipe_grid_info info = {}; + info.block[0] = si_determine_wave_size(sctx->screen, NULL); + info.block[1] = 1; + info.block[2] = 1; + info.grid[0] = DIV_ROUND_UP(size, info.block[0]); + info.grid[1] = 1; + info.grid[2] = 1; + info.last_block[0] = size % info.block[0]; + + struct pipe_shader_buffer sb[2] = {}; + sb[0].buffer = dst; + sb[0].buffer_offset = dst_offset; + sb[0].buffer_size = dst->width0; + + sb[1].buffer = src; + sb[1].buffer_offset = src_offset; + sb[1].buffer_size = src->width0; + + si_launch_grid_internal_ssbos(sctx, &info, sctx->cs_ubyte_to_ushort, flags, coher, + 2, sb, 0x1); +} + static unsigned set_work_size(struct pipe_grid_info *info, unsigned block_x, unsigned block_y, unsigned block_z, unsigned work_x, unsigned work_y, unsigned work_z) diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 264474f4e0f..a65f8b7d307 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -276,6 +276,8 @@ static void si_destroy_context(struct pipe_context *context) sctx->b.delete_compute_state(&sctx->b, sctx->cs_clear_buffer_rmw); if (sctx->cs_copy_buffer) sctx->b.delete_compute_state(&sctx->b, sctx->cs_copy_buffer); + if (sctx->cs_ubyte_to_ushort) + sctx->b.delete_compute_state(&sctx->b, sctx->cs_ubyte_to_ushort); for (unsigned i = 0; i < ARRAY_SIZE(sctx->cs_copy_image); i++) { for (unsigned j = 0; j < ARRAY_SIZE(sctx->cs_copy_image[i]); j++) { for (unsigned k = 0; k < ARRAY_SIZE(sctx->cs_copy_image[i][j]); k++) { diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index 5c3b31842f2..6277c448de2 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -1025,6 +1025,7 @@ struct si_context { void *cs_clear_buffer; void *cs_clear_buffer_rmw; void *cs_copy_buffer; + void *cs_ubyte_to_ushort; void *cs_copy_image[3][2][2]; /* [wg_dim-1][src_is_1d][dst_is_1d] */ void *cs_clear_render_target; void *cs_clear_render_target_1d_array; @@ -1491,6 +1492,8 @@ void si_compute_clear_buffer_rmw(struct si_context *sctx, struct pipe_resource * unsigned flags, enum si_coherency coher); void si_copy_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags); +void si_compute_shorten_ubyte_buffer(struct si_context *sctx, struct pipe_resource *dst, struct pipe_resource *src, + uint64_t dst_offset, uint64_t src_offset, unsigned size, unsigned flags); bool si_compute_copy_image(struct si_context *sctx, struct pipe_resource *dst, unsigned dst_level, struct pipe_resource *src, unsigned src_level, unsigned dstx, unsigned dsty, unsigned dstz, const struct pipe_box *src_box, @@ -1654,6 +1657,7 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type, unsigned num_layers); void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread, bool dst_stream_cache_policy, bool is_copy); +void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx); void *si_create_clear_buffer_rmw_cs(struct si_context *sctx); void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type); void *si_clear_12bytes_buffer_shader(struct si_context *sctx); diff --git a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c index ee9bf2fc159..a689460066a 100644 --- a/src/gallium/drivers/radeonsi/si_shaderlib_nir.c +++ b/src/gallium/drivers/radeonsi/si_shaderlib_nir.c @@ -624,6 +624,37 @@ void *si_clear_12bytes_buffer_shader(struct si_context *sctx) return create_shader_state(sctx, b.shader); } +void *si_create_ubyte_to_ushort_compute_shader(struct si_context *sctx) +{ + const nir_shader_compiler_options *options = + sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE); + + unsigned store_qualifier = ACCESS_COHERENT | ACCESS_RESTRICT; + + /* Don't cache loads, because there is no reuse. */ + unsigned load_qualifier = store_qualifier | ACCESS_NON_TEMPORAL; + + nir_builder b = + nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "ubyte_to_ushort"); + + unsigned default_wave_size = si_determine_wave_size(sctx->screen, NULL); + + b.shader->info.workgroup_size[0] = default_wave_size; + b.shader->info.workgroup_size[1] = 1; + b.shader->info.workgroup_size[2] = 1; + b.shader->info.num_ssbos = 2; + + nir_def *load_address = get_global_ids(&b, 1); + nir_def *store_address = nir_imul_imm(&b, load_address, 2); + + nir_def *ubyte_value = nir_load_ssbo(&b, 1, 8, nir_imm_int(&b, 1), + load_address, .access = load_qualifier); + nir_store_ssbo(&b, nir_u2uN(&b, ubyte_value, 16), nir_imm_int(&b, 0), + store_address, .access = store_qualifier); + + return create_shader_state(sctx, b.shader); +} + /* Create a compute shader implementing clear_buffer or copy_buffer. */ void *si_create_dma_compute_shader(struct si_context *sctx, unsigned num_dwords_per_thread, bool dst_stream_cache_policy, bool is_copy) diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp index 737f873f703..95e1478c5cc 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.cpp +++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp @@ -2044,23 +2044,22 @@ static void si_draw(struct pipe_context *ctx, /* Translate or upload, if needed. */ /* 8-bit indices are supported on GFX8. */ if (!IS_DRAW_VERTEX_STATE && GFX_VERSION <= GFX7 && index_size == 1) { - unsigned start, count, start_offset, size, offset; - void *ptr; + unsigned start, count, start_offset, size; si_get_draw_start_count(sctx, info, indirect, draws, num_draws, &start, &count); start_offset = start * 2; size = count * 2; - indexbuf = NULL; - u_upload_alloc(ctx->stream_uploader, start_offset, size, - si_optimal_tcc_alignment(sctx, size), &offset, &indexbuf, &ptr); + /* Don't use u_upload_alloc because we don't need to map the buffer for CPU access. */ + indexbuf = pipe_buffer_create(&sctx->screen->b, 0, PIPE_USAGE_IMMUTABLE, start_offset + size); if (unlikely(!indexbuf)) return; - util_shorten_ubyte_elts_to_userptr(&sctx->b, info, 0, 0, index_offset + start, count, ptr); + si_compute_shorten_ubyte_buffer(sctx, indexbuf, info->index.resource, + start_offset, index_offset + start, count, + SI_OP_SYNC_AFTER); - /* info->start will be added by the drawing code */ - index_offset = offset - start_offset; + index_offset = 0; index_size = 2; } else if (!IS_DRAW_VERTEX_STATE && info->has_user_indices) { unsigned start_offset;