From 64b5823d33f1b366905a31b8eeb722afcbb1c139 Mon Sep 17 00:00:00 2001 From: Calder Young Date: Fri, 20 Mar 2026 16:48:55 -0700 Subject: [PATCH] blorp: Work around sampler overfetch for buffer copies First, the surface dimensions are used to determine the range of valid pages that the data in the buffer overlaps, then rows are removed from the surface until it does not overfetch into any neighboring pages. If any rows were removed, an extra BTI is set up with a texel buffer that views the contents of all the rows that were removed, and the shader is compiled with a branch to sample the last rows through the texel buffer instead of the main surface. Using the texel buffer allows it to access the last rows without dealing with overfetch or weird alignment hacks, and restricting texel buffer usage to just the part of the surface that can't be accessed safely ensures that we don't significantly impact performance for any buffer to image copy that is unlucky enough to be close to a page boundary. 
Co-authored-by: Lionel Landwerlin Reviewed-by: Lionel Landwerlin Part-of: --- src/gallium/drivers/crocus/crocus_blorp.c | 1 + src/gallium/drivers/iris/iris_blorp.c | 1 + src/intel/blorp/blorp.c | 20 +++ src/intel/blorp/blorp.h | 34 +++-- src/intel/blorp/blorp_blit.c | 159 +++++++++++++++++++--- src/intel/blorp/blorp_genX_exec_brw.h | 77 +++++++++-- src/intel/blorp/blorp_genX_exec_elk.h | 9 +- src/intel/blorp/blorp_priv.h | 33 ++++- src/intel/vulkan/anv_blorp.c | 35 +++-- src/intel/vulkan/genX_blorp_exec.c | 11 -- src/intel/vulkan_hasvk/anv_blorp.c | 9 ++ src/intel/vulkan_hasvk/genX_blorp_exec.c | 8 -- 12 files changed, 314 insertions(+), 83 deletions(-) diff --git a/src/gallium/drivers/crocus/crocus_blorp.c b/src/gallium/drivers/crocus/crocus_blorp.c index ba8425a518b..3abe041d75d 100644 --- a/src/gallium/drivers/crocus/crocus_blorp.c +++ b/src/gallium/drivers/crocus/crocus_blorp.c @@ -447,5 +447,6 @@ genX(crocus_init_blorp)(struct crocus_context *ice) blorp_init_elk(&ice->blorp, ice, &screen->isl_dev, screen->compiler, NULL); ice->blorp.lookup_shader = crocus_blorp_lookup_shader; ice->blorp.upload_shader = crocus_blorp_upload_shader; + ice->blorp.get_surface_address = blorp_get_surface_address; ice->blorp.exec = crocus_blorp_exec; } diff --git a/src/gallium/drivers/iris/iris_blorp.c b/src/gallium/drivers/iris/iris_blorp.c index 53329d90a04..39737142d11 100644 --- a/src/gallium/drivers/iris/iris_blorp.c +++ b/src/gallium/drivers/iris/iris_blorp.c @@ -513,6 +513,7 @@ genX(init_blorp)(struct iris_context *ice) #endif ice->blorp.lookup_shader = iris_blorp_lookup_shader; ice->blorp.upload_shader = iris_blorp_upload_shader; + ice->blorp.get_surface_address = blorp_get_surface_address; ice->blorp.exec = iris_blorp_exec; ice->blorp.enable_tbimr = screen->driconf.enable_tbimr; } diff --git a/src/intel/blorp/blorp.c b/src/intel/blorp/blorp.c index 29405390110..741036ad34a 100644 --- a/src/intel/blorp/blorp.c +++ b/src/intel/blorp/blorp.c @@ -253,6 +253,26 @@ 
blorp_surface_info_init(struct blorp_batch *batch, info->surf.phys_level0_sa.w += surf->tile_x_sa; info->surf.phys_level0_sa.h += surf->tile_y_sa; } + + if (blorp->isl_dev->requires_padding && !is_dest && + (batch->flags & BLORP_BATCH_SRC_UNPADDED)) { + blorp_assert_is_buffer(info->surf, info->view); + + /* Infers the page boundaries for a buffer to image copy based on the + * surface address and dimensions, following Vulkan semantics to + * determine the extent of the final row. + */ + uint64_t size_B = + (uint64_t) info->surf.phys_level0_sa.w * + (isl_format_get_layout(info->view.format)->bpb / 8) + + (uint64_t) (info->surf.phys_level0_sa.h - 1) * + info->surf.row_pitch_B; + + uint64_t mask = blorp->isl_dev->info->mem_alignment - 1; + uint64_t address = batch->blorp->get_surface_address(batch, info->addr); + info->page_base = address & ~mask; + info->page_limit = (address + size_B + mask) & ~mask; + } } diff --git a/src/intel/blorp/blorp.h b/src/intel/blorp/blorp.h index 8109c12c27f..b07006777f5 100644 --- a/src/intel/blorp/blorp.h +++ b/src/intel/blorp/blorp.h @@ -79,6 +79,20 @@ enum blorp_dynamic_state { BLORP_DYNAMIC_STATE_COUNT, }; +struct blorp_address { + void *buffer; + int64_t offset; + unsigned reloc_flags; + uint32_t mocs; + + /** + * True if this buffer is intended to live in device-local memory. + * This is only a performance hint; it's OK to set it to true even + * if eviction has temporarily forced the buffer to system memory. 
+ */ + bool local_hint; +}; + struct blorp_context { void *driver_ctx; @@ -105,6 +119,8 @@ struct blorp_context { const void *prog_data, uint32_t prog_data_size, uint32_t *kernel_out, void *prog_data_out); + uint64_t (*get_surface_address)(struct blorp_batch *batch, + struct blorp_address addr); void (*exec)(struct blorp_batch *batch, const struct blorp_params *params); struct blorp_config config; @@ -155,6 +171,10 @@ enum blorp_batch_flags { * Mostly for debug */ BLORP_BATCH_DISABLE_VF_DISTRIBUTION = BITFIELD_BIT(6), + + /** Source buffer is unpadded and needs careful accesses + */ + BLORP_BATCH_SRC_UNPADDED = BITFIELD_BIT(7), }; struct blorp_batch { @@ -186,20 +206,6 @@ blorp_batch_isl_copy_usage(const struct blorp_batch *batch, bool is_dest, return usage; } -struct blorp_address { - void *buffer; - int64_t offset; - unsigned reloc_flags; - uint32_t mocs; - - /** - * True if this buffer is intended to live in device-local memory. - * This is only a performance hint; it's OK to set it to true even - * if eviction has temporarily forced the buffer to system memory. 
- */ - bool local_hint; -}; - static inline bool blorp_address_is_null(struct blorp_address address) { diff --git a/src/intel/blorp/blorp_blit.c b/src/intel/blorp/blorp_blit.c index e81b7411a92..a25566a7154 100644 --- a/src/intel/blorp/blorp_blit.c +++ b/src/intel/blorp/blorp_blit.c @@ -45,6 +45,8 @@ struct blorp_blit_vars { nir_variable *v_src_offset; nir_variable *v_dst_offset; nir_variable *v_src_inv_size; + nir_variable *v_src_buffer_first_row; + nir_variable *v_src_buffer_row_pitch; }; static void @@ -60,6 +62,8 @@ blorp_blit_vars_init(nir_builder *b, struct blorp_blit_vars *v) LOAD_INPUT(src_offset, glsl_vector_type(GLSL_TYPE_UINT, 2)) LOAD_INPUT(dst_offset, glsl_vector_type(GLSL_TYPE_UINT, 2)) LOAD_INPUT(src_inv_size, glsl_vector_type(GLSL_TYPE_FLOAT, 2)) + LOAD_INPUT(src_buffer_first_row, glsl_uint_type()) + LOAD_INPUT(src_buffer_row_pitch, glsl_uint_type()) #undef LOAD_INPUT } @@ -224,6 +228,47 @@ blorp_nir_txf(nir_builder *b, struct blorp_blit_vars *v, return &tex->def; } +/* Same as blorp_nir_txf, except the last few rows may be loaded from a texel + * buffer bound to BLORP_TEXBUF_BT_INDEX instead to avoid page faults due to + * an unaligned source. 
+ */ +static nir_def * +blorp_nir_txf_buf(nir_builder *b, struct blorp_blit_vars *v, + nir_def *pos, nir_alu_type dst_type, + const struct intel_device_info *devinfo) +{ + nir_def *buf_start = nir_load_var(b, v->v_src_buffer_first_row); + + /* Just use if statements, non uniform texture access is expensive */ + nir_push_if(b, nir_ilt(b, nir_channel(b, pos, 1), buf_start)); + + nir_def *tex = blorp_nir_txf(b, v, pos, dst_type, devinfo); + + nir_push_else(b, NULL); + + /* Get the offset into the buffer if we're beyond src_buffer_first_row */ + pos = nir_vec2(b, + nir_iadd(b, + nir_imul(b, + nir_isub(b, + nir_channel(b, pos, 1), + buf_start), + nir_load_var(b, v->v_src_buffer_row_pitch)), + nir_channel(b, pos, 0)), + nir_imm_int(b, 0)); + + nir_tex_instr *buf = + blorp_create_nir_tex_instr(b, v, nir_texop_txf, pos, 1, dst_type, devinfo); + + buf->texture_index = BLORP_TEXBUF_BT_INDEX; + buf->sampler_dim = GLSL_SAMPLER_DIM_BUF; + + nir_builder_instr_insert(b, &buf->instr); + + nir_pop_if(b, NULL); + return nir_if_phi(b, tex, &buf->def); +} + static nir_def * blorp_nir_txf_ms(nir_builder *b, struct blorp_blit_vars *v, nir_def *pos, nir_alu_type dst_type, @@ -1322,6 +1367,7 @@ blorp_build_nir_shader(struct blorp_context *blorp, case BLORP_FILTER_NONE: case BLORP_FILTER_NEAREST: case BLORP_FILTER_SAMPLE_0: + assert(!key->need_src_buffer || key->src_samples == 1); /* We're going to use texelFetch, so we need integers */ if (src_pos->num_components == 2) { src_pos = nir_f2i32(&b, src_pos); @@ -1364,7 +1410,9 @@ blorp_build_nir_shader(struct blorp_context *blorp, * the texturing unit, will cause data to be read from the correct * memory location. So we can fetch the texel now. 
*/ - if (key->src_samples == 1) { + if (key->need_src_buffer) { + color = blorp_nir_txf_buf(&b, &v, src_pos, key->texture_data_type, devinfo); + } else if (key->src_samples == 1) { color = blorp_nir_txf(&b, &v, src_pos, key->texture_data_type, devinfo); } else { color = blorp_nir_txf_ms(&b, &v, src_pos, key->texture_data_type, devinfo); @@ -1373,6 +1421,7 @@ blorp_build_nir_shader(struct blorp_context *blorp, case BLORP_FILTER_BILINEAR: assert(!key->src_tiled_w); + assert(!key->need_src_buffer); assert(key->tex_samples == key->src_samples); assert(key->tex_layout == key->src_layout); @@ -1389,6 +1438,7 @@ blorp_build_nir_shader(struct blorp_context *blorp, case BLORP_FILTER_MIN_SAMPLE: case BLORP_FILTER_MAX_SAMPLE: assert(!key->src_tiled_w); + assert(!key->need_src_buffer); assert(key->tex_samples == key->src_samples); assert(key->tex_layout == key->src_layout); @@ -2011,6 +2061,48 @@ surf_fake_rgb_with_red(const struct isl_device *isl_dev, info->surf.format = info->view.format = red_format; } +/** + * Converts the overfetching part of a linear 2D surface to a 1D buffer, this + * is part of a workaround for performing buffer-to-image-copies when source + * straddles an extra page due to a misaligned sampler cache. 
+ */ +static inline void +blorp_surf_convert_overfetch_to_buffer(struct blorp_batch *batch, + struct blorp_surface_info *info) +{ + const struct isl_device *isl_dev = batch->blorp->isl_dev; + + blorp_assert_is_buffer(info->surf, info->view); + assert(isl_format_block_is_1x1x1(info->view.format)); + + uint64_t address = batch->blorp->get_surface_address(batch, info->addr); + uint64_t max_size_B = info->page_limit - address; + uint64_t overfetch_B = + isl_surf_get_sampler_overfetch_size_B(isl_dev, &info->surf, &info->view); + + if (overfetch_B > max_size_B) { + uint32_t rows = (uint32_t) DIV_ROUND_UP(overfetch_B - max_size_B, + info->surf.row_pitch_B); + + /* We could overflow the subtraction below in some cases */ + rows = MIN2(rows, info->surf.logical_level0_px.h); + + info->buffer = true; + info->buffer_rows = rows; + info->surf.logical_level0_px.h -= rows; + info->surf.phys_level0_sa.h -= rows; + info->surf.size_B -= rows * info->surf.row_pitch_B; + + if (info->surf.logical_level0_px.h == 0) { + info->surf.size_B = 0; + return; + } + + assert(isl_surf_get_sampler_overfetch_size_B(isl_dev, + &info->surf, &info->view) <= max_size_B); + } +} + enum blit_shrink_status { BLIT_NO_SHRINK = 0, BLIT_SRC_WIDTH_SHRINK = (1 << 0), @@ -2359,6 +2451,21 @@ try_blorp_blit(struct blorp_batch *batch, key->use_kill = true; } + if (batch->blorp->isl_dev->requires_padding && + (batch->flags & BLORP_BATCH_SRC_UNPADDED)) { + params->src.view.usage |= ISL_SURF_USAGE_NO_ARRAY_OVERFETCH_BIT; + blorp_surf_convert_overfetch_to_buffer(batch, &params->src); + } + + key->need_src_buffer = params->src.buffer; + if (key->need_src_buffer) { + params->wm_inputs.blit.src_buffer_first_row = + params->src.surf.logical_level0_px.h; + params->wm_inputs.blit.src_buffer_row_pitch = + params->src.surf.row_pitch_B / + (isl_format_get_layout(params->src.view.format)->bpb / 8); + } + if (compute) { if (!blorp_get_blit_kernel_cs(batch, params, key)) return 0; @@ -2434,9 +2541,11 @@ shrink_surface_params(const 
struct isl_device *dev, struct blorp_surface_info *info, double *x0, double *x1, double *y0, double *y1) { - uint64_t offset_B; + uint64_t start_offset_B; + uint64_t end_offset_B; uint32_t x_offset_sa, y_offset_sa, size; struct isl_extent2d px_size_sa; + struct isl_extent4d surf_size_sa; int adjust; blorp_surf_convert_to_single_slice(dev, info); @@ -2449,19 +2558,28 @@ shrink_surface_params(const struct isl_device *dev, */ x_offset_sa = (uint32_t)*x0 * px_size_sa.w + info->tile_x_sa; y_offset_sa = (uint32_t)*y0 * px_size_sa.h + info->tile_y_sa; + surf_size_sa = (struct isl_extent4d) { + .w = (uint32_t)ceil(*x1) * px_size_sa.w + info->tile_x_sa, + .h = (uint32_t)ceil(*y1) * px_size_sa.h + info->tile_y_sa, + .d = 1, + .a = 1, + }; + uint32_t tile_z_sa, tile_a; - isl_tiling_get_intratile_offset_sa(info->surf.tiling, info->surf.dim, - info->surf.msaa_layout, - info->surf.format, info->surf.samples, - info->surf.row_pitch_B, - info->surf.array_pitch_el_rows, - x_offset_sa, y_offset_sa, 0, 0, - &offset_B, - &info->tile_x_sa, &info->tile_y_sa, - &tile_z_sa, &tile_a); + isl_tiling_get_intratile_range_sa(info->surf.tiling, info->surf.dim, + info->surf.msaa_layout, + info->surf.format, info->surf.samples, + info->surf.row_pitch_B, + info->surf.array_pitch_el_rows, + x_offset_sa, y_offset_sa, 0, 0, + surf_size_sa, + &start_offset_B, + &end_offset_B, + &info->tile_x_sa, &info->tile_y_sa, + &tile_z_sa, &tile_a); assert(tile_z_sa == 0 && tile_a == 0); - info->addr.offset += offset_B; + info->addr.offset += start_offset_B; adjust = (int)info->tile_x_sa / px_size_sa.w - (int)*x0; *x0 += adjust; @@ -2481,6 +2599,7 @@ shrink_surface_params(const struct isl_device *dev, info->surf.logical_level0_px.height = size; info->surf.phys_level0_sa.height = size * px_size_sa.h; + info->surf.size_B = end_offset_B - start_offset_B; info->surf.usage |= ISL_SURF_USAGE_NO_OVERFETCH_PADDING_BIT; /* Stomp the 64B alignment because we set NO_OVERFETCH_PADDING_BIT */ @@ -3085,7 +3204,8 @@ 
blorp_copy_get_formats(const struct isl_device *isl_dev, static int get_max_format_scale(const struct isl_device *isl_dev, const struct blorp_surface_info *info, - uint32_t x, uint32_t width, uint32_t height) + uint32_t x, uint32_t width, uint32_t height, + bool unpadded) { const bool full_width = u_minify(info->surf.logical_level0_px.width, info->view.base_level) == width; @@ -3168,9 +3288,10 @@ get_max_format_scale(const struct isl_device *isl_dev, continue; } - if (!(info->view.usage & ISL_SURF_USAGE_TEXTURE_BIT)) { - /* All surface types except for textures need their row pitch aligned - * to the pixel block size. + if (!(info->view.usage & ISL_SURF_USAGE_TEXTURE_BIT) || + (isl_dev->requires_padding && unpadded)) { + /* All surface types except for padded textures need their row pitch + * aligned to the pixel block size. */ if (info->surf.row_pitch_B * 8 % max_bpb) continue; @@ -3336,9 +3457,11 @@ blorp_copy(struct blorp_batch *batch, dst_width = src_width * src_fmtl->bpb / dst_fmtl->bpb; int max_fmt_scale_src = get_max_format_scale(isl_dev, &params.src, src_x, - src_width, src_height); + src_width, src_height, + batch->flags & BLORP_BATCH_SRC_UNPADDED); int max_fmt_scale_dst = get_max_format_scale(isl_dev, &params.dst, dst_x, - dst_width, dst_height); + dst_width, dst_height, + false); int copy_fmt_bpb = MIN2(src_fmtl->bpb * max_fmt_scale_src, dst_fmtl->bpb * max_fmt_scale_dst); diff --git a/src/intel/blorp/blorp_genX_exec_brw.h b/src/intel/blorp/blorp_genX_exec_brw.h index cf216d650b6..ba50dd8efc3 100644 --- a/src/intel/blorp/blorp_genX_exec_brw.h +++ b/src/intel/blorp/blorp_genX_exec_brw.h @@ -108,10 +108,6 @@ static void blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset, struct blorp_address address, uint32_t delta); -static uint64_t -blorp_get_surface_address(struct blorp_batch *batch, - struct blorp_address address); - #if GFX_VER < 10 static struct blorp_address blorp_get_surface_base_address(struct blorp_batch *batch); @@ -1241,11 +1237,11 @@ 
blorp_emit_surface_state(struct blorp_batch *batch, .aux_surf = &surface->aux_surf, .aux_usage = aux_usage, .aux_format = surface->aux_format, .address = - blorp_get_surface_address(batch, surface->addr), + batch->blorp->get_surface_address(batch, surface->addr), .aux_address = !use_aux_address ? 0 : - blorp_get_surface_address(batch, surface->aux_addr), + batch->blorp->get_surface_address(batch, surface->aux_addr), .clear_address = !use_clear_address ? 0 : - blorp_get_surface_address(batch, op_clear_addr), + batch->blorp->get_surface_address(batch, op_clear_addr), .mocs = surface->addr.mocs, .clear_color = surface->clear_color, .use_clear_address = use_clear_address); @@ -1287,6 +1283,45 @@ blorp_emit_surface_state(struct blorp_batch *batch, blorp_flush_range(batch, state, GENX(RENDER_SURFACE_STATE_length) * 4); } +/** + * Emits the remaining rows of the 2D linear surface as a texel buffer, this + * is part of a workaround for performing buffer to image copies when the + * surface is straddling an extra page due to a misaligned sampler cache. 
+ */ +static void +blorp_emit_buffer_surface_state(struct blorp_batch *batch, + const struct blorp_surface_info *surface, + void *state, uint32_t state_offset) +{ + blorp_assert_is_buffer(surface->surf, surface->view); + assert(isl_format_block_is_1x1x1(surface->view.format)); + + const struct isl_device *isl_dev = batch->blorp->isl_dev; + + struct blorp_address buffer_addr = surface->addr; + buffer_addr.offset += + surface->surf.row_pitch_B * surface->surf.logical_level0_px.h; + + uint32_t element_size_B = + isl_format_get_layout(surface->view.format)->bpb / 8; + uint64_t surface_size_B = + (uint64_t) surface->surf.row_pitch_B * (surface->buffer_rows - 1) + + surface->surf.logical_level0_px.w * element_size_B; + + isl_buffer_fill_state(isl_dev, state, + .address = + batch->blorp->get_surface_address(batch, buffer_addr), + .size_B = surface_size_B, + .stride_B = element_size_B, + .format = surface->view.format, + .swizzle = surface->view.swizzle, + .mocs = surface->addr.mocs, + .usage = surface->surf.usage | surface->view.usage); + + blorp_surface_reloc(batch, state_offset + isl_dev->ss.addr_offset, + buffer_addr, 0); +} + static void blorp_emit_null_surface_state(struct blorp_batch *batch, const struct blorp_surface_info *surface, @@ -1295,8 +1330,8 @@ blorp_emit_null_surface_state(struct blorp_batch *batch, struct GENX(RENDER_SURFACE_STATE) ss = { .SurfaceType = SURFTYPE_NULL, .SurfaceFormat = ISL_FORMAT_R8G8B8A8_UNORM, - .Width = surface->surf.logical_level0_px.width - 1, - .Height = surface->surf.logical_level0_px.height - 1, + .Width = MAX2(surface->surf.logical_level0_px.width, 1) - 1, + .Height = MAX2(surface->surf.logical_level0_px.height, 1) - 1, .MIPCountLOD = surface->view.base_level, .MinimumArrayElement = surface->view.base_array_layer, .Depth = surface->view.array_len - 1, @@ -1329,7 +1364,7 @@ blorp_setup_binding_table(struct blorp_batch *batch, if (params->use_pre_baked_binding_table) { bind_offset = params->pre_baked_binding_table_offset; } else { 
- unsigned num_surfaces = 1 + params->src.enabled; + unsigned num_surfaces = 1 + params->src.enabled + params->src.buffer; if (!blorp_alloc_binding_table(batch, num_surfaces, isl_dev->ss.size, isl_dev->ss.align, &bind_offset, surface_offsets, surface_maps)) @@ -1350,11 +1385,23 @@ blorp_setup_binding_table(struct blorp_batch *batch, } if (params->src.enabled) { - blorp_emit_surface_state(batch, &params->src, - params->fast_clear_op, - surface_maps[BLORP_TEXTURE_BT_INDEX], - surface_offsets[BLORP_TEXTURE_BT_INDEX], - 0, false); + if (params->src.surf.size_B != 0) { + blorp_emit_surface_state(batch, &params->src, + params->fast_clear_op, + surface_maps[BLORP_TEXTURE_BT_INDEX], + surface_offsets[BLORP_TEXTURE_BT_INDEX], + 0, false); + } else { + /* Nothing to do, the entire surface got converted to a buffer */ + blorp_emit_null_surface_state(batch, &params->src, + surface_maps[BLORP_TEXTURE_BT_INDEX]); + } + + if (params->src.buffer) { + blorp_emit_buffer_surface_state(batch, &params->src, + surface_maps[BLORP_TEXBUF_BT_INDEX], + surface_offsets[BLORP_TEXBUF_BT_INDEX]); + } } } diff --git a/src/intel/blorp/blorp_genX_exec_elk.h b/src/intel/blorp/blorp_genX_exec_elk.h index dd1a49163ea..fcc6f5aa147 100644 --- a/src/intel/blorp/blorp_genX_exec_elk.h +++ b/src/intel/blorp/blorp_genX_exec_elk.h @@ -107,10 +107,6 @@ static void blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset, struct blorp_address address, uint32_t delta); -static uint64_t -blorp_get_surface_address(struct blorp_batch *batch, - struct blorp_address address); - #if GFX_VER >= 7 static struct blorp_address blorp_get_surface_base_address(struct blorp_batch *batch); @@ -1445,6 +1441,7 @@ blorp_emit_surface_state(struct blorp_batch *batch, uint8_t color_write_disable, bool is_render_target) { + assert(!surface->buffer); const struct isl_device *isl_dev = batch->blorp->isl_dev; struct isl_surf surf = surface->surf; @@ -1483,9 +1480,9 @@ blorp_emit_surface_state(struct blorp_batch *batch, .surf = &surf, .view = 
&surface->view, .aux_surf = &surface->aux_surf, .aux_usage = aux_usage, .address = - blorp_get_surface_address(batch, surface->addr), + batch->blorp->get_surface_address(batch, surface->addr), .aux_address = !use_aux_address ? 0 : - blorp_get_surface_address(batch, surface->aux_addr), + batch->blorp->get_surface_address(batch, surface->aux_addr), .mocs = surface->addr.mocs, .clear_color = surface->clear_color, .write_disables = write_disable_mask); diff --git a/src/intel/blorp/blorp_priv.h b/src/intel/blorp/blorp_priv.h index 5155f995444..cc77cd770bd 100644 --- a/src/intel/blorp/blorp_priv.h +++ b/src/intel/blorp/blorp_priv.h @@ -75,6 +75,7 @@ struct blorp_compiler { enum { BLORP_RENDERBUFFER_BT_INDEX, BLORP_TEXTURE_BT_INDEX, + BLORP_TEXBUF_BT_INDEX, BLORP_NUM_BT_ENTRIES }; @@ -84,9 +85,16 @@ struct blorp_surface_info { bool enabled; + /* Should we unpack the last few rows using a texel buffer? */ + bool buffer; + uint32_t buffer_rows; + struct isl_surf surf; struct blorp_address addr; + /* Inferred page boundaries of the surface address */ + uint64_t page_base, page_limit; + struct isl_surf aux_surf; struct blorp_address aux_addr; enum isl_aux_usage aux_usage; @@ -182,6 +190,9 @@ struct blorp_wm_inputs_blit /* (1/width, 1/height) for the source surface */ float src_inv_size[2]; + uint32_t src_buffer_first_row; + uint32_t src_buffer_row_pitch; + /* Minimum layer setting works for all the textures types but texture_3d * for which the setting has no effect. Use the z-coordinate instead. */ @@ -204,7 +215,7 @@ struct blorp_wm_inputs /* Note: Pad out to an integral number of registers when extending, but * make sure subgroup_id is the last 32-bit item. */ - uint32_t pad[4]; + uint32_t pad[2]; uint32_t subgroup_id; }; @@ -434,6 +445,12 @@ struct blorp_blit_prog_key */ bool need_src_offset; + /* True if this blit operation is unpacking the last few rows of the 2D image + * from a 1D buffer. 
This is part of a workaround for performing buffer-to-image + * copies when the source is straddling an extra page due to a misaligned cache. + */ + bool need_src_buffer; + /* True if this blit operation may involve intratile offsets on the * destination. In this case, we need to add the offset to gl_FragCoord. */ @@ -580,6 +597,20 @@ blorp_op_type_is_clear(enum blorp_op op) } } +/* Asserts unless the surface is a buffer to image copy */ +#define blorp_assert_is_buffer(surf, view) \ + do { \ + assert((surf).dim == ISL_SURF_DIM_2D); \ + assert((surf).tiling == ISL_TILING_LINEAR); \ + assert((surf).logical_level0_px.d == 1); \ + assert((surf).logical_level0_px.array_len == 1); \ + assert((surf).samples == 1); \ + assert((surf).levels == 1); \ + UNUSED const struct isl_format_layout *fmtl = \ + isl_format_get_layout((view).format); \ + assert((surf).row_pitch_B % (fmtl->bpb / 8) == 0); \ + } while (false) + /** \} */ #ifdef __cplusplus diff --git a/src/intel/vulkan/anv_blorp.c b/src/intel/vulkan/anv_blorp.c index b8f1d2f4ef4..61aca743509 100644 --- a/src/intel/vulkan/anv_blorp.c +++ b/src/intel/vulkan/anv_blorp.c @@ -110,6 +110,17 @@ get_fp64_nir(struct blorp_context *context) return device->fp64_nir; } +static uint64_t +blorp_get_surface_address(struct blorp_batch *blorp_batch, + struct blorp_address address) +{ + struct anv_address anv_addr = { + .bo = address.buffer, + .offset = address.offset, + }; + return anv_address_physical(anv_addr); +} + void anv_device_init_blorp(struct anv_device *device) { @@ -126,6 +137,7 @@ anv_device_init_blorp(struct anv_device *device) device->blorp.context.lookup_shader = lookup_blorp_shader; device->blorp.context.upload_shader = upload_blorp_shader; device->blorp.context.enable_tbimr = device->physical->instance->enable_tbimr; + device->blorp.context.get_surface_address = blorp_get_surface_address; device->blorp.context.exec = anv_genX(device->info, blorp_exec); device->blorp.context.upload_dynamic_state = upload_dynamic_state; @@ 
-853,7 +865,7 @@ void anv_CmdCopyBufferToImage2( anv_cmd_require_rcs(cmd_buffer, blorp_execute_on_companion) { struct blorp_batch batch; - anv_blorp_batch_init(cmd_buffer, &batch, 0); + anv_blorp_batch_init(cmd_buffer, &batch, BLORP_BATCH_SRC_UNPADDED); for (unsigned r = 0; r < pCopyBufferToImageInfo->regionCount; r++) { const VkBufferImageCopy2 *region = &pCopyBufferToImageInfo->pRegions[r]; @@ -1183,9 +1195,10 @@ anv_cmd_copy_addr(struct anv_cmd_buffer *cmd_buffer, struct blorp_batch batch; anv_blorp_batch_init(cmd_buffer, &batch, - cmd_buffer->state.current_pipeline == - cmd_buffer->device->physical->gpgpu_pipeline_value ? - BLORP_BATCH_USE_COMPUTE : 0); + BLORP_BATCH_SRC_UNPADDED | + (cmd_buffer->state.current_pipeline == + cmd_buffer->device->physical->gpgpu_pipeline_value ? + BLORP_BATCH_USE_COMPUTE : 0)); copy_memory(device, &batch, src_addr, dst_addr, size); @@ -1203,9 +1216,10 @@ void anv_CmdCopyBuffer2( struct blorp_batch batch; anv_blorp_batch_init(cmd_buffer, &batch, - cmd_buffer->state.current_pipeline == - cmd_buffer->device->physical->gpgpu_pipeline_value ? - BLORP_BATCH_USE_COMPUTE : 0); + BLORP_BATCH_SRC_UNPADDED | + (cmd_buffer->state.current_pipeline == + cmd_buffer->device->physical->gpgpu_pipeline_value ? + BLORP_BATCH_USE_COMPUTE : 0)); for (unsigned r = 0; r < pCopyBufferInfo->regionCount; r++) { const VkBufferCopy2 *region = &pCopyBufferInfo->pRegions[r]; @@ -1230,9 +1244,10 @@ anv_cmd_buffer_update_addr( { struct blorp_batch batch; anv_blorp_batch_init(cmd_buffer, &batch, - cmd_buffer->state.current_pipeline == - cmd_buffer->device->physical->gpgpu_pipeline_value ? - BLORP_BATCH_USE_COMPUTE : 0); + BLORP_BATCH_SRC_UNPADDED | + (cmd_buffer->state.current_pipeline == + cmd_buffer->device->physical->gpgpu_pipeline_value ? + BLORP_BATCH_USE_COMPUTE : 0)); /* We can't quite grab a full block because the state stream needs a * little data at the top to build its linked list. 
diff --git a/src/intel/vulkan/genX_blorp_exec.c b/src/intel/vulkan/genX_blorp_exec.c index f6f212479f5..b0f59e6edad 100644 --- a/src/intel/vulkan/genX_blorp_exec.c +++ b/src/intel/vulkan/genX_blorp_exec.c @@ -95,17 +95,6 @@ blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset, anv_batch_set_error(&cmd_buffer->batch, result); } -static uint64_t -blorp_get_surface_address(struct blorp_batch *blorp_batch, - struct blorp_address address) -{ - struct anv_address anv_addr = { - .bo = address.buffer, - .offset = address.offset, - }; - return anv_address_physical(anv_addr); -} - #if GFX_VER == 9 static struct blorp_address blorp_get_surface_base_address(struct blorp_batch *batch) diff --git a/src/intel/vulkan_hasvk/anv_blorp.c b/src/intel/vulkan_hasvk/anv_blorp.c index 13ab2f65e8e..e79897b7e9e 100644 --- a/src/intel/vulkan_hasvk/anv_blorp.c +++ b/src/intel/vulkan_hasvk/anv_blorp.c @@ -84,6 +84,14 @@ upload_blorp_shader(struct blorp_batch *batch, uint32_t stage, return true; } +static uint64_t +blorp_get_surface_address(struct blorp_batch *blorp_batch, + struct blorp_address address) +{ + /* We'll let blorp_surface_reloc write the address. 
*/ + return 0; +} + void anv_device_init_blorp(struct anv_device *device) { @@ -93,6 +101,7 @@ anv_device_init_blorp(struct anv_device *device) device->physical->compiler, &config); device->blorp.lookup_shader = lookup_blorp_shader; device->blorp.upload_shader = upload_blorp_shader; + device->blorp.get_surface_address = blorp_get_surface_address; switch (device->info->verx10) { case 70: device->blorp.exec = gfx7_blorp_exec; diff --git a/src/intel/vulkan_hasvk/genX_blorp_exec.c b/src/intel/vulkan_hasvk/genX_blorp_exec.c index 020236b54f0..0a8f431e01f 100644 --- a/src/intel/vulkan_hasvk/genX_blorp_exec.c +++ b/src/intel/vulkan_hasvk/genX_blorp_exec.c @@ -100,14 +100,6 @@ blorp_surface_reloc(struct blorp_batch *batch, uint32_t ss_offset, write_reloc(cmd_buffer->device, dest, address_u64, false); } -static uint64_t -blorp_get_surface_address(struct blorp_batch *blorp_batch, - struct blorp_address address) -{ - /* We'll let blorp_surface_reloc write the address. */ - return 0; -} - static struct blorp_address blorp_get_surface_base_address(struct blorp_batch *batch) {