From cdb2e3d2b51419488287eb4284c879972b2cfcc2 Mon Sep 17 00:00:00 2001
From: Konstantin Seurer
Date: Sat, 28 Sep 2024 10:14:16 +0200
Subject: [PATCH] lavapipe: Prefetch 56 bytes of node data during ray traversal

Almost all node types need around 56 bytes of data. This patch fetches
this data in a less divergent block.

Reviewed-by: Mike Blumenkrantz
Part-of:
---
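Note: before the diffstat, a minimal CPU-side sketch of the access
pattern this patch switches to; it is illustrative only and not
lavapipe code. The toy_box_node type and the other names here are
hypothetical, and the layout assumes vk_aabb is two 12-byte float3s,
which makes the box node exactly 56 bytes.

    #include <inttypes.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define NODE_PREFETCH_SIZE 56

    /* Toy stand-in for lvp_bvh_box_node: two 24-byte AABBs followed by
     * two child indices, 56 bytes in total. */
    struct toy_box_node {
       float bounds[2][6];
       uint32_t children[2];
    };

    int main(void)
    {
       struct toy_box_node node = {.children = {7, 9}};
       uint32_t data[NODE_PREFETCH_SIZE / 4];

       /* The type-independent block fetch: done once per visited node,
        * before branching on the node type, so all lanes issue the same
        * loads. */
       memcpy(data, &node, sizeof(data));

       /* Type-specific code then only indexes dwords: children[] starts
        * at byte offset 48, i.e. dwords 12 and 13. */
       printf("children = %" PRIu32 ", %" PRIu32 "\n", data[12], data[13]);
       return 0;
    }

The loads themselves no longer depend on the node type; only the cheap
dword indexing diverges between the type-specific branches.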
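A second sketch, modeling on the CPU the lookup rule that the new
lvp_load_node_data helper implements in NIR: offsets inside the 56-byte
prefetch window are served from the cached dwords, and any other offset
(or a NULL cache, which is what the call sites outside the traversal
loop pass) falls back to a plain load from node memory. The name
load_node_dword and the little test in main are hypothetical.

    #include <stdint.h>
    #include <string.h>

    #define NODE_PREFETCH_SIZE 56

    static uint32_t
    load_node_dword(const uint8_t *addr, const uint32_t *node_data,
                    uint32_t offset)
    {
       /* Served from the prefetched block whenever possible... */
       if (offset < NODE_PREFETCH_SIZE && node_data)
          return node_data[offset / 4];

       /* ...otherwise fall back to a regular load at addr + offset. */
       uint32_t value;
       memcpy(&value, addr + offset, sizeof(value));
       return value;
    }

    int main(void)
    {
       uint8_t node[64] = {0};
       uint32_t past_window = 42;
       memcpy(node + 56, &past_window, sizeof(past_window));

       uint32_t cached[NODE_PREFETCH_SIZE / 4] = {[3] = 7};

       /* Offset 12 hits the cache; offset 56 takes the fallback path. */
       return load_node_dword(node, cached, 12) == 7 &&
              load_node_dword(node, cached, 56) == 42 ? 0 : 1;
    }

This also shows why lvp_load_wto_matrix in the diff below takes both
the node address and the prefetched dwords: the world-to-object matrix
starts inside the window, but its tail crosses the 56-byte boundary and
needs the fallback loads.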
 .../lavapipe/lvp_acceleration_structure.c     |   2 +
 .../lavapipe/lvp_acceleration_structure.h     |   6 ++
 .../lavapipe/lvp_ray_tracing_pipeline.c       |   6 +-
 src/gallium/frontends/lavapipe/nir/lvp_nir.h  |   2 +-
 .../lavapipe/nir/lvp_nir_lower_ray_queries.c  |   6 +-
 .../lavapipe/nir/lvp_nir_ray_tracing.c        | 102 ++++++++++++------
 6 files changed, 82 insertions(+), 42 deletions(-)

diff --git a/src/gallium/frontends/lavapipe/lvp_acceleration_structure.c b/src/gallium/frontends/lavapipe/lvp_acceleration_structure.c
index 86ff7b9e72a..9e3602f4546 100644
--- a/src/gallium/frontends/lavapipe/lvp_acceleration_structure.c
+++ b/src/gallium/frontends/lavapipe/lvp_acceleration_structure.c
@@ -640,6 +640,8 @@ lvp_get_as_size(VkDevice device,
 
    nodes_size += leaf_count * output_leaf_node_size;
 
+   nodes_size = util_align_npot(nodes_size, LVP_BVH_NODE_PREFETCH_SIZE);
+
    return sizeof(struct lvp_bvh_header) + nodes_size;
 }
 
diff --git a/src/gallium/frontends/lavapipe/lvp_acceleration_structure.h b/src/gallium/frontends/lavapipe/lvp_acceleration_structure.h
index ff260bb1127..40808297d3c 100644
--- a/src/gallium/frontends/lavapipe/lvp_acceleration_structure.h
+++ b/src/gallium/frontends/lavapipe/lvp_acceleration_structure.h
@@ -23,6 +23,7 @@
 #define lvp_bvh_node_instance 2
 #define lvp_bvh_node_aabb 3
 
+/* 48 bytes */
 struct lvp_bvh_triangle_node {
    float coords[3][3];
 
@@ -33,6 +34,7 @@ struct lvp_bvh_triangle_node {
    uint32_t geometry_id_and_flags;
 };
 
+/* 32 bytes */
 struct lvp_bvh_aabb_node {
    vk_aabb bounds;
 
@@ -41,6 +43,7 @@ struct lvp_bvh_aabb_node {
    uint32_t geometry_id_and_flags;
 };
 
+/* 120 bytes */
 struct lvp_bvh_instance_node {
    uint64_t bvh_ptr;
 
@@ -58,11 +61,14 @@ struct lvp_bvh_instance_node {
    mat3x4 otw_matrix;
 };
 
+/* 56 bytes */
 struct lvp_bvh_box_node {
    vk_aabb bounds[2];
    uint32_t children[2];
 };
 
+#define LVP_BVH_NODE_PREFETCH_SIZE 56
+
 struct lvp_bvh_header {
    vk_aabb bounds;
 
diff --git a/src/gallium/frontends/lavapipe/lvp_ray_tracing_pipeline.c b/src/gallium/frontends/lavapipe/lvp_ray_tracing_pipeline.c
index 56d1485582b..65452042cf9 100644
--- a/src/gallium/frontends/lavapipe/lvp_ray_tracing_pipeline.c
+++ b/src/gallium/frontends/lavapipe/lvp_ray_tracing_pipeline.c
@@ -933,7 +933,7 @@ lvp_lower_ray_tracing_instr(nir_builder *b, nir_instr *instr, void *data)
       unsigned c = nir_intrinsic_column(intr);
       nir_def *instance_node_addr = nir_load_var(b, state->instance_addr);
       nir_def *wto_matrix[3];
-      lvp_load_wto_matrix(b, instance_node_addr, wto_matrix);
+      lvp_load_wto_matrix(b, instance_node_addr, NULL, wto_matrix);
 
       nir_def *vals[3];
       for (unsigned i = 0; i < 3; ++i)
@@ -956,14 +956,14 @@ lvp_lower_ray_tracing_instr(nir_builder *b, nir_instr *instr, void *data)
    case nir_intrinsic_load_ray_object_origin: {
       nir_def *instance_node_addr = nir_load_var(b, state->instance_addr);
       nir_def *wto_matrix[3];
-      lvp_load_wto_matrix(b, instance_node_addr, wto_matrix);
+      lvp_load_wto_matrix(b, instance_node_addr, NULL, wto_matrix);
       def = lvp_mul_vec3_mat(b, nir_load_var(b, state->origin), wto_matrix, true);
       break;
    }
    case nir_intrinsic_load_ray_object_direction: {
       nir_def *instance_node_addr = nir_load_var(b, state->instance_addr);
       nir_def *wto_matrix[3];
-      lvp_load_wto_matrix(b, instance_node_addr, wto_matrix);
+      lvp_load_wto_matrix(b, instance_node_addr, NULL, wto_matrix);
       def = lvp_mul_vec3_mat(b, nir_load_var(b, state->dir), wto_matrix, false);
       break;
    }
diff --git a/src/gallium/frontends/lavapipe/nir/lvp_nir.h b/src/gallium/frontends/lavapipe/nir/lvp_nir.h
index a1f685c5c09..20910879e2d 100644
--- a/src/gallium/frontends/lavapipe/nir/lvp_nir.h
+++ b/src/gallium/frontends/lavapipe/nir/lvp_nir.h
@@ -12,7 +12,7 @@
 nir_def *lvp_mul_vec3_mat(nir_builder *b, nir_def *vec, nir_def *matrix[], bool translation);
 
-void lvp_load_wto_matrix(nir_builder *b, nir_def *instance_addr, nir_def **out);
+void lvp_load_wto_matrix(nir_builder *b, nir_def *instance_addr, nir_def **node_data, nir_def **out);
 
 nir_def *lvp_load_vertex_position(nir_builder *b, nir_def *instance_addr, nir_def *primitive_id,
                                   uint32_t index);
 
diff --git a/src/gallium/frontends/lavapipe/nir/lvp_nir_lower_ray_queries.c b/src/gallium/frontends/lavapipe/nir/lvp_nir_lower_ray_queries.c
index bb89e0dd06b..2ad9f41aeba 100644
--- a/src/gallium/frontends/lavapipe/nir/lvp_nir_lower_ray_queries.c
+++ b/src/gallium/frontends/lavapipe/nir/lvp_nir_lower_ray_queries.c
@@ -379,13 +379,13 @@ lower_rq_load(nir_builder *b, nir_def *index, nir_intrinsic_instr *instr,
    case nir_ray_query_value_intersection_object_ray_direction: {
       nir_def *instance_node_addr = rq_load_var(b, index, intersection->instance_addr);
       nir_def *wto_matrix[3];
-      lvp_load_wto_matrix(b, instance_node_addr, wto_matrix);
+      lvp_load_wto_matrix(b, instance_node_addr, NULL, wto_matrix);
       return lvp_mul_vec3_mat(b, rq_load_var(b, index, vars->direction), wto_matrix, false);
    }
    case nir_ray_query_value_intersection_object_ray_origin: {
      nir_def *instance_node_addr = rq_load_var(b, index, intersection->instance_addr);
      nir_def *wto_matrix[3];
-      lvp_load_wto_matrix(b, instance_node_addr, wto_matrix);
+      lvp_load_wto_matrix(b, instance_node_addr, NULL, wto_matrix);
      return lvp_mul_vec3_mat(b, rq_load_var(b, index, vars->origin), wto_matrix, true);
    }
    case nir_ray_query_value_intersection_object_to_world: {
@@ -415,7 +415,7 @@ lower_rq_load(nir_builder *b, nir_def *index, nir_intrinsic_instr *instr,
 
       nir_def *instance_node_addr = rq_load_var(b, index, intersection->instance_addr);
       nir_def *wto_matrix[3];
-      lvp_load_wto_matrix(b, instance_node_addr, wto_matrix);
+      lvp_load_wto_matrix(b, instance_node_addr, NULL, wto_matrix);
 
       nir_def *vals[3];
       for (unsigned i = 0; i < 3; ++i)
diff --git a/src/gallium/frontends/lavapipe/nir/lvp_nir_ray_tracing.c b/src/gallium/frontends/lavapipe/nir/lvp_nir_ray_tracing.c
index 3118674e36c..5f57ac2cf81 100644
--- a/src/gallium/frontends/lavapipe/nir/lvp_nir_ray_tracing.c
+++ b/src/gallium/frontends/lavapipe/nir/lvp_nir_ray_tracing.c
@@ -31,12 +31,27 @@ lvp_mul_vec3_mat(nir_builder *b, nir_def *vec, nir_def *matrix[], bool translati
    return nir_vec(b, result_components, 3);
 }
 
+static nir_def *
+lvp_load_node_data(nir_builder *b, nir_def *addr, nir_def **node_data, uint32_t offset)
+{
+   if (offset < LVP_BVH_NODE_PREFETCH_SIZE && node_data)
+      return node_data[offset / 4];
+
+   return nir_build_load_global(b, 1, 32, nir_iadd_imm(b, addr, offset));
+}
+
 void
-lvp_load_wto_matrix(nir_builder *b, nir_def *instance_addr, nir_def **out)
+lvp_load_wto_matrix(nir_builder *b, nir_def *instance_addr, nir_def **node_data,
+                    nir_def **out)
 {
    unsigned offset = offsetof(struct lvp_bvh_instance_node, wto_matrix);
    for (unsigned i = 0; i < 3; ++i) {
-      out[i] = nir_build_load_global(b, 4, 32, nir_iadd_imm(b, instance_addr, offset + i * 16));
+      out[i] = nir_vec4(b,
+         lvp_load_node_data(b, instance_addr, node_data, offset + i * 16 + 0),
+         lvp_load_node_data(b, instance_addr, node_data, offset + i * 16 + 4),
+         lvp_load_node_data(b, instance_addr, node_data, offset + i * 16 + 8),
+         lvp_load_node_data(b, instance_addr, node_data, offset + i * 16 + 12)
+      );
    }
 }
 
@@ -58,7 +73,7 @@ lvp_load_vertex_position(nir_builder *b, nir_def *instance_addr, nir_def *primit
 }
 
 static nir_def *
-lvp_build_intersect_ray_box(nir_builder *b, nir_def *node_addr, nir_def *ray_tmax,
+lvp_build_intersect_ray_box(nir_builder *b, nir_def **node_data, nir_def *ray_tmax,
                             nir_def *origin, nir_def *dir, nir_def *inv_dir)
 {
    const struct glsl_type *vec2_type = glsl_vector_type(GLSL_TYPE_FLOAT, 2);
@@ -81,12 +96,19 @@ lvp_build_intersect_ray_box(nir_builder *b, nir_def *node_addr, nir_def *ray_tma
       offsetof(struct lvp_bvh_box_node, bounds[i].max.x),
    };
 
-   nir_def *child_index =
-      nir_build_load_global(b, 1, 32, nir_iadd_imm(b, node_addr, child_offset));
+   nir_def *child_index = lvp_load_node_data(b, NULL, node_data, child_offset);
 
    nir_def *node_coords[2] = {
-      nir_build_load_global(b, 3, 32, nir_iadd_imm(b, node_addr, coord_offsets[0])),
-      nir_build_load_global(b, 3, 32, nir_iadd_imm(b, node_addr, coord_offsets[1])),
+      nir_vec3(b,
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[0] + 0),
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[0] + 4),
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[0] + 8)
+      ),
+      nir_vec3(b,
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[1] + 0),
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[1] + 4),
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[1] + 8)
+      ),
    };
 
    /* If x of the aabb min is NaN, then this is an inactive aabb.
@@ -140,7 +162,7 @@ lvp_build_intersect_ray_box(nir_builder *b, nir_def *node_addr, nir_def *ray_tma
 }
 
 static nir_def *
-lvp_build_intersect_ray_tri(nir_builder *b, nir_def *node_addr, nir_def *ray_tmax,
+lvp_build_intersect_ray_tri(nir_builder *b, nir_def **node_data, nir_def *ray_tmax,
                             nir_def *origin, nir_def *dir, nir_def *inv_dir)
 {
    const struct glsl_type *vec4_type = glsl_vector_type(GLSL_TYPE_FLOAT, 4);
@@ -152,9 +174,21 @@ lvp_build_intersect_ray_tri(nir_builder *b, nir_def *node_addr, nir_def *ray_tma
    };
 
    nir_def *node_coords[3] = {
-      nir_build_load_global(b, 3, 32, nir_iadd_imm(b, node_addr, coord_offsets[0])),
-      nir_build_load_global(b, 3, 32, nir_iadd_imm(b, node_addr, coord_offsets[1])),
-      nir_build_load_global(b, 3, 32, nir_iadd_imm(b, node_addr, coord_offsets[2])),
+      nir_vec3(b,
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[0] + 0),
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[0] + 4),
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[0] + 8)
+      ),
+      nir_vec3(b,
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[1] + 0),
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[1] + 4),
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[1] + 8)
+      ),
+      nir_vec3(b,
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[2] + 0),
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[2] + 4),
+         lvp_load_node_data(b, NULL, node_data, coord_offsets[2] + 8)
+      ),
    };
 
    nir_variable *result = nir_variable_create(b->shader, nir_var_shader_temp, vec4_type, "result");
@@ -286,7 +320,7 @@ lvp_build_hit_is_opaque(nir_builder *b, nir_def *sbt_offset_and_flags,
 static void
 lvp_build_triangle_case(nir_builder *b, const struct lvp_ray_traversal_args *args,
                         const struct lvp_ray_flags *ray_flags, nir_def *result,
-                        nir_def *node_addr)
+                        nir_def *node_addr, nir_def **node_data)
 {
    if (!args->triangle_cb)
       return;
@@ -315,12 +349,10 @@ lvp_build_triangle_case(nir_builder *b, const struct lvp_ray_traversal_args *arg
    nir_push_if(b, nir_iand(b, nir_flt(b, args->tmin, intersection.t), not_cull));
    {
       intersection.base.node_addr = node_addr;
-      nir_def *triangle_info = nir_build_load_global(
-         b, 2, 32,
-         nir_iadd_imm(b, intersection.base.node_addr,
-                      offsetof(struct lvp_bvh_triangle_node, primitive_id)));
-      intersection.base.primitive_id = nir_channel(b, triangle_info, 0);
-      intersection.base.geometry_id_and_flags = nir_channel(b, triangle_info, 1);
+      intersection.base.primitive_id =
+         lvp_load_node_data(b, node_addr, node_data, offsetof(struct lvp_bvh_triangle_node, primitive_id));
+      intersection.base.geometry_id_and_flags =
+         lvp_load_node_data(b, node_addr, node_data, offsetof(struct lvp_bvh_triangle_node, geometry_id_and_flags));
       intersection.base.opaque =
          lvp_build_hit_is_opaque(b, nir_load_deref(b, args->vars.sbt_offset_and_flags),
                                  ray_flags, intersection.base.geometry_id_and_flags);
@@ -340,18 +372,18 @@ lvp_build_triangle_case(nir_builder *b, const struct lvp_ray_traversal_args *arg
 
 static void
 lvp_build_aabb_case(nir_builder *b, const struct lvp_ray_traversal_args *args,
-                    const struct lvp_ray_flags *ray_flags, nir_def *node_addr)
+                    const struct lvp_ray_flags *ray_flags, nir_def *node_addr,
+                    nir_def **node_data)
 {
    if (!args->aabb_cb)
       return;
 
    struct lvp_leaf_intersection intersection;
    intersection.node_addr = node_addr;
-   nir_def *triangle_info = nir_build_load_global(
-      b, 2, 32,
-      nir_iadd_imm(b, intersection.node_addr, offsetof(struct lvp_bvh_aabb_node, primitive_id)));
-   intersection.primitive_id = nir_channel(b, triangle_info, 0);
-   intersection.geometry_id_and_flags = nir_channel(b, triangle_info, 1);
+   intersection.primitive_id =
+      lvp_load_node_data(b, node_addr, node_data, offsetof(struct lvp_bvh_aabb_node, primitive_id));
+   intersection.geometry_id_and_flags =
+      lvp_load_node_data(b, node_addr, node_data, offsetof(struct lvp_bvh_aabb_node, geometry_id_and_flags));
    intersection.opaque = lvp_build_hit_is_opaque(b, nir_load_deref(b, args->vars.sbt_offset_and_flags),
                                                  ray_flags, intersection.geometry_id_and_flags);
 
@@ -438,6 +470,10 @@ lvp_build_ray_traversal(nir_builder *b, const struct lvp_ray_traversal_args *arg
          nir_def *node_addr = nir_iadd(b, nir_load_deref(b, args->vars.bvh_base),
                                        nir_u2u64(b, nir_iand_imm(b, bvh_node, ~3u)));
 
+         nir_def *node_data[LVP_BVH_NODE_PREFETCH_SIZE / 4];
+         for (uint32_t i = 0; i < ARRAY_SIZE(node_data); i++)
+            node_data[i] = nir_build_load_global(b, 1, 32, nir_iadd_imm(b, node_addr, i * 4));
+
          nir_def *node_type = nir_iand_imm(b, bvh_node, 3);
          nir_push_if(b, nir_uge_imm(b, node_type, lvp_bvh_node_internal));
         {
@@ -445,24 +481,20 @@ lvp_build_ray_traversal(nir_builder *b, const struct lvp_ray_traversal_args *arg
             {
                nir_push_if(b, nir_ieq_imm(b, node_type, lvp_bvh_node_aabb));
                {
-                  lvp_build_aabb_case(b, args, &ray_flags, node_addr);
+                  lvp_build_aabb_case(b, args, &ray_flags, node_addr, node_data);
                }
                nir_push_else(b, NULL);
                {
                   /* instance */
                   nir_store_deref(b, args->vars.instance_addr, node_addr, 1);
 
-                  nir_def *instance_data = nir_build_load_global(
-                     b, 4, 32,
-                     nir_iadd_imm(b, node_addr, offsetof(struct lvp_bvh_instance_node, bvh_ptr)));
-
                   nir_def *wto_matrix[3];
-                  lvp_load_wto_matrix(b, node_addr, wto_matrix);
+                  lvp_load_wto_matrix(b, node_addr, node_data, wto_matrix);
 
-                  nir_store_deref(b, args->vars.sbt_offset_and_flags, nir_channel(b, instance_data, 3),
+                  nir_store_deref(b, args->vars.sbt_offset_and_flags, node_data[3],
                                   1);
 
-                  nir_def *instance_and_mask = nir_channel(b, instance_data, 2);
+                  nir_def *instance_and_mask = node_data[2];
                   nir_push_if(b, nir_ult(b, nir_iand(b, instance_and_mask, args->cull_mask),
                                          nir_imm_int(b, 1 << 24)));
                   {
@@ -471,7 +503,7 @@ lvp_build_ray_traversal(nir_builder *b, const struct lvp_ray_traversal_args *arg
                   nir_pop_if(b, NULL);
 
                   nir_store_deref(b, args->vars.bvh_base,
-                                  nir_pack_64_2x32(b, nir_trim_vector(b, instance_data, 2)), 1);
+                                  nir_pack_64_2x32_split(b, node_data[0], node_data[1]), 1);
 
                   nir_store_deref(b, args->vars.stack_base, nir_load_deref(b, args->vars.stack_ptr),
                                   0x1);
@@ -491,7 +523,7 @@ lvp_build_ray_traversal(nir_builder *b, const struct lvp_ray_traversal_args *arg
             nir_push_else(b, NULL);
             {
                nir_def *result = lvp_build_intersect_ray_box(
-                  b, node_addr, nir_load_deref(b, args->vars.tmax),
+                  b, node_data, nir_load_deref(b, args->vars.tmax),
                   nir_load_deref(b, args->vars.origin), nir_load_deref(b, args->vars.dir),
                   nir_load_deref(b, args->vars.inv_dir));
 
@@ -508,10 +540,10 @@ lvp_build_ray_traversal(nir_builder *b, const struct lvp_ray_traversal_args *arg
       nir_push_else(b, NULL);
       {
          nir_def *result = lvp_build_intersect_ray_tri(
-            b, node_addr, nir_load_deref(b, args->vars.tmax), nir_load_deref(b, args->vars.origin),
+            b, node_data, nir_load_deref(b, args->vars.tmax), nir_load_deref(b, args->vars.origin),
             nir_load_deref(b, args->vars.dir), nir_load_deref(b, args->vars.inv_dir));
 
-         lvp_build_triangle_case(b, args, &ray_flags, result, node_addr);
+         lvp_build_triangle_case(b, args, &ray_flags, result, node_addr, node_data);
       }
       nir_pop_if(b, NULL);
    }
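For reference, a sketch of how the traversal loop consumes the
prefetched dwords of an instance node. The dword positions follow from
the lvp_bvh_instance_node layout above (uint64_t bvh_ptr, then the two
packed uint32_t fields, then wto_matrix at byte offset 16); the enum
itself is hypothetical and only documents the indexing.

    /* node_data[i] holds dword i of the node, i.e. byte offset 4 * i. */
    enum instance_node_dword {
       IND_BVH_PTR_LO = 0,               /* nir_pack_64_2x32_split(b, [0], [1]) */
       IND_BVH_PTR_HI = 1,
       IND_CUSTOM_INSTANCE_AND_MASK = 2, /* tested against args->cull_mask */
       IND_SBT_OFFSET_AND_FLAGS = 3,     /* stored to vars.sbt_offset_and_flags */
       IND_WTO_MATRIX_FIRST = 4          /* dwords 4..13 cover 40 of the 48
                                            matrix bytes; the last two dwords
                                            fall back to global loads */
    };

Only the 56-byte box node fits the prefetch window exactly; the
120-byte instance node is the one type that still needs fallback loads,
which is why lvp_load_wto_matrix keeps the node address around.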