radeonsi/gfx11: use SET_SH_REG_PAIRS_PACKED for gfx by buffering reg writes

Instead of writing SH registers into the command buffer, push them into
an array in si_context. Before a draw, take all buffered register writes
and create a single SET_SH_REG_PAIRS_PACKED packet for them.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23517>
Marek Olšák 2023-06-11 18:37:26 -04:00, committed by Marge Bot
parent a6e6646d91
commit 1753b321f8
7 changed files with 250 additions and 67 deletions
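
For orientation, a minimal standalone sketch of the scheme this commit introduces (a simplified model, not driver code; the struct mirrors si_sh_reg_pair from the diff below, but all names and values here are invented):

#include <stdint.h>
#include <stdio.h>

/* Register writes accumulate as (offset, value) pairs and are flushed
 * together; the driver does this in si_context::buffered_gfx_sh_regs
 * and gfx11_emit_buffered_sh_regs_inline(). */
struct reg_pair {
   uint16_t reg_offset[2]; /* SH-relative dword offsets */
   uint32_t reg_value[2];
};

static struct reg_pair buffered[32];
static unsigned num_buffered;

static void push_sh_reg(uint16_t offset, uint32_t value)
{
   unsigned i = num_buffered++; /* the real macro asserts i / 2 < 32 */
   buffered[i / 2].reg_offset[i % 2] = offset;
   buffered[i / 2].reg_value[i % 2] = value;
}

static void flush_before_draw(void)
{
   /* One SET_SH_REG_PAIRS_PACKED* packet replaces many SET_SH_REG packets. */
   printf("emit one packed packet with %u registers\n", num_buffered);
   num_buffered = 0;
}

int main(void)
{
   push_sh_reg(0x40, 0x11111111); /* invented offset/value */
   push_sh_reg(0x55, 0x22222222);
   flush_before_draw();
   return 0;
}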

src/gallium/drivers/radeonsi/si_build_pm4.h

@@ -118,6 +118,32 @@
radeon_emit(value); \
} while (0)
#define radeon_push_gfx_sh_reg(reg, value) do { \
unsigned __i = sctx->num_buffered_gfx_sh_regs++; \
assert(__i / 2 < ARRAY_SIZE(sctx->buffered_gfx_sh_regs)); \
sctx->buffered_gfx_sh_regs[__i / 2].reg_offset[__i % 2] = ((reg) - SI_SH_REG_OFFSET) >> 2; \
sctx->buffered_gfx_sh_regs[__i / 2].reg_value[__i % 2] = value; \
} while (0)
#define radeon_set_or_push_gfx_sh_reg(reg, value) do { \
if (GFX_VERSION >= GFX11) { \
radeon_push_gfx_sh_reg(reg, value); \
} else { \
radeon_set_sh_reg_seq(reg, 1); \
radeon_emit(value); \
} \
} while (0)
#define radeon_opt_push_gfx_sh_reg(offset, reg, val) do { \
unsigned __value = val; \
if (((sctx->tracked_regs.other_reg_saved_mask >> (reg)) & 0x1) != 0x1 || \
sctx->tracked_regs.other_reg_value[reg] != __value) { \
radeon_push_gfx_sh_reg(offset, __value); \
sctx->tracked_regs.other_reg_saved_mask |= BITFIELD64_BIT(reg); \
sctx->tracked_regs.other_reg_value[reg] = __value; \
} \
} while (0)
#define radeon_set_uconfig_reg_seq(reg, num, perfctr) do { \
assert((reg) >= CIK_UCONFIG_REG_OFFSET && (reg) < CIK_UCONFIG_REG_END); \
radeon_emit(PKT3(PKT3_SET_UCONFIG_REG, num, perfctr)); \
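
The three flavors above divide the work: radeon_push_gfx_sh_reg always buffers, radeon_set_or_push_gfx_sh_reg buffers on Gfx11+ and emits immediately otherwise, and radeon_opt_push_gfx_sh_reg additionally skips writes whose tracked value is unchanged. A minimal standalone sketch of that dedupe pattern (invented names; a simplified 64-entry shadow table):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static uint64_t saved_mask; /* bit i set: shadow[i] holds the last pushed value */
static uint32_t shadow[64];

static bool opt_push(unsigned tracked_id, uint32_t value)
{
   if ((saved_mask & (UINT64_C(1) << tracked_id)) && shadow[tracked_id] == value)
      return false; /* unchanged: skip the register write entirely */

   saved_mask |= UINT64_C(1) << tracked_id;
   shadow[tracked_id] = value;
   /* ...the real macro calls radeon_push_gfx_sh_reg(offset, value) here... */
   return true;
}

int main(void)
{
   printf("%d\n", opt_push(3, 0xABCD)); /* 1: first write goes through */
   printf("%d\n", opt_push(3, 0xABCD)); /* 0: same value, skipped */
   printf("%d\n", opt_push(3, 0xABCE)); /* 1: changed value, pushed */
   return 0;
}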

src/gallium/drivers/radeonsi/si_descriptors.c

@@ -2160,16 +2160,25 @@ void si_shader_change_notify(struct si_context *sctx)
if (sh_reg_base) { \
unsigned mask = sctx->shader_pointers_dirty & (pointer_mask); \
\
while (mask) { \
int start, count; \
u_bit_scan_consecutive_range(&mask, &start, &count); \
\
struct si_descriptors *descs = &sctx->descriptors[start]; \
unsigned sh_offset = sh_reg_base + descs->shader_userdata_offset; \
\
radeon_set_sh_reg_seq(sh_offset, count); \
for (int i = 0; i < count; i++) \
radeon_emit_32bit_pointer(sctx->screen, descs[i].gpu_address); \
if (sctx->gfx_level >= GFX11 && sh_reg_base != R_00B900_COMPUTE_USER_DATA_0) { \
u_foreach_bit(i, mask) { \
struct si_descriptors *descs = &sctx->descriptors[i]; \
unsigned sh_reg = sh_reg_base + descs->shader_userdata_offset; \
\
radeon_push_gfx_sh_reg(sh_reg, descs->gpu_address); \
} \
} else { \
while (mask) { \
int start, count; \
u_bit_scan_consecutive_range(&mask, &start, &count); \
\
struct si_descriptors *descs = &sctx->descriptors[start]; \
unsigned sh_offset = sh_reg_base + descs->shader_userdata_offset; \
\
radeon_set_sh_reg_seq(sh_offset, count); \
for (int i = 0; i < count; i++) \
radeon_emit_32bit_pointer(sctx->screen, descs[i].gpu_address); \
} \
} \
} \
} while (0)
@@ -2179,9 +2188,12 @@ static void si_emit_global_shader_pointers(struct si_context *sctx, struct si_de
radeon_begin(&sctx->gfx_cs);
if (sctx->gfx_level >= GFX11) {
radeon_emit_one_32bit_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
radeon_emit_one_32bit_pointer(sctx, descs, R_00B230_SPI_SHADER_USER_DATA_GS_0);
radeon_emit_one_32bit_pointer(sctx, descs, R_00B430_SPI_SHADER_USER_DATA_HS_0);
radeon_push_gfx_sh_reg(R_00B030_SPI_SHADER_USER_DATA_PS_0 + descs->shader_userdata_offset,
descs->gpu_address);
radeon_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + descs->shader_userdata_offset,
descs->gpu_address);
radeon_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 + descs->shader_userdata_offset,
descs->gpu_address);
} else if (sctx->gfx_level >= GFX10) {
radeon_emit_one_32bit_pointer(sctx, descs, R_00B030_SPI_SHADER_USER_DATA_PS_0);
/* HW VS stage only used in non-NGG mode. */
@@ -2231,8 +2243,9 @@ void si_emit_graphics_shader_pointers(struct si_context *sctx)
if (sctx->gs_attribute_ring_pointer_dirty) {
assert(sctx->gfx_level >= GFX11);
radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4,
sctx->screen->attribute_ring->gpu_address);
radeon_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 +
GFX9_SGPR_ATTRIBUTE_RING_ADDR * 4,
sctx->screen->attribute_ring->gpu_address);
sctx->gs_attribute_ring_pointer_dirty = false;
}
radeon_end();
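
The split above exists because one SET_SH_REG header covers only a contiguous register range, so the pre-GFX11 walk must find runs of consecutive dirty descriptors, while each packed pair carries its own offset and the GFX11 walk can take dirty bits one at a time. A standalone sketch of the two walks (simplified fixed mask; GCC/Clang builtins assumed):

#include <stdio.h>

int main(void)
{
   unsigned mask = 0xB; /* descriptors 0, 1 and 3 dirty */

   /* GFX11 style (u_foreach_bit): one buffered write per set bit. */
   for (unsigned m = mask; m; m &= m - 1)
      printf("push descriptor %d\n", __builtin_ctz(m));

   /* Pre-GFX11 style (u_bit_scan_consecutive_range): one packet per run. */
   for (unsigned m = mask; m;) {
      int start = __builtin_ctz(m), count = 0;
      while ((m >> (start + count)) & 1)
         count++;
      m &= ~(((1u << count) - 1) << start);
      printf("SET_SH_REG start=%d count=%d\n", start, count);
   }
   return 0;
}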

src/gallium/drivers/radeonsi/si_gfx_cs.c

@@ -545,6 +545,9 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
ctx->last_tes_sh_base = -1;
ctx->last_num_tcs_input_cp = -1;
assert(ctx->num_buffered_gfx_sh_regs == 0);
ctx->num_buffered_gfx_sh_regs = 0;
if (ctx->scratch_buffer) {
si_context_add_resource_size(ctx, &ctx->scratch_buffer->b.b);
si_mark_atom_dirty(ctx, &ctx->atoms.s.scratch_state);

src/gallium/drivers/radeonsi/si_pipe.h

@@ -910,6 +910,20 @@ struct si_vertex_state {
uint32_t descriptors[4 * SI_MAX_ATTRIBS];
};
/* The structure layout is identical to a pair of registers in SET_*_REG_PAIRS_PACKED. */
struct si_sh_reg_pair {
union {
/* A pair of register offsets. */
struct {
uint16_t reg_offset[2];
};
/* The same pair of register offsets as a dword. */
uint32_t reg_offsets;
};
/* A pair of register values for the register offsets above. */
uint32_t reg_value[2];
};
typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe,
const struct pipe_draw_info *info,
unsigned drawid_offset,
@@ -1017,6 +1031,9 @@ struct si_context {
unsigned dirty_states;
union si_state queued;
union si_state emitted;
/* Gfx11+: Buffered SH registers for SET_SH_REG_PAIRS_PACKED*. */
unsigned num_buffered_gfx_sh_regs;
struct si_sh_reg_pair buffered_gfx_sh_regs[32];
/* Atom declarations. */
struct si_framebuffer framebuffer;
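
Because the comment above ties this layout to the packet encoding, a compile-time check (hypothetical, not part of the commit) pins down the invariant that lets radeon_emit_array() copy pairs straight into the command stream, 3 dwords per 2 registers:

#include <stdint.h>

struct si_sh_reg_pair {
   union {
      struct {
         uint16_t reg_offset[2];
      };
      uint32_t reg_offsets;
   };
   uint32_t reg_value[2];
};

/* 1 packed-offsets dword + 2 value dwords per pair of registers. */
_Static_assert(sizeof(struct si_sh_reg_pair) == 3 * sizeof(uint32_t),
               "si_sh_reg_pair must match the SET_SH_REG_PAIRS_PACKED encoding");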

src/gallium/drivers/radeonsi/si_state_draw.cpp

@@ -855,7 +855,20 @@ static void si_emit_tess_io_layout_state(struct si_context *sctx)
if (!sctx->shader.tes.cso || !sctx->shader.tcs.current)
return;
if (sctx->gfx_level >= GFX9) {
if (sctx->gfx_level >= GFX11) {
radeon_opt_push_gfx_sh_reg(R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2);
/* Set userdata SGPRs for merged LS-HS. */
radeon_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
GFX9_SGPR_TCS_OFFCHIP_LAYOUT * 4,
SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_LAYOUT,
sctx->tcs_offchip_layout);
radeon_opt_push_gfx_sh_reg(R_00B430_SPI_SHADER_USER_DATA_HS_0 +
GFX9_SGPR_TCS_OFFCHIP_ADDR * 4,
SI_TRACKED_SPI_SHADER_USER_DATA_HS__TCS_OFFCHIP_ADDR,
sctx->tes_offchip_ring_va_sgpr);
} else if (sctx->gfx_level >= GFX9) {
radeon_opt_set_sh_reg(sctx, R_00B42C_SPI_SHADER_PGM_RSRC2_HS,
SI_TRACKED_SPI_SHADER_PGM_RSRC2_HS, sctx->ls_hs_rsrc2);
@@ -890,9 +903,16 @@ static void si_emit_tess_io_layout_state(struct si_context *sctx)
/* These can't be optimized because the user data SGPRs may have different meaning
* without tessellation. (they are VS and ES/GS user data SGPRs)
*/
radeon_set_sh_reg_seq(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2);
radeon_emit(sctx->tcs_offchip_layout);
radeon_emit(sctx->tes_offchip_ring_va_sgpr);
if (sctx->gfx_level >= GFX11) {
radeon_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4,
sctx->tcs_offchip_layout);
radeon_push_gfx_sh_reg(tes_sh_base + SI_SGPR_TES_OFFCHIP_ADDR * 4,
sctx->tes_offchip_ring_va_sgpr);
} else {
radeon_set_sh_reg_seq(tes_sh_base + SI_SGPR_TES_OFFCHIP_LAYOUT * 4, 2);
radeon_emit(sctx->tcs_offchip_layout);
radeon_emit(sctx->tes_offchip_ring_va_sgpr);
}
radeon_end();
radeon_begin_again(cs);
@@ -1238,23 +1258,23 @@ static void si_emit_vs_state(struct si_context *sctx, unsigned index_size)
radeon_begin(cs);
if (HAS_GS) {
radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, vs_state);
radeon_set_or_push_gfx_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, vs_state);
/* GS always uses the state bits for emulating VGT_ESGS_RING_ITEMSIZE on Gfx9
* (via nir_load_esgs_vertex_stride_amd) and for emulating GS pipeline statistics
* on gfx10.x. NGG GS also has lots of states in there.
*/
if (GFX_VERSION >= GFX9)
radeon_set_sh_reg(gs_base + SI_SGPR_VS_STATE_BITS * 4, gs_state);
radeon_set_or_push_gfx_sh_reg(gs_base + SI_SGPR_VS_STATE_BITS * 4, gs_state);
/* The GS copy shader (for legacy GS) always uses the state bits. */
if (!NGG)
radeon_set_sh_reg(gs_copy_base + SI_SGPR_VS_STATE_BITS * 4, gs_state);
radeon_set_or_push_gfx_sh_reg(gs_copy_base + SI_SGPR_VS_STATE_BITS * 4, gs_state);
} else if (HAS_TESS) {
radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, vs_state);
radeon_set_sh_reg(tes_base + SI_SGPR_VS_STATE_BITS * 4, NGG ? gs_state : vs_state);
radeon_set_or_push_gfx_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, vs_state);
radeon_set_or_push_gfx_sh_reg(tes_base + SI_SGPR_VS_STATE_BITS * 4, NGG ? gs_state : vs_state);
} else {
radeon_set_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, NGG ? gs_state : vs_state);
radeon_set_or_push_gfx_sh_reg(vs_base + SI_SGPR_VS_STATE_BITS * 4, NGG ? gs_state : vs_state);
}
radeon_end();
@@ -1454,6 +1474,49 @@ static void si_emit_draw_registers(struct si_context *sctx,
radeon_end();
}
static ALWAYS_INLINE void
gfx11_emit_buffered_sh_regs_inline(struct si_context *sctx, unsigned *num_regs,
struct si_sh_reg_pair *reg_pairs)
{
unsigned reg_count = *num_regs;
if (!reg_count)
return;
*num_regs = 0;
/* If there is only one register, we can't use the packed SET packet. */
if (reg_count == 1) {
radeon_begin(&sctx->gfx_cs);
radeon_emit(PKT3(PKT3_SET_SH_REG, 1, 0));
radeon_emit(reg_pairs[0].reg_offset[0]);
radeon_emit(reg_pairs[0].reg_value[0]);
radeon_end();
return;
}
unsigned packet = reg_count <= 14 ? PKT3_SET_SH_REG_PAIRS_PACKED_N :
PKT3_SET_SH_REG_PAIRS_PACKED;
unsigned padded_reg_count = align(reg_count, 2);
radeon_begin(&sctx->gfx_cs);
radeon_emit(PKT3(packet, (padded_reg_count / 2) * 3, 0) | PKT3_RESET_FILTER_CAM_S(1));
radeon_emit(padded_reg_count);
radeon_emit_array(reg_pairs, (reg_count / 2) * 3);
if (reg_count % 2 == 1) {
unsigned i = reg_count / 2;
/* Pad the packet by setting the first register again at the end because the register
* count must be even and 2 consecutive offsets must not be equal.
*/
radeon_emit(reg_pairs[i].reg_offset[0] | ((uint32_t)reg_pairs[0].reg_offset[0] << 16));
radeon_emit(reg_pairs[i].reg_value[0]);
radeon_emit(reg_pairs[0].reg_value[0]);
}
radeon_end();
}
#define EMIT_SQTT_END_DRAW \
do { \
if (GFX_VERSION >= GFX9 && unlikely(sctx->sqtt_enabled)) { \
@@ -1605,6 +1668,13 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
assert(indirect_va % 8 == 0);
if (GFX_VERSION >= GFX11) {
radeon_end();
gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_gfx_sh_regs,
sctx->buffered_gfx_sh_regs);
radeon_begin_again(cs);
}
si_invalidate_draw_constants(sctx);
radeon_emit(PKT3(PKT3_SET_BASE, 2, 0));
@@ -1674,43 +1744,80 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
bool set_draw_id = !IS_DRAW_VERTEX_STATE && sctx->vs_uses_draw_id;
bool set_base_instance = sctx->vs_uses_base_instance;
bool is_blit = !IS_DRAW_VERTEX_STATE && sctx->num_vs_blit_sgprs;
if (!IS_DRAW_VERTEX_STATE && sctx->num_vs_blit_sgprs) {
if (!is_blit) {
/* Prefer SET_SH_REG_PAIRS_PACKED* on Gfx11+. */
if (GFX_VERSION >= GFX11) {
if (base_vertex != sctx->last_base_vertex ||
sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN) {
radeon_push_gfx_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, base_vertex);
sctx->last_base_vertex = base_vertex;
}
if (set_draw_id &&
(drawid_base != sctx->last_drawid ||
sctx->last_drawid == SI_DRAW_ID_UNKNOWN)) {
radeon_push_gfx_sh_reg(sh_base_reg + SI_SGPR_DRAWID * 4, drawid_base);
sctx->last_drawid = drawid_base;
}
if (set_base_instance &&
(info->start_instance != sctx->last_start_instance ||
sctx->last_start_instance == SI_START_INSTANCE_UNKNOWN)) {
radeon_push_gfx_sh_reg(sh_base_reg + SI_SGPR_START_INSTANCE * 4,
info->start_instance);
sctx->last_start_instance = info->start_instance;
}
} else if (base_vertex != sctx->last_base_vertex ||
sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
(set_base_instance &&
(info->start_instance != sctx->last_start_instance ||
sctx->last_start_instance == SI_START_INSTANCE_UNKNOWN)) ||
(set_draw_id &&
(drawid_base != sctx->last_drawid ||
sctx->last_drawid == SI_DRAW_ID_UNKNOWN)) ||
sh_base_reg != sctx->last_sh_base_reg) {
if (set_base_instance) {
radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3);
radeon_emit(base_vertex);
radeon_emit(drawid_base);
radeon_emit(info->start_instance);
sctx->last_start_instance = info->start_instance;
sctx->last_drawid = drawid_base;
} else if (set_draw_id) {
radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2);
radeon_emit(base_vertex);
radeon_emit(drawid_base);
sctx->last_drawid = drawid_base;
} else {
radeon_set_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, base_vertex);
}
sctx->last_base_vertex = base_vertex;
sctx->last_sh_base_reg = sh_base_reg;
}
}
if (GFX_VERSION >= GFX11) {
radeon_end();
gfx11_emit_buffered_sh_regs_inline(sctx, &sctx->num_buffered_gfx_sh_regs,
sctx->buffered_gfx_sh_regs);
radeon_begin_again(cs);
}
/* Blit SGPRs must be set after gfx11_emit_buffered_sh_regs_inline because they can
* overwrite them.
*/
if (is_blit) {
/* Re-emit draw constants after we leave u_blitter. */
si_invalidate_draw_sh_constants(sctx);
/* Blit VS doesn't use BASE_VERTEX, START_INSTANCE, and DRAWID. */
radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_VS_BLIT_DATA * 4, sctx->num_vs_blit_sgprs);
radeon_emit_array(sctx->vs_blit_sh_data, sctx->num_vs_blit_sgprs);
} else if (base_vertex != sctx->last_base_vertex ||
sctx->last_base_vertex == SI_BASE_VERTEX_UNKNOWN ||
(set_base_instance &&
(info->start_instance != sctx->last_start_instance ||
sctx->last_start_instance == SI_START_INSTANCE_UNKNOWN)) ||
(set_draw_id &&
(drawid_base != sctx->last_drawid ||
sctx->last_drawid == SI_DRAW_ID_UNKNOWN)) ||
sh_base_reg != sctx->last_sh_base_reg) {
if (set_base_instance) {
radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 3);
radeon_emit(base_vertex);
radeon_emit(drawid_base);
radeon_emit(info->start_instance);
sctx->last_start_instance = info->start_instance;
sctx->last_drawid = drawid_base;
} else if (set_draw_id) {
radeon_set_sh_reg_seq(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, 2);
radeon_emit(base_vertex);
radeon_emit(drawid_base);
sctx->last_drawid = drawid_base;
} else {
radeon_set_sh_reg(sh_base_reg + SI_SGPR_BASE_VERTEX * 4, base_vertex);
}
sctx->last_base_vertex = base_vertex;
sctx->last_sh_base_reg = sh_base_reg;
}
/* Don't update draw_id in the following code if it doesn't increment. */
@@ -2018,7 +2125,7 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
unsigned vb_desc_offset =
sh_base + get_vb_descriptor_sgpr_ptr_offset<GFX_VERSION, HAS_TESS, HAS_GS, NGG>();
radeon_set_sh_reg(vb_desc_offset, vb_descriptors_address);
radeon_set_or_push_gfx_sh_reg(vb_desc_offset, vb_descriptors_address);
/* the first iteration always executes */
do {
@@ -2072,7 +2179,7 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
unsigned vb_desc_ptr_offset =
sh_base + get_vb_descriptor_sgpr_ptr_offset<GFX_VERSION, HAS_TESS, HAS_GS, NGG>();
radeon_begin(&sctx->gfx_cs);
radeon_set_sh_reg(vb_desc_ptr_offset, vb_descriptors_address);
radeon_set_or_push_gfx_sh_reg(vb_desc_ptr_offset, vb_descriptors_address);
radeon_end();
}
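
To make the padding rule in gfx11_emit_buffered_sh_regs_inline() concrete, here is the dword body such a flush would produce for three buffered registers (offsets and values invented): the odd tail is paired with register 0 again, keeping the count even and the two offsets within the last offsets dword distinct:

#include <stdint.h>
#include <stdio.h>

struct pair { uint16_t off[2]; uint32_t val[2]; };

int main(void)
{
   struct pair p[2] = {
      { { 0x40, 0x41 }, { 0x11111111, 0x22222222 } },
      { { 0x55, 0 },    { 0x33333333, 0 } },
   };
   unsigned count = 3, padded = (count + 1) & ~1u; /* align(count, 2) */

   printf("%08x              (padded register count)\n", padded);
   printf("%08x %08x %08x  (regs 0 and 1: packed offsets, then values)\n",
          p[0].off[0] | (uint32_t)p[0].off[1] << 16, p[0].val[0], p[0].val[1]);
   /* Tail: reg 2's offset packed with reg 0's; rewriting reg 0 is harmless. */
   printf("%08x %08x %08x  (reg 2 padded with reg 0)\n",
          p[1].off[0] | (uint32_t)p[0].off[0] << 16, p[1].val[0], p[0].val[0]);
   return 0;
}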

src/gallium/drivers/radeonsi/si_state_shaders.cpp

@@ -1186,12 +1186,22 @@ static void gfx10_emit_shader_ngg_tail(struct si_context *sctx, struct si_shader
radeon_begin_again(&sctx->gfx_cs);
radeon_opt_set_uconfig_reg(sctx, R_030980_GE_PC_ALLOC, SI_TRACKED_GE_PC_ALLOC,
shader->ngg.ge_pc_alloc);
radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
shader->ngg.spi_shader_pgm_rsrc3_gs);
radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
shader->ngg.spi_shader_pgm_rsrc4_gs);
if (sctx->gfx_level >= GFX11) {
assert(!sctx->screen->info.uses_kernel_cu_mask);
radeon_opt_push_gfx_sh_reg(R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
shader->ngg.spi_shader_pgm_rsrc3_gs);
radeon_opt_push_gfx_sh_reg(R_00B204_SPI_SHADER_PGM_RSRC4_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
shader->ngg.spi_shader_pgm_rsrc4_gs);
} else {
radeon_opt_set_sh_reg_idx3(sctx, R_00B21C_SPI_SHADER_PGM_RSRC3_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC3_GS,
shader->ngg.spi_shader_pgm_rsrc3_gs);
radeon_opt_set_sh_reg_idx3(sctx, R_00B204_SPI_SHADER_PGM_RSRC4_GS,
SI_TRACKED_SPI_SHADER_PGM_RSRC4_GS,
shader->ngg.spi_shader_pgm_rsrc4_gs);
}
radeon_end();
}

src/gallium/drivers/radeonsi/si_state_viewport.c

@@ -90,10 +90,17 @@ static void si_emit_cull_state(struct si_context *sctx)
/* This will end up in SGPR6 as (value << 8), shifted by the hw. */
radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, sctx->small_prim_cull_info_buf,
RADEON_USAGE_READ | RADEON_PRIO_CONST_BUFFER);
radeon_begin(&sctx->gfx_cs);
radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + GFX9_SGPR_SMALL_PRIM_CULL_INFO * 4,
sctx->small_prim_cull_info_address);
radeon_end();
if (sctx->gfx_level >= GFX11) {
radeon_push_gfx_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 +
GFX9_SGPR_SMALL_PRIM_CULL_INFO * 4,
sctx->small_prim_cull_info_address);
} else {
radeon_begin(&sctx->gfx_cs);
radeon_set_sh_reg(R_00B230_SPI_SHADER_USER_DATA_GS_0 + GFX9_SGPR_SMALL_PRIM_CULL_INFO * 4,
sctx->small_prim_cull_info_address);
radeon_end();
}
/* Better subpixel precision increases the efficiency of small
* primitive culling. (more precision means a tighter bounding box