Merge branch 'pan_blit_with_run_fullscreen' into 'main'

Draft: panfrost: Blit with the RUN_FULLSCREEN instruction

See merge request mesa/mesa!40124
This commit is contained in:
Loïc Molinari 2026-03-11 05:57:36 +01:00
commit 6efb3cb766
12 changed files with 336 additions and 24 deletions

View file

@ -72,13 +72,11 @@ struct blitter_context
*
* \param type Semantics of the attributes "attrib".
* If type is UTIL_BLITTER_ATTRIB_NONE, ignore them.
* If type is UTIL_BLITTER_ATTRIB_COLOR, the attributes
* make up a constant RGBA color, and should go
* to the GENERIC0 varying slot of a fragment shader.
* If type is UTIL_BLITTER_ATTRIB_TEXCOORD, {a1, a2} and
* {a3, a4} specify top-left and bottom-right texture
* coordinates of the rectangle, respectively, and should go
* to the GENERIC0 varying slot of a fragment shader.
* If type is UTIL_BLITTER_ATTRIB_TEXCOORD_XY or
* UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW, attrib stores the
* 2-component or 4-component texture coordinates of the
* rectangle, and should go to the GENERIC0 varying slot of a
* fragment shader.
*
* \param attrib See type.
*

View file

@ -10,6 +10,97 @@
#include "pan_trace.h"
#include "pan_util.h"
/* XXX Using a custom fragment shader for textured blits would make it
 * possible to get rid of the texcoords (and the varying interpolation) by
 * using the fragment position instead.
 * XXX Depth and stencil clears with the same value could reuse a common
 * DrawCallDescriptor.
 */
/* Affine transform of a coordinate: `coord` is cast to float, multiplied by
 * `scale` and then offset by `translate`. Every argument is parenthesized, so
 * expressions such as `-x1` can be passed directly. */
#define TRANSFORM_TXF(coord, scale, translate) \
((float)(coord) * (scale) + (translate))
/**
 * Blitter draw_rectangle hook that renders the rectangle through the screen's
 * draw_fullscreen entry point instead of a regular vertex-buffer draw.
 *
 * \param blitter              Blitter context owning the pipe context.
 * \param vertex_elements_cso  Only forwarded to the util_blitter fallback.
 * \param get_vs               Returns the vertex shader to bind for the draw.
 * \param x1,y1,x2,y2          Destination rectangle in framebuffer pixels.
 * \param depth                Only forwarded to the util_blitter fallback.
 * \param num_instances        Must be non-zero; values above 1 take the
 *                             util_blitter_draw_rectangle() fallback.
 * \param type                 NONE, TEXCOORD_XY or TEXCOORD_XYZW.
 * \param attrib               Texture coordinates; only read when type is not
 *                             UTIL_BLITTER_ATTRIB_NONE.
 */
static void
panfrost_blitter_draw_rectangle(struct blitter_context *blitter,
                                void *vertex_elements_cso,
                                blitter_get_vs_func get_vs,
                                int x1, int y1, int x2, int y2,
                                float depth, unsigned num_instances,
                                enum blitter_attrib_type type,
                                const struct blitter_attrib *attrib)
{
   assert(num_instances);
   assert(type == UTIL_BLITTER_ATTRIB_NONE ||
          type == UTIL_BLITTER_ATTRIB_TEXCOORD_XY ||
          type == UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW);

   /* Fall back to draw_vbo. The fallback fully handles the draw, so return
    * instead of also issuing a fullscreen draw for the same rectangle. */
   if (num_instances > 1) {
      util_blitter_draw_rectangle(blitter, vertex_elements_cso, get_vs, x1,
                                  y1, x2, y2, depth, num_instances, type,
                                  attrib);
      return;
   }

   struct pipe_context *ctx = blitter->pipe;
   struct panfrost_context *pctx = pan_context(ctx);
   struct panfrost_screen *scr = pan_screen(ctx->screen);

   /* Set the viewport so that it maps to the dest rect of the framebuffer.
    * The tiler will then be configured to use it as scissor box in order to
    * clip fullscreen fragments lying outside.
    *
    * Note that: tx = x1 + ((x2 - x1) / 2) = (x2 + x1) / 2
    *            ty = y1 + ((y2 - y1) / 2) = (y2 + y1) / 2
    */
   const struct pipe_viewport_state viewport_state = {
      .scale = { 0.5f * (x2 - x1), 0.5f * (y2 - y1), 1.0f },
      .translate = { 0.5f * (x2 + x1), 0.5f * (y2 + y1), 0.0f },
      .swizzle_x = PIPE_VIEWPORT_SWIZZLE_POSITIVE_X,
      .swizzle_y = PIPE_VIEWPORT_SWIZZLE_POSITIVE_Y,
      .swizzle_z = PIPE_VIEWPORT_SWIZZLE_POSITIVE_Z,
      .swizzle_w = PIPE_VIEWPORT_SWIZZLE_POSITIVE_W
   };
   ctx->set_viewport_states(ctx, 0, 1, &viewport_state);

   /* Map fullscreen texcoords to the framebuffer, then normalize if not using
    * texel fetch. Zero-initialized so that no indeterminate values are handed
    * to the backend when there are no texcoords to compute. */
   struct blitter_attrib fs_attrib = {0};

   /* The attribute semantics are carried by `type`; `attrib` is only a
    * pointer and may legitimately be NULL when type is NONE. */
   if (type != UTIL_BLITTER_ATTRIB_NONE) {
      assert(attrib);

      float w = pctx->pipe_framebuffer.width;
      float h = pctx->pipe_framebuffer.height;

      /* NOTE(review): the incoming x1/y1/x2/y2 texcoords only feed the
       * normalization test below; the generated coordinates are derived from
       * the destination origin and the framebuffer size alone. Confirm that
       * scaled or offset source rectangles are not expected on this path. */
      fs_attrib.texcoord.x1 = TRANSFORM_TXF(0.0f, w, -x1);
      fs_attrib.texcoord.y1 = TRANSFORM_TXF(0.0f, h, -y1);
      fs_attrib.texcoord.x2 = TRANSFORM_TXF(1.0f, w, -x1);
      fs_attrib.texcoord.y2 = TRANSFORM_TXF(1.0f, h, -y1);
      fs_attrib.texcoord.z = attrib->texcoord.z;
      fs_attrib.texcoord.w = attrib->texcoord.w;

      if (attrib->texcoord.x2 <= 1.0f && attrib->texcoord.y2 <= 1.0f) {
         float inv_w = 1.0f / w;
         float inv_h = 1.0f / h;

         fs_attrib.texcoord.x1 *= inv_w;
         fs_attrib.texcoord.y1 *= inv_h;
         fs_attrib.texcoord.x2 *= inv_w;
         fs_attrib.texcoord.y2 *= inv_h;
      }
   }

   /* Run tiling of a fullscreen fragment job using RUN_FULLSCREEN. */
   scr->vtbl.draw_fullscreen(pctx, get_vs(blitter), type, &fs_attrib);
}
/**
 * Create the blitter for a panfrost context.
 *
 * Wraps util_blitter_create() and, when the device architecture is 10,
 * replaces the draw_rectangle callback with
 * panfrost_blitter_draw_rectangle().
 *
 * \param pipe  Context the blitter is created for.
 * \return The blitter, or NULL if util_blitter_create() returned NULL.
 */
struct blitter_context *
panfrost_blitter_create(struct pipe_context *pipe)
{
   struct blitter_context *blitter = util_blitter_create(pipe);

   /* Propagate a creation failure to the caller instead of dereferencing a
    * NULL blitter below. */
   if (!blitter)
      return NULL;

   if (pan_screen(pipe->screen)->dev.arch == 10)
      blitter->draw_rectangle = panfrost_blitter_draw_rectangle;

   return blitter;
}
void
panfrost_blitter_save(struct panfrost_context *ctx,
const enum panfrost_blitter_op blitter_op)

View file

@ -3332,13 +3332,12 @@ panfrost_increase_vertex_count(struct panfrost_batch *batch, uint32_t increment)
* because all dirty flags are set there.
*/
static void
panfrost_update_active_prim(struct panfrost_context *ctx,
const struct pipe_draw_info *info)
panfrost_update_active_prim(struct panfrost_context *ctx, enum mesa_prim prim)
{
const enum mesa_prim prev_prim = u_reduced_prim(ctx->active_prim);
const enum mesa_prim new_prim = u_reduced_prim(info->mode);
const enum mesa_prim new_prim = u_reduced_prim(prim);
ctx->active_prim = info->mode;
ctx->active_prim = prim;
if ((ctx->dirty & PAN_DIRTY_RASTERIZER) ||
(prev_prim != new_prim)) {
@ -3405,7 +3404,7 @@ panfrost_single_draw_direct(struct panfrost_batch *batch,
struct panfrost_context *ctx = batch->ctx;
panfrost_update_active_prim(ctx, info);
panfrost_update_active_prim(ctx, info->mode);
/* Take into account a negative bias */
ctx->vertex_count =
@ -3480,7 +3479,7 @@ panfrost_compatible_batch_state(struct panfrost_batch *batch,
}
static struct panfrost_batch *
prepare_draw(struct pipe_context *pipe, const struct pipe_draw_info *info)
prepare_draw(struct pipe_context *pipe, enum mesa_prim prim)
{
struct panfrost_context *ctx = pan_context(pipe);
struct panfrost_device *dev = pan_device(pipe->screen);
@ -3502,7 +3501,7 @@ prepare_draw(struct pipe_context *pipe, const struct pipe_draw_info *info)
return NULL;
}
enum mesa_prim reduced_prim = u_reduced_prim(info->mode);
enum mesa_prim reduced_prim = u_reduced_prim(prim);
if (unlikely(!panfrost_compatible_batch_state(batch, reduced_prim))) {
batch = panfrost_get_fresh_batch_for_fbo(ctx, "State change");
@ -3545,7 +3544,7 @@ panfrost_draw_indirect(struct pipe_context *pipe,
return;
}
struct panfrost_batch *batch = prepare_draw(pipe, info);
struct panfrost_batch *batch = prepare_draw(pipe, info->mode);
if (!batch) {
mesa_loge("prepare_draw failed");
return;
@ -3556,7 +3555,7 @@ panfrost_draw_indirect(struct pipe_context *pipe,
panfrost_batch_read_rsrc(batch, pan_resource(indirect->buffer),
MESA_SHADER_VERTEX);
panfrost_update_active_prim(ctx, &tmp_info);
panfrost_update_active_prim(ctx, info->mode);
ctx->drawid = drawid_offset;
@ -3594,7 +3593,7 @@ panfrost_multi_draw_direct(struct pipe_context *pipe,
unsigned num_draws)
{
struct panfrost_context *ctx = pan_context(pipe);
struct panfrost_batch *batch = prepare_draw(pipe, info);
struct panfrost_batch *batch = prepare_draw(pipe, info->mode);
if (!batch) {
mesa_loge("prepare_draw failed");
return;
@ -3637,6 +3636,44 @@ panfrost_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info,
}
}
/* Fullscreen draw entry point (installed as the screen vtable's
 * draw_fullscreen hook).
 *
 * Binds `vs` as the uncompiled vertex shader, refreshes the 3D and fragment
 * shader state for a MESA_PRIM_QUADS draw, then hands `type` and `attrib`
 * over to the per-backend launch_draw_fullscreen job. Conditional rendering,
 * active queries and stream-out targets are asserted to be absent. */
static void
panfrost_draw_fullscreen(struct panfrost_context *ctx,
                         struct panfrost_uncompiled_shader *vs,
                         enum blitter_attrib_type type,
                         const struct blitter_attrib *attrib)
{
   assert(!ctx->cond_query);
   assert(!ctx->active_queries);
   assert(!ctx->streamout.num_targets);

   PAN_TRACE_FUNC(PAN_TRACE_GL_CMDSTREAM);

   ctx->draw_calls += 1;

   struct pipe_context *pipe = &ctx->base;
   struct panfrost_batch *draw_batch = prepare_draw(pipe, MESA_PRIM_QUADS);

   if (draw_batch == NULL) {
      mesa_loge("prepare_draw failed");
      return;
   }

   /* RUN_FULLSCREEN runs neither a position nor a varying shader, yet the
    * link info of the vertex shader is still required. Updating the active
    * primitive also refreshes the fragment shader variant. */
   ctx->uncompiled[MESA_SHADER_VERTEX] = vs;
   panfrost_update_shader_variant(ctx, MESA_SHADER_VERTEX);
   panfrost_update_active_prim(ctx, MESA_PRIM_QUADS);

   /* Drop the dirty vertex flag so that the state update below does not emit
    * any vertex info. */
   ctx->dirty = ctx->dirty & ~PAN_DIRTY_VERTEX;

   panfrost_update_state_3d(draw_batch);
   panfrost_update_shader_state(draw_batch, MESA_SHADER_FRAGMENT);
   panfrost_clean_state_3d(ctx);

   JOBX(launch_draw_fullscreen)(draw_batch, type, attrib);
   draw_batch->draw_count += 1;
}
/* Launch grid is the compute equivalent of draw_vbo, so in this routine, we
* construct the COMPUTE job and some of its payload.
*/
@ -4655,6 +4692,7 @@ GENX(panfrost_cmdstream_screen_init)(struct panfrost_screen *screen)
screen->vtbl.emit_write_timestamp = emit_write_timestamp;
screen->vtbl.select_tile_size = GENX(pan_select_tile_size);
screen->vtbl.get_conv_desc = get_conv_desc;
screen->vtbl.draw_fullscreen = panfrost_draw_fullscreen;
pan_blend_shader_cache_init(&dev->blend_shaders, panfrost_device_gpu_id(dev),
dev->kmod.dev->props.gpu_variant,

View file

@ -1100,7 +1100,7 @@ panfrost_create_context(struct pipe_screen *screen, void *priv, unsigned flags)
goto failed;
}
ctx->blitter = util_blitter_create(gallium);
ctx->blitter = panfrost_blitter_create(gallium);
ctx->writers = _mesa_hash_table_create(gallium, _mesa_hash_pointer,
_mesa_key_pointer_equal);

View file

@ -1461,6 +1461,163 @@ GENX(csf_launch_draw_indirect)(struct panfrost_batch *batch,
}
}
#if PAN_ARCH == 10
static struct pan_ptr
panfrost_emit_fullscreen_vertex_array(struct panfrost_batch *batch,
enum blitter_attrib_type type,
const struct blitter_attrib *attrib,
size_t *packet_stride,
size_t *attribute_stride)
{
struct pan_ptr array = { .cpu = NULL, .gpu = 0 };
if (type != UTIL_BLITTER_ATTRIB_TEXCOORD_XY &&
type != UTIL_BLITTER_ATTRIB_TEXCOORD_XYZW)
return array;
/* In RUN_IDVS malloc mode, the tiler is configured to allocate memory
* itself for the vertex shader. RUN_FULLSCREEN can also interpolate
* varyings but from a preallocated vertex array. Allocate a packet of 64
* vertices (like the tiler to respect cache-line alignment) and store the
* texcoords into varying slot 0. */
const size_t position_sz = 4 * sizeof(float);
struct varying { struct { float s, t, r, q; } slot0; };
const size_t packet_sz = position_sz + sizeof(struct varying);
array = pan_pool_alloc_aligned(&batch->pool.base, 64 * packet_sz, 64);
struct varying *varyings = (struct varying *)
((uint8_t *)array.cpu + 64 * position_sz);
varyings[0].slot0.s = attrib->texcoord.x1;
varyings[0].slot0.t = attrib->texcoord.y1;
varyings[0].slot0.r = attrib->texcoord.z;
varyings[0].slot0.q = attrib->texcoord.w;
varyings[1].slot0.s = attrib->texcoord.x2;
varyings[1].slot0.t = attrib->texcoord.y1;
varyings[1].slot0.r = attrib->texcoord.z;
varyings[1].slot0.q = attrib->texcoord.w;
varyings[2].slot0.s = attrib->texcoord.x1;
varyings[2].slot0.t = attrib->texcoord.y2;
varyings[2].slot0.r = attrib->texcoord.z;
varyings[2].slot0.q = attrib->texcoord.w;
*packet_stride = packet_sz;
*attribute_stride = sizeof(struct varying);
return array;
}
/* Allocate a DRAW descriptor from the batch pool and pack it for a fullscreen
 * draw: fragment flags, the optional vertex array, depth/stencil and blend
 * descriptors, and the fragment shader environment.
 *
 * The vertex array fields are only filled in when `vertex_array.cpu` is
 * non-NULL. Returns the allocated descriptor. */
static struct pan_ptr
panfrost_emit_fullscreen_dcd(struct panfrost_batch *batch,
                             struct pan_ptr vertex_array,
                             size_t packet_stride, size_t attribute_stride,
                             uint64_t resources)
{
   struct panfrost_context *ctx = batch->ctx;
   struct panfrost_compiled_shader *fs = ctx->prog[MESA_SHADER_FRAGMENT];
   struct pan_ptr dcd = pan_pool_alloc_desc(&batch->pool.base, DRAW);

   /* State read several times while packing. */
   const bool msaa = ctx->rasterizer->base.multisample;
   const bool alpha_to_coverage = ctx->blend->base.alpha_to_coverage;
   const bool has_vertex_array = vertex_array.cpu != NULL;

   struct pan_earlyzs_state earlyzs =
      pan_earlyzs_get(fs->earlyzs, ctx->depth_stencil->writes_zs,
                      alpha_to_coverage,
                      ctx->depth_stencil->zs_always_passes,
                      PAN_EARLYZS_ZS_TILEBUF_NOT_READ);

   pan_cast_and_pack(dcd.cpu, DRAW, cfg) {
      /* Flags 0: multisampling */
      cfg.flags_0.multisample_enable = msaa;
      cfg.flags_0.evaluate_per_sample =
         msaa && ((ctx->min_samples > 1) || fs->info.fs.sample_shading ||
                  ctx->valhall_has_blend_shader);

      /* Flags 0: early-ZS and pixel kill */
      cfg.flags_0.pixel_kill_operation = (enum mali_pixel_kill)earlyzs.kill;
      cfg.flags_0.zs_update_operation = (enum mali_pixel_kill)earlyzs.update;
      cfg.flags_0.allow_forward_pixel_to_kill =
         pan_allow_forward_pixel_to_kill(ctx, fs);
      cfg.flags_0.allow_forward_pixel_to_be_killed = !fs->info.writes_global;

      /* Flags 0: overdraw and coverage */
      cfg.flags_0.overdraw_alpha0 = panfrost_overdraw_alpha(ctx, 0);
      cfg.flags_0.overdraw_alpha1 = panfrost_overdraw_alpha(ctx, 1);
      cfg.flags_0.shader_modifies_coverage =
         fs->info.fs.writes_coverage || fs->info.fs.can_discard ||
         alpha_to_coverage;
      cfg.flags_0.alpha_to_coverage = alpha_to_coverage;

      /* Flags 1 */
      cfg.flags_1.sample_mask = msaa ? ctx->sample_mask : 0xFFFF;
      cfg.flags_1.render_target_mask =
         (fs->info.outputs_written >> FRAG_RESULT_DATA0) & ctx->fb_rt_mask;

      /* Vertex array, left untouched when there is none */
      if (has_vertex_array) {
         cfg.vertex_array.packet = true;
         cfg.vertex_array.pointer = vertex_array.gpu;
         cfg.vertex_array.vertex_packet_stride = packet_stride;
         cfg.vertex_array.vertex_attribute_stride = attribute_stride;
      }

      /* Depth range, depth/stencil and blend descriptors */
      cfg.minimum_z = batch->minimum_z;
      cfg.maximum_z = batch->maximum_z;
      cfg.depth_stencil = batch->depth_stencil;
      cfg.blend_count = MAX2(batch->key.nr_cbufs, 1);
      cfg.blend = batch->blend;

      /* Fragment shader environment */
      cfg.shader.attribute_offset = 0;
      cfg.shader.fau_count =
         DIV_ROUND_UP(batch->nr_push_uniforms[MESA_SHADER_FRAGMENT], 2);
      cfg.shader.resources = resources;
      cfg.shader.shader = batch->rsd[MESA_SHADER_FRAGMENT];
      cfg.shader.thread_storage = batch->tls.gpu;
      cfg.shader.fau = batch->push_uniforms[MESA_SHADER_FRAGMENT];
   }

   return dcd;
}
#endif
/**
 * Emit a RUN_FULLSCREEN draw into the batch command stream.
 *
 * Builds the optional vertex array, the fragment resources and the draw call
 * descriptor, loads the FULLSCREEN staging registers (tiler context, scissor
 * box, tiler flags) and issues the RUN_FULLSCREEN instruction.
 *
 * Only implemented for PAN_ARCH == 10; any other architecture reaches
 * UNREACHABLE().
 *
 * \param batch   Batch to record the draw into.
 * \param type    Attribute semantics forwarded to the vertex array emission.
 * \param attrib  Attribute values forwarded to the vertex array emission.
 */
void
GENX(csf_launch_draw_fullscreen)(struct panfrost_batch *batch,
                                 enum blitter_attrib_type type,
                                 const struct blitter_attrib *attrib)
{
#if PAN_ARCH == 10
   PAN_TRACE_FUNC(PAN_TRACE_GL_CSF);

   struct cs_builder *b = batch->csf.cs.builder;

   /* First draw of the batch: set up the tiler OOM context and start the
    * vertex/tiler work. */
   if (batch->draw_count == 0) {
      emit_tiler_oom_context(b, batch);
      cs_vt_start(b, cs_now());
   }

   /* Build draw call. The strides start at zero because they are passed by
    * value to the descriptor emission even when the attribute type needs no
    * vertex array, and must not be indeterminate in that case. */
   size_t packet_stride = 0;
   size_t attribute_stride = 0;
   struct pan_ptr vertex_array = panfrost_emit_fullscreen_vertex_array(
      batch, type, attrib, &packet_stride, &attribute_stride);
   uint64_t resources = panfrost_emit_resources(batch, MESA_SHADER_FRAGMENT);
   struct pan_ptr dcd = panfrost_emit_fullscreen_dcd(
      batch, vertex_array, packet_stride, attribute_stride, resources);

   struct mali_primitive_flags_packed primitive_flags;
   pan_pack(&primitive_flags, PRIMITIVE_FLAGS, cfg) {
      cfg.scissor_array_enable = false;
      cfg.view_mask = 0;
   }

   /* Set input staging registers. */
   uint64_t *sbd = (uint64_t *)batch->scissor;
   cs_move64_to(b, cs_sr_reg64(b, FULLSCREEN, TILER_CTX),
                csf_get_tiler_desc(batch));
   cs_move64_to(b, cs_sr_reg64(b, FULLSCREEN, SCISSOR_BOX), *sbd);
   cs_move32_to(b, cs_sr_reg32(b, FULLSCREEN, TILER_FLAGS),
                primitive_flags.opaque[0]);

   /* Emit RUN_FULLSCREEN. */
   struct cs_index dcd_pointer = cs_reg64(b, 64);
   cs_move64_to(b, dcd_pointer, dcd.gpu);
   cs_run_fullscreen(b, 0, dcd_pointer);
#else
   UNREACHABLE("Unsupported architecture!");
#endif
}
#define POSITION_FIFO_SIZE (64 * 1024)
static enum drm_panthor_group_priority

View file

@ -83,6 +83,7 @@ struct panfrost_csf_context {
#if defined(PAN_ARCH) && PAN_ARCH >= 10
#include "genxml/gen_macros.h"
#include "util/u_blitter.h"
struct panfrost_batch;
struct panfrost_context;
@ -120,6 +121,9 @@ void GENX(csf_launch_draw_indirect)(struct panfrost_batch *batch,
const struct pipe_draw_info *info,
unsigned drawid_offset,
const struct pipe_draw_indirect_info *indirect);
/* Record a fullscreen draw (RUN_FULLSCREEN) into `batch`. `type` and `attrib`
 * describe the optional texture coordinates to interpolate. The definition is
 * only functional for PAN_ARCH == 10 and is UNREACHABLE otherwise. */
void GENX(csf_launch_draw_fullscreen)(struct panfrost_batch *batch,
enum blitter_attrib_type type,
const struct blitter_attrib *attrib);
void GENX(csf_emit_write_timestamp)(struct panfrost_batch *batch,
struct panfrost_resource *dst,

View file

@ -1020,6 +1020,14 @@ GENX(jm_launch_draw_indirect)(struct panfrost_batch *batch,
UNREACHABLE("draw indirect not implemented for jm");
}
/* JM backend stub for fullscreen draws: the operation is not implemented for
 * this backend, so reaching it is treated as a programming error and none of
 * the parameters are used. */
void
GENX(jm_launch_draw_fullscreen)(struct panfrost_batch *batch,
enum blitter_attrib_type type,
const struct blitter_attrib *attrib)
{
UNREACHABLE("draw fullscreen not implemented for jm");
}
void
GENX(jm_emit_write_timestamp)(struct panfrost_batch *batch,
struct panfrost_resource *dst, unsigned offset)

View file

@ -26,6 +26,7 @@ struct panfrost_jm_batch {
#if defined(PAN_ARCH) && PAN_ARCH < 10
#include "genxml/gen_macros.h"
#include "util/u_blitter.h"
struct panfrost_batch;
struct panfrost_context;
@ -75,6 +76,9 @@ void GENX(jm_launch_draw_indirect)(struct panfrost_batch *batch,
const struct pipe_draw_info *info,
unsigned drawid_offset,
const struct pipe_draw_indirect_info *indirect);
/* Fullscreen draw entry point of the JM backend. Not implemented for JM: the
 * definition is UNREACHABLE and must never be called. */
void GENX(jm_launch_draw_fullscreen)(struct panfrost_batch *batch,
enum blitter_attrib_type type,
const struct blitter_attrib *attrib);
void GENX(jm_emit_write_timestamp)(struct panfrost_batch *batch,
struct panfrost_resource *dst,

View file

@ -167,6 +167,8 @@ enum {
PAN_RENDER_CLEAR = PAN_SAVE_FRAGMENT_STATE | PAN_SAVE_FRAGMENT_CONSTANT,
};
/* Create the blitter for `pipe` through util_blitter_create(). When the device
 * architecture is 10, its draw_rectangle callback is replaced by panfrost's
 * own implementation. */
struct blitter_context *panfrost_blitter_create(struct pipe_context *pipe);
/* Callers should ensure that all AFBC/AFRC resources that will be used in the
* blit operation are legalized before calling blitter operations, otherwise
* we may trigger a recursive blit */

View file

@ -15,6 +15,7 @@
#include "util/disk_cache.h"
#include "util/log.h"
#include "util/set.h"
#include "util/u_blitter.h"
#include "util/u_dynarray.h"
#include "pan_device.h"
@ -30,6 +31,7 @@ struct panfrost_batch;
struct panfrost_context;
struct panfrost_resource;
struct panfrost_compiled_shader;
struct panfrost_uncompiled_shader;
struct pan_fb_info;
struct pan_blend_state;
@ -92,6 +94,12 @@ struct panfrost_vtable {
/* construct a render target blend descriptor */
uint64_t (*get_conv_desc)(enum pipe_format fmt, unsigned rt,
unsigned force_size, bool dithered);
/* Run a fullscreen draw call (for blits) */
void (*draw_fullscreen)(struct panfrost_context *ctx,
struct panfrost_uncompiled_shader *vs,
enum blitter_attrib_type type,
const struct blitter_attrib *attrib);
};
struct panfrost_screen {

View file

@ -920,6 +920,12 @@
<value name="PRIMITIVE_SIZE" value="60"/>
</enum>
<enum name="FULLSCREEN SR">
<value name="TILER_CTX" value="40"/>
<value name="SCISSOR_BOX" value="42"/>
<value name="TILER_FLAGS" value="56"/>
</enum>
<enum name="FRAGMENT SR">
<value name="FBD_POINTER" value="40"/>
<value name="BBOX_MIN" value="42"/>
@ -2096,10 +2102,8 @@
<struct name="Vertex Array" size="3">
<field name="Packet" size="1" start="0:0" type="bool"/>
<!-- Written by hardware in packet mode -->
<!-- Written by hardware in RUN_IDVS malloc mode -->
<field name="Pointer" size="58" start="0:6" type="address" modifier="shr(6)"/>
<!-- Written by hardware, leave zero -->
<field name="Vertex packet stride" size="16" start="2:0" type="uint"/>
<field name="Vertex attribute stride" size="16" start="2:16" type="uint"/>
</struct>

View file

@ -1557,10 +1557,8 @@
<struct name="Vertex Array" size="3">
<field name="Packet" size="1" start="0:0" type="bool"/>
<!-- Written by hardware in packet mode -->
<!-- Written by hardware in RUN_IDVS malloc mode -->
<field name="Pointer" size="58" start="0:6" type="address" modifier="shr(6)"/>
<!-- Written by hardware, leave zero -->
<field name="Vertex packet stride" size="16" start="2:0" type="uint"/>
<field name="Vertex attribute stride" size="16" start="2:16" type="uint"/>
</struct>